From a24564d2b71617dce754e4b291a3bb59ae01cb2d Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Mon, 6 May 2019 10:32:56 -0700
Subject: [PATCH 001/634] added method: void set_super_node_mode()

---
 include/lbann/data_store/data_store_conduit.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp
index c5698427787..f24da68c90d 100644
--- a/include/lbann/data_store/data_store_conduit.hpp
+++ b/include/lbann/data_store/data_store_conduit.hpp
@@ -133,6 +133,10 @@ class data_store_conduit {
     ++m_n;
   }

+  void set_super_node_mode() {
+    m_super_node = true;
+  }
+
 protected :

   /// records the number of times exchange_mini_batch_data has been called

From b19d73b4553d98ec6010ddeb04e1570a369583bd Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Mon, 6 May 2019 10:33:30 -0700
Subject: [PATCH 002/634] added method: void preload_data_store() override;

---
 include/lbann/data_readers/data_reader_image.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp
index a7f6c1b269c..dfd611f265d 100644
--- a/include/lbann/data_readers/data_reader_image.hpp
+++ b/include/lbann/data_readers/data_reader_image.hpp
@@ -100,6 +100,8 @@ class image_data_reader : public generic_data_reader {
     return m_image_list.at(idx);
   }

+  void preload_data_store() override;
+
 protected:
   /// Set the default values for the width, the height, the number of channels, and the number of labels of an image
   virtual void set_defaults();

From 9febb357eb773a8f2db00db88d9e32699c372298 Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Mon, 6 May 2019 10:34:00 -0700
Subject: [PATCH 003/634] initial modification for adding data_store support
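
For preloading, the image list is partitioned across the ranks in a
trainer: each of the np ranks receives base_files_per_rank samples, and
the first `extra` ranks receive one additional sample. A worked example
of the intended arithmetic (the values are illustrative, not from the
code):

    // 10 images, np = 4 ranks:
    //   base_files_per_rank = 10 / 4 = 2
    //   extra = 10 - 2*4 = 2
    //   local_list_sizes = {3, 3, 2, 2}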
---
 src/data_readers/data_reader_image.cpp | 75 ++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp
index c0e40476091..19aaf43bfd3 100644
--- a/src/data_readers/data_reader_image.cpp
+++ b/src/data_readers/data_reader_image.cpp
@@ -27,10 +27,15 @@
////////////////////////////////////////////////////////////////////////////////

 #include "lbann/data_readers/data_reader_image.hpp"
+#include "lbann/utils/timer.hpp"
+#include "lbann/data_store/data_store_conduit.hpp"
+#include "lbann/utils/file_utils.hpp"
 #include <fstream>

 namespace lbann {

+#define DATA_ID_STR(data_id) pad(std::to_string(data_id), SAMPLE_ID_PAD, '0')
+
 image_data_reader::image_data_reader(bool shuffle)
   : generic_data_reader(shuffle) {
   set_defaults();
@@ -108,6 +113,8 @@ void image_data_reader::load() {
   //const std::string imageDir = get_file_dir();
   const std::string imageListFile = get_data_filename();

+  options *opts = options::get();
+
   m_image_list.clear();

   // load image list
@@ -128,14 +135,82 @@ void image_data_reader::load() {
   }
   fclose(fplist);

+  // TODO: this will probably need to change after sample_list class
+  // is modified
+
+  std::vector<int> local_list_sizes;
+  if (opts->get_bool("preload_data_store")) {
+    int np = m_comm->get_procs_per_trainer();
+    int base_files_per_rank = m_image_list.size() / np;
+    int extra = m_image_list.size() - (base_files_per_rank*np);
+    if (extra > np) {
+      LBANN_ERROR("extra > np");
+    }
+    local_list_sizes.resize(np, 0);
+    for (int j=0; j<np; j++) {
+      local_list_sizes[j] = base_files_per_rank;
+      if (j < extra) {
+        local_list_sizes[j] += 1;
+      }
+    }
+  }
+
   m_shuffled_indices.resize(m_image_list.size());
   std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
+  instantiate_data_store(local_list_sizes);
+  if (opts->get_bool("preload_data_store") || opts->get_bool("use_data_store")) {
+    m_data_store->set_super_node_mode();
+  }

   select_subset_of_data();
 }

+void read_raw_data(const std::string &filename, std::vector<char> &data) {
+  data.clear();
+  std::ifstream in(filename.c_str());
+  if (!in) {
+    LBANN_ERROR("failed to open " + filename + " for reading");
+  }
+  in.seekg(0, in.end);
+  int num_bytes = in.tellg();
+  in.seekg(0, in.beg);
+  data.resize(num_bytes);
+  in.read((char*)data.data(), num_bytes);
+  in.close();
+}
+
+void image_data_reader::preload_data_store() {
+  double tm1 = get_time();
+  m_data_store->set_preload();
+
+  int rank = m_comm->get_rank_in_trainer();
+  std::vector<char> data;
+  for (size_t data_id=0; data_id<m_image_list.size(); data_id++) {
+    if (m_data_store->get_index_owner(data_id) != rank) {
+      continue;
+    }
+
+    conduit::Node node;
+    const std::string filename = get_file_dir() + m_image_list[data_id].first;
+    int label = m_image_list[data_id].second;
+    node[DATA_ID_STR(data_id) + "/label"] = label;
+    node[DATA_ID_STR(data_id) + "/filename"] = filename; //not really needed, but nice to have
+
+    read_raw_data(filename, data);
+
+    node[DATA_ID_STR(data_id) + "/buffer"].set_char_ptr(data.data());
+    m_data_store->set_conduit_node(data_id, node);
+  }
+
+  if (is_master()) {
+    std::cout << "image_data_reader::preload_data_store time: " << (get_time() - tm1) << "\n";
+  }
+}
+
 void image_data_reader::setup(int num_io_threads, std::shared_ptr<thread_pool> io_thread_pool) {
   generic_data_reader::setup(num_io_threads, io_thread_pool);

From 124c8361b9f195cedcecf55115507be46df914ec Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Mon, 6 May 2019 15:38:12 -0700
Subject: [PATCH 004/634] added a define for padding conduit IDs
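
DATA_ID_STR zero-pads a sample index so that conduit node names have a
fixed width. Illustrative only -- the actual width comes from
SAMPLE_ID_PAD, which is defined elsewhere:

    // assuming SAMPLE_ID_PAD == 9:
    // DATA_ID_STR(42) yields "000000042"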
---
 include/lbann/utils/file_utils.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/lbann/utils/file_utils.hpp b/include/lbann/utils/file_utils.hpp
index 4b882ef3cd5..cd024b45dc4 100644
--- a/include/lbann/utils/file_utils.hpp
+++ b/include/lbann/utils/file_utils.hpp
@@ -76,6 +76,9 @@ inline void __swapEndianInt(unsigned int& ui) {
   ui = ((ui >> 24) | ((ui<<8) & 0x00FF0000) | ((ui>>8) & 0x0000FF00) | (ui << 24));
 }

+#define DATA_ID_STR(data_id) pad(std::to_string(data_id), SAMPLE_ID_PAD, '0')
+
+
 // The generic approach
 template <typename T>
 std::basic_string<T> pad(const std::basic_string<T>& s,

From 4601b0d8e926f2976f564eb8f49ac4cd6fc4fa90 Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Mon, 6 May 2019 15:39:11 -0700
Subject: [PATCH 005/634] removed #define DATA_ID_STR -- since it's now in file_utils.hpp

---
 src/data_readers/numpy_conduit_converter.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/data_readers/numpy_conduit_converter.cpp b/src/data_readers/numpy_conduit_converter.cpp
index dfa1bc3e74f..92e71259b00 100644
--- a/src/data_readers/numpy_conduit_converter.cpp
+++ b/src/data_readers/numpy_conduit_converter.cpp
@@ -32,8 +32,6 @@

 namespace lbann {

-#define DATA_ID_STR(data_id) pad(std::to_string(data_id), SAMPLE_ID_PAD, '0')
-
 //static
 void numpy_conduit_converter::load_conduit_node(const std::string filename, int data_id, conduit::Node &output, bool reset) {

From 9d3c4bc15e9a0829cb70279dabedaa1cd0134522 Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Mon, 6 May 2019 15:40:56 -0700
Subject: [PATCH 006/634] removed #define DATA_ID_STR -- since it's now in file_utils.hpp

---
 src/data_readers/data_reader_numpy_npz_conduit.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp b/src/data_readers/data_reader_numpy_npz_conduit.cpp
index ae1ee3c9b3e..be2a115b721 100644
--- a/src/data_readers/data_reader_numpy_npz_conduit.cpp
+++ b/src/data_readers/data_reader_numpy_npz_conduit.cpp
@@ -37,8 +37,6 @@

 namespace lbann {

-#define DATA_ID_STR(data_id) pad(std::to_string(data_id), SAMPLE_ID_PAD, '0')
-
 numpy_npz_conduit_reader::numpy_npz_conduit_reader(const bool shuffle)
   : generic_data_reader(shuffle) {}

From 94a234e8d8820c1b79ac592030bc3a3d7ab6e191 Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Mon, 6 May 2019 15:42:02 -0700
Subject: [PATCH 007/634] added method: void load_conduit_node_from_file(int data_id, conduit::Node &node); which is called by at least two classes in the image hierarchy

---
 include/lbann/data_readers/data_reader_image.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp
index dfd611f265d..3a9fc428b9f 100644
--- a/include/lbann/data_readers/data_reader_image.hpp
+++ b/include/lbann/data_readers/data_reader_image.hpp
@@ -32,6 +32,7 @@
 #include "data_reader.hpp"
 #include "image_preprocessor.hpp"
 #include "cv_process.hpp"
+#include "lbann/data_store/data_store_conduit.hpp"

 namespace lbann {
 class image_data_reader : public generic_data_reader {
@@ -117,6 +118,8 @@ class image_data_reader : public generic_data_reader {
   int m_image_linearized_size; ///< linearized image size
   int m_num_labels; ///< number of labels
   std::vector<cv::Mat> m_thread_cv_buffer;
+
+  void load_conduit_node_from_file(int data_id, conduit::Node &node);
 };

 } // namespace lbann

From 97618f45a2d7da15d72efd199bb6b92240758a7a Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Mon, 6 May 2019 15:43:06 -0700
Subject: [PATCH 008/634] several changes related to adding data_store support. Code compiles; next step is testing.
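
With a data store attached, fetch_datum now distinguishes three states;
a minimal sketch of the intended control flow (details in the diff
below):

    if (data_store_active()) {
      // later epochs: image bytes come from the data store
    } else if (priming_data_store()) {
      // first epoch: read from disk, then stash in the store
    } else {
      LBANN_ERROR(...);  // unexpected state
    }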
---
 src/data_readers/data_reader_imagenet.cpp | 59 ++++++++++++-----------
 1 file changed, 32 insertions(+), 27 deletions(-)

diff --git a/src/data_readers/data_reader_imagenet.cpp b/src/data_readers/data_reader_imagenet.cpp
index 3ee457f15d9..b796b61ab55 100644
--- a/src/data_readers/data_reader_imagenet.cpp
+++ b/src/data_readers/data_reader_imagenet.cpp
@@ -28,6 +28,7 @@

 #include "lbann/data_readers/data_reader_imagenet.hpp"
 #include "lbann/data_readers/image_utils.hpp"
+#include "lbann/utils/file_utils.hpp"
 #include <fstream>

 namespace lbann {

@@ -37,9 +38,7 @@ imagenet_reader::imagenet_reader(const std::shared_ptr<cv_process>& pp, bool shu
   set_defaults();

   if (!pp) {
-    std::stringstream err;
-    err << __FILE__<<" "<<__LINE__<< " :: " << get_type() << " construction error: no image processor";
-    throw lbann_exception(err.str());
+    LBANN_ERROR("construction error: no image processor");
   }

   m_master_pps = lbann::make_unique<cv_process>(*pp);
@@ -48,9 +47,7 @@ imagenet_reader::imagenet_reader(const imagenet_reader& rhs)
   : image_data_reader(rhs) {
   if (!rhs.m_master_pps) {
-    std::stringstream err;
-    err << __FILE__<<" "<<__LINE__<< " :: " << get_type() << " construction error: no image processor";
-    throw lbann_exception(err.str());
+    LBANN_ERROR("construction error: no image processor");
   }
   m_master_pps = lbann::make_unique<cv_process>(*rhs.m_master_pps);
 }
@@ -64,9 +61,7 @@ imagenet_reader& imagenet_reader::operator=(const imagenet_reader& rhs) {
   image_data_reader::operator=(rhs);

   if (!rhs.m_master_pps) {
-    std::stringstream err;
-    err << __FILE__<<" "<<__LINE__<< " :: " << get_type() << " construction error: no image processor";
-    throw lbann_exception(err.str());
+    LBANN_ERROR("construction error: no image processor");
   }
   m_master_pps = lbann::make_unique<cv_process>(*rhs.m_master_pps);
   return (*this);
@@ -103,10 +98,7 @@ bool imagenet_reader::replicate_processor(const cv_process& pp, const int nthrea
   }

   if (!ok || (nthreads <= 0)) {
-    std::stringstream err;
-    err << __FILE__<<" "<<__LINE__<< " :: " << get_type() << " cannot replicate image processor";
-    throw lbann_exception(err.str());
-    return false;
+    LBANN_ERROR("cannot replicate image processor");
   }

   const std::vector<unsigned int> dims = pp.get_data_dims();
@@ -124,27 +116,40 @@ CPUMat imagenet_reader::create_datum_view(CPUMat& X, const int mb_idx) const {
 }

 bool imagenet_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) {
-  int tid = m_io_thread_pool->get_local_thread_id();
-  const std::string imagepath = get_file_dir() + m_image_list[data_id].first;
   int width=0, height=0, img_type=0;
-
+  int tid = m_io_thread_pool->get_local_thread_id();
   CPUMat X_v = create_datum_view(X, mb_idx);
   bool ret;
-  ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid], &m_thread_cv_buffer[tid]);
+  const std::string imagepath = get_file_dir() + m_image_list[data_id].first;
+
+  if (m_data_store != nullptr) {
+    conduit::Node node;
+    if (data_store_active()) {
+      const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id);
+      node.set_external(ds_node);
+    } else if (priming_data_store()) {
+      load_conduit_node_from_file(data_id, node);
+      m_data_store->set_conduit_node(data_id, node);
+    } else {
+      LBANN_ERROR("you shouldn't be here; please contact Dave Hysom");
+    }
+
+    char *buf = node[DATA_ID_STR(data_id) + "/buffer"].value();
+    ret = lbann::image_utils::load_image(buf, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid], &m_thread_cv_buffer[tid]);
+  }
+
+  // not using data store
+  else {
+    ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid], &m_thread_cv_buffer[tid]);
+  }

   if(!ret) {
-    throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " "
-                          + get_type() + ": image_utils::load_image failed to load - "
-                          + imagepath);
+    LBANN_ERROR(get_type() + ": image_utils::load_image failed to load - " + imagepath);
   }
   if((width * height * CV_MAT_CN(img_type)) != m_image_linearized_size) {
-    throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " "
-                          + get_type() + ": mismatch data size -- either width, height or channel - "
-                          + imagepath + "[w,h,c]=[" + std::to_string(width) + "x" + std::to_string(height)
-                          + "x" + std::to_string(CV_MAT_CN(img_type)) + "]");
+    LBANN_ERROR( get_type() + ": mismatch data size -- either width, height or channel - " + imagepath + "[w,h,c]=[" + std::to_string(width) + "x" + std::to_string(height) + "x" + std::to_string(CV_MAT_CN(img_type)) + "]");
   }

   return true;
 }

From f3c615f195d1f9b5b787b7bc261c1f0b53a22aa6 Mon Sep 17 00:00:00 2001
From: Yosuke Oyama <17844184+oyamay@users.noreply.github.com>
Date: Fri, 10 May 2019 07:57:03 -0700
Subject: [PATCH 009/634] WIP: Add cosmoflow script for LC clusters

---
 model_zoo/cosmoflow/README.md    |  15 ++
 model_zoo/cosmoflow/cosmoflow.py | 290 +++++++++++++++++++++++++++++++
 2 files changed, 305 insertions(+)
 create mode 100644 model_zoo/cosmoflow/README.md
 create mode 100755 model_zoo/cosmoflow/cosmoflow.py

diff --git a/model_zoo/cosmoflow/README.md b/model_zoo/cosmoflow/README.md
new file mode 100644
index 00000000000..24a15013e29
--- /dev/null
+++ b/model_zoo/cosmoflow/README.md
@@ -0,0 +1,15 @@
+## Reference
+```
+Amrita Mathuriya, Deborah Bard, Peter Mendygral, Lawrence Meadows,
+James Arnemann, Lei Shao, Siyu He, Tuomas Karna, Diana Moise,
+Simon J. Pennycook, Kristyn Maschhoff, Jason Sewall, Nalini Kumar,
+Shirley Ho, Michael F. Ringenburg, Prabhat, and Victor Lee.
+"Cosmoflow: Using deep learning to learn the universe at scale."
+Proceedings of the International Conference for High Performance
+Computing, Networking, Storage, and Analysis, SC'18, pp. 65:1-65:11,
+2018.
+```
+
+Available at:
+* [The ACM Digital Library](https://dl.acm.org/citation.cfm?id=3291743)
+* [arXiv](https://arxiv.org/abs/1808.04728)

diff --git a/model_zoo/cosmoflow/cosmoflow.py b/model_zoo/cosmoflow/cosmoflow.py
new file mode 100755
index 00000000000..678f5146b50
--- /dev/null
+++ b/model_zoo/cosmoflow/cosmoflow.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+import argparse
+import os.path
+import google.protobuf.text_format as txtf
+import lbann
+import lbann.contrib.lc.launcher
+import lbann.modules as lm
+import lbann.proto as lp
+from lbann.weights import Weights
+
+import numpy as np
+
+# ----------------------------------
+# The CosmoFlow module
+# ----------------------------------
+
+class CosmoFlow(lm.Module):
+    """
+    CosmoFlow neural network.
+
+    See:
+    Amrita Mathuriya, Deborah Bard, Peter Mendygral, Lawrence Meadows,
+    James Arnemann, Lei Shao, Siyu He, Tuomas Karna, Diana Moise,
+    Simon J. Pennycook, Kristyn Maschhoff, Jason Sewall, Nalini Kumar,
+    Shirley Ho, Michael F. Ringenburg, Prabhat, and Victor Lee.
+    "Cosmoflow: Using deep learning to learn the universe at scale."
+    Proceedings of the International Conference for High Performance
+    Computing, Networking, Storage, and Analysis, SC'18, pp. 65:1-65:11,
+    2018.
+
+    Note that this model is somewhat different from the model described
+    in the paper.
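+
+    A minimal usage sketch (assumed, not from the original script;
+    `x` is a 3D input tensor layer of the expected width):
+
+        module = CosmoFlow(output_size=4, input_width=256)
+        y = module.forward(x)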
+ """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, output_size, + input_width, + name=None): + """Initialize CosmFlow. + + Args: + output_size (int): Size of output tensor. + input_width (int): Width of input tensor. + name (str, optional): Module name + (default: 'cosmoflow_module'). + + """ + CosmoFlow.global_count += 1 + self.instance = 0 + self.name = (name if name + else 'cosmoflow_module{0}'.format(CosmoFlow.global_count)) + self.input_width = input_width + assert self.input_width in [128, 256, 512] + + self.layer_params = [ + {"type": "conv", "out_channels": 16, "kernel_size": 3, "stride": 1}, + {"type": "pool"}, + {"type": "conv", "out_channels": 32, "kernel_size": 3, "stride": 1}, + {"type": "pool"}, + {"type": "conv", "out_channels": 64, "kernel_size": 3, "stride": 1}, + {"type": "pool"}, + {"type": "conv", "out_channels": 128, "kernel_size": 3, "stride": 2}, + {"type": "pool"}, + {"type": "conv", "out_channels": 256, "kernel_size": 3, "stride": 1}, + {"type": "pool"}, + {"type": "conv", "out_channels": 256, "kernel_size": 3, "stride": 1}, + {"type": "conv", "out_channels": 256, "kernel_size": 3, "stride": 1}, + ] + for p in self.layer_params: + if p["type"] == "conv": + p["padding"] = int((p["kernel_size"]-1)/2) + + additional_pools = [] + if self.input_width == 256: + additional_pools = [6] + elif self.input_width == 512: + additional_pools = [6, 7] + + for i in additional_pools: + conv_idx = list(np.cumsum([1 if x["type"] == "conv" else 0 for x in self.layer_params])).index(i) + self.layer_params.insert(conv_idx+1, {"type": "pool"}) + + width = self.input_width + for p in self.layer_params: + if p["type"] == "conv": + output_width = int(width / p["stride"]) + else: + output_width = int(width / 2) + + p["width"] = output_width + width = output_width + assert width > 0 + + for i, param in enumerate(filter(lambda x: x["type"] == "conv", self.layer_params)): + conv_name ="conv"+str(i+1) + conv_weights = [Weights(initializer=lbann.GlorotUniformInitializer())] + + param_actual = dict(param) + param_actual.pop("type", None) + param_actual.pop("width", None) + + conv = lm.Convolution3dModule( + **param_actual, + activation=lbann.LeakyRelu, + name=self.name+"_"+conv_name, + bias=False, + weights=conv_weights) + setattr(self, conv_name, conv) + + # Create fully-connected layers + fc_params = [ + {"size": 2048}, + {"size": 256}, + {"size": output_size}, + ] + for i, param in enumerate(fc_params): + fc_name ="fc"+str(i+1) + fc = lm.FullyConnectedModule( + **param, + activation=lbann.LeakyRelu if i < len(fc_params)-1 else None, + name=self.name+"_"+fc_name, + weights=[Weights(initializer=lbann.GlorotUniformInitializer()), + Weights(initializer=lbann.ConstantInitializer(value=0.1))], + ) + setattr(self, fc_name, fc) + + def forward(self, x): + self.instance += 1 + + def create_pooling(x, i, w): + return lbann.Pooling( + x, num_dims=3, has_vectors=False, + pool_dims_i=3, + pool_pads_i=1, + pool_strides_i=2, + pool_mode='average', + name='{0}_pool{1}_instance{2}'.format(self.name,i,self.instance)) + + def create_dropout(x, i): + return lbann.Dropout(x, keep_prob=0.8, + name='{0}_drop{1}_instance{2}'.format(self.name,i,self.instance)) + + # Convolutional network + i_conv = 1 + i_pool = 1 + for param in self.layer_params: + if param["type"] == "conv": + x = getattr(self, "conv{}".format(i_conv))(x) + i_conv += 1 + + else: + x = create_pooling(x, i_pool, param["width"]) + i_pool += 1 + + # Fully-connected layers + x = create_dropout(x,1) + x = self.fc1(x) 
+        x = create_dropout(x,2)
+        x = self.fc2(x)
+        x = create_dropout(x,3)
+        x = self.fc3(x)
+
+        return x
+
+# TODO: Use numpy_npz_conduit data reader
+def create_data_reader(train_path, val_path, test_path):
+    readerArgs = [
+        {"role": "train", "data_file_pattern": "{}/train_*_int16.npz".format(train_path)},
+        {"role": "validate", "data_file_pattern": "{}/val_*_int16.npz".format(val_path)},
+        {"role": "test", "data_file_pattern": "{}/test_*_int16.npz".format(test_path)},
+    ]
+
+    readers = []
+    for readerArg in readerArgs:
+        reader = lp.lbann_pb2.Reader(
+            name="cosmoflow",
+            shuffle=True,
+            validation_percent=0,
+            absolute_sample_count=0,
+            percent_of_data_to_use=1.0,
+            scaling_factor_int16=1.0,
+            **readerArg)
+
+        readers.append(reader)
+
+    return lp.lbann_pb2.DataReader(reader=readers)
+
+# ----------------------------------
+# Command-line arguments
+# ----------------------------------
+
+desc = ('Construct and run the CosmoFlow network. '
+        'Running the experiment is only supported on LC systems.')
+parser = argparse.ArgumentParser(description=desc)
+parser.add_argument(
+    '--partition', action='store', type=str,
+    help='scheduler partition', metavar='NAME')
+parser.add_argument(
+    '--account', action='store', type=str,
+    help='scheduler account', metavar='NAME')
+parser.add_argument(
+    "--learn-rate", action="store", default=0.0005, type=float,
+    help="The initial learning-rate")
+parser.add_argument(
+    "--mini-batch-size", action="store", default=128, type=int,
+    help="The mini-batch size")
+parser.add_argument(
+    "--epochs", action="store", default=130, type=float,
+    help="The number of epochs")
+parser.add_argument(
+    "--output-size", action="store", default=4, type=int,
+    help="Size of output tensor")
+parser.add_argument(
+    "--input-width", action="store", default=256, type=int,
+    help="Width of input tensor")
+for role, label in [("train", "training"), ("val", "validation"), ("test", "test")]:
+    parser.add_argument(
+        "--{}-path".format(role), action="store", default="", type=str,
+        help="Path to {} dataset".format(label))
+args = parser.parse_args()
+
+# ----------------------------------
+# Construct layer graph
+# ----------------------------------
+
+# Input data
+input = lbann.Input(io_buffer='partitioned',
+                    target_mode='regression')
+universes = lbann.Identity(input)
+secrets = lbann.Identity(input)
+
+# CosmoFlow
+x = CosmoFlow(args.output_size,
+              args.input_width).forward(universes)
+
+# Loss function
+loss = lbann.MeanSquaredError([x, secrets])
+
+# Metrics
+metrics = [lbann.Metric(loss, name="MSE", unit="")]
+
+# Callbacks
+callbacks = [
+    lbann.CallbackPrint(),
+    lbann.CallbackTimer(),
+    lbann.CallbackPolyLearningRate(
+        power=1.0,
+        num_epochs=100, # TODO: Warn if args.epochs < 100
+    ),
+    lbann.CallbackGPUMemoryUsage(),
+    lbann.CallbackDumpOutputs(
+        directory="dump_acts/",
+        layers="cosmoflow_module1_fc3_instance1 layer3",
+        execution_modes="test"
+    ),
+    lbann.CallbackProfiler(skip_init=True)
+]
+
+# ----------------------------------
+# Setup experiment
+# ----------------------------------
+
+# Setup model
+model = lbann.Model(args.mini_batch_size,
+                    args.epochs,
+                    layers=lbann.traverse_layer_graph(input),
+                    objective_function=loss,
+                    metrics=metrics,
+                    callbacks=callbacks)
+
+# Setup optimizer
+opt = lbann.Adam(learn_rate=args.learn_rate)
+
+# Setup data reader
+data_reader_proto = create_data_reader(args.train_path,
+                                       args.val_path,
+                                       args.test_path)
+
+# ----------------------------------
+# Run experiment
+# ----------------------------------
+# Note: Use `lbann.run` instead for non-LC systems.
+
+kwargs = {}
+if args.partition: kwargs['partition'] = args.partition
+if args.account: kwargs['account'] = args.account
+lbann.contrib.lc.launcher.run(model, data_reader_proto, opt,
+                              job_name='lbann_lenet',
+                              nodes=8,
+                              **kwargs)

From 21eddab15a866173f45f12c910e8c90f3f79133e Mon Sep 17 00:00:00 2001
From: Yosuke Oyama <17844184+oyamay@users.noreply.github.com>
Date: Mon, 13 May 2019 02:54:23 -0700
Subject: [PATCH 010/634] use numpy_npz_conduit data reader for the CosmoFlow
 network

---
 model_zoo/cosmoflow/cosmoflow.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/model_zoo/cosmoflow/cosmoflow.py b/model_zoo/cosmoflow/cosmoflow.py
index 678f5146b50..58cb183dafc 100755
--- a/model_zoo/cosmoflow/cosmoflow.py
+++ b/model_zoo/cosmoflow/cosmoflow.py
@@ -162,23 +162,21 @@ def create_dropout(x, i):

     return x

-# TODO: Use numpy_npz_conduit data reader
 def create_data_reader(train_path, val_path, test_path):
     readerArgs = [
-        {"role": "train", "data_file_pattern": "{}/train_*_int16.npz".format(train_path)},
-        {"role": "validate", "data_file_pattern": "{}/val_*_int16.npz".format(val_path)},
-        {"role": "test", "data_file_pattern": "{}/test_*_int16.npz".format(test_path)},
+        {"role": "train", "data_filename": train_path},
+        {"role": "validate", "data_filename": val_path},
+        {"role": "test", "data_filename": test_path},
     ]

     readers = []
     for readerArg in readerArgs:
         reader = lp.lbann_pb2.Reader(
-            name="cosmoflow",
+            name="numpy_npz_conduit_reader",
             shuffle=True,
             validation_percent=0,
             absolute_sample_count=0,
             percent_of_data_to_use=1.0,
-            scaling_factor_int16=1.0,
             **readerArg)

         readers.append(reader)
@@ -198,14 +196,20 @@ def create_data_reader(train_path, val_path, test_path):
 parser.add_argument(
     '--account', action='store', type=str,
     help='scheduler account', metavar='NAME')
+parser.add_argument(
+    '--experiment-dir', action='store', type=str,
+    help='experiment directory', metavar='NAME')
 parser.add_argument(
     "--learn-rate", action="store", default=0.0005, type=float,
     help="The initial learning-rate")
+parser.add_argument(
+    "--nodes", action="store", default=8, type=int,
+    help="The number of nodes")
 parser.add_argument(
-    "--mini-batch-size", action="store", default=128, type=int,
+    "--mini-batch-size", action="store", default=32, type=int,
     help="The mini-batch size")
 parser.add_argument(
-    "--epochs", action="store", default=130, type=float,
+    "--epochs", action="store", default=130, type=int,
     help="The number of epochs")
 parser.add_argument(
     "--output-size", action="store", default=4, type=int,
@@ -284,7 +288,10 @@ def create_data_reader(train_path, val_path, test_path):
 kwargs = {}
 if args.partition: kwargs['partition'] = args.partition
 if args.account: kwargs['account'] = args.account
+if args.experiment_dir: kwargs['experiment_dir'] = args.experiment_dir
+
 lbann.contrib.lc.launcher.run(model, data_reader_proto, opt,
-                              job_name='lbann_lenet',
-                              nodes=8,
+                              lbann_args=" --use_data_store --preload_data_store",
+                              job_name='lbann_cosmoflow',
+                              nodes=args.nodes,
                               **kwargs)

From 8f975165b0d1a4269e4a988973319f8962c69160 Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Mon, 13 May 2019 08:07:41 -0700
Subject: [PATCH 011/634] ongoing development. Only committing these changes as
 I'm about to switch to another branch, and want to ensure I don't lose
 anything (yes, I know I can stash -- but I've lost work that way before!)
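
This also adds an opt-in debug log: when --debug is passed on the
command line, each data store opens its own output file, named from the
reader role and the world rank. Illustrative file names (not from the
code):

    debug_train.0
    debug_validate.3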
---
 .../lbann/data_readers/data_reader_image.hpp  |  5 +-
 .../lbann/data_store/data_store_conduit.hpp   | 22 ++++-
 include/lbann/utils/file_utils.hpp            |  3 -
 src/data_readers/data_reader_image.cpp        | 97 +++++++++++++------
 src/data_readers/data_reader_imagenet.cpp     | 61 +++++++++++-
 5 files changed, 151 insertions(+), 37 deletions(-)

diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp
index 3a9fc428b9f..faaa9130e77 100644
--- a/include/lbann/data_readers/data_reader_image.hpp
+++ b/include/lbann/data_readers/data_reader_image.hpp
@@ -43,6 +43,8 @@ class image_data_reader : public generic_data_reader {
   image_data_reader(bool shuffle = true);
   image_data_reader(const image_data_reader&);
+  image_data_reader(const image_data_reader&, const std::vector<int>& ds_sample_move_list);
+  image_data_reader(const image_data_reader&, const std::vector<int>& ds_sample_move_list, std::string role);
   image_data_reader& operator=(const image_data_reader&);

   /** Set up imagenet specific input parameters
@@ -104,12 +106,13 @@ class image_data_reader : public generic_data_reader {
   void preload_data_store() override;

 protected:
+  void copy_members(const image_data_reader &rhs, const std::vector<int>& ds_sample_move_list = std::vector<int>());
+
   /// Set the default values for the width, the height, the number of channels, and the number of labels of an image
   virtual void set_defaults();
   bool fetch_label(Mat& Y, int data_id, int mb_idx) override;
   void set_linearized_image_size();
-
- protected:
   std::string m_image_dir; ///< where images are stored
   std::vector<std::pair<std::string, int> > m_image_list; ///< list of image files and labels

diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp
index 865763ea29a..992a9e652b4 100644
--- a/include/lbann/data_store/data_store_conduit.hpp
+++ b/include/lbann/data_store/data_store_conduit.hpp
@@ -74,7 +74,10 @@ class data_store_conduit {
   void set_data_reader_ptr(generic_data_reader *reader) { m_reader = reader; }

   //! convenience handle
-  void set_shuffled_indices(const std::vector<int> *indices) { m_shuffled_indices = indices; }
+  void set_shuffled_indices(const std::vector<int> *indices);
+
+  /// for use during development and debugging
+  int get_num_indices() { return m_shuffled_indices->size(); }

   void setup(int mini_batch_size);
@@ -131,6 +134,9 @@ class data_store_conduit {
   /// with the index
   int get_index_owner(int idx);

+  /// for use during development and debugging
+  void set_role(const std::string role);
+
   bool is_local_cache() const { return m_is_local_cache; }

   void exchange_mini_batch_data(size_t current_pos, size_t mb_size) {
@@ -149,8 +155,18 @@ class data_store_conduit {
     m_super_node = true;
   }

+  void set_node_sizes_vary() { m_node_sizes_vary = true; }
+
   bool has_conduit_node(int data_id) const;

+  /// only used for debugging; pass --debug on cmd line to get
+  /// each data store to print to a different file. This is made
+  /// public so data readers can also print to the file
+  mutable std::ofstream m_output;
+
+  /// for use during development and debugging
+  int get_data_size() { return m_data.size(); }
+
 protected :

   /// records the number of times exchange_mini_batch_data has been called
@@ -230,7 +246,7 @@ protected :
   void setup_data_store_buffers();

   /// called by exchange_data
-  static void build_node_for_sending(const conduit::Node &node_in, conduit::Node &node_out);
+  void build_node_for_sending(const conduit::Node &node_in, conduit::Node &node_out);

   /// fills in m_owner, which maps index -> owning processor
   void build_owner_map(int mini_batch_size);
@@ -254,6 +270,8 @@ protected :
   void error_check_compacted_node(const conduit::Node &nd, int data_id);

   bool m_is_local_cache;
+
+  bool m_node_sizes_vary;
 };

 } // namespace lbann

diff --git a/include/lbann/utils/file_utils.hpp b/include/lbann/utils/file_utils.hpp
index cd024b45dc4..4b882ef3cd5 100644
--- a/include/lbann/utils/file_utils.hpp
+++ b/include/lbann/utils/file_utils.hpp
@@ -76,9 +76,6 @@ inline void __swapEndianInt(unsigned int& ui) {
   ui = ((ui >> 24) | ((ui<<8) & 0x00FF0000) | ((ui>>8) & 0x0000FF00) | (ui << 24));
 }

-#define DATA_ID_STR(data_id) pad(std::to_string(data_id), SAMPLE_ID_PAD, '0')
-
-
 // The generic approach
 template <typename T>
 std::basic_string<T> pad(const std::basic_string<T>& s,

diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp
index 19aaf43bfd3..03124053fc2 100644
--- a/src/data_readers/data_reader_image.cpp
+++ b/src/data_readers/data_reader_image.cpp
@@ -34,23 +34,33 @@

 namespace lbann {

-#define DATA_ID_STR(data_id) pad(std::to_string(data_id), SAMPLE_ID_PAD, '0')
-
 image_data_reader::image_data_reader(bool shuffle)
   : generic_data_reader(shuffle) {
+if (is_master()) std::cout << "XX image_data_reader(bool shuffle); role: " << get_role() << "\n";
   set_defaults();
 }

 image_data_reader::image_data_reader(const image_data_reader& rhs)
-  : generic_data_reader(rhs),
-    m_image_dir(rhs.m_image_dir),
-    m_image_list(rhs.m_image_list),
-    m_image_width(rhs.m_image_width),
-    m_image_height(rhs.m_image_height),
-    m_image_num_channels(rhs.m_image_num_channels),
-    m_image_linearized_size(rhs.m_image_linearized_size),
-    m_num_labels(rhs.m_num_labels)
-{}
+  : generic_data_reader(rhs)
+{
+if (is_master()) std::cout << "XX image_data_reader copy ctor; role: " << get_role() << "\n";
+  copy_members(rhs);
+}
+
+image_data_reader::image_data_reader(const image_data_reader& rhs,const std::vector<int>& ds_sample_move_list, std::string role)
+  : generic_data_reader(rhs)
+{
+  set_role(role);
+if (is_master()) std::cout << "XX copy ctor, ds_sample_move_list: " << ds_sample_move_list.size() << "; role: " << get_role() << "\n";
+  copy_members(rhs, ds_sample_move_list);
+}
+
+image_data_reader::image_data_reader(const image_data_reader& rhs,const std::vector<int>& ds_sample_move_list)
+  : generic_data_reader(rhs)
+{
+if (is_master()) std::cout << "XX copy ctor, ds_sample_move_list: " << ds_sample_move_list.size() << "; role: " << get_role() << "\n";
+  copy_members(rhs, ds_sample_move_list);
+}

 image_data_reader& image_data_reader::operator=(const image_data_reader& rhs) {
   generic_data_reader::operator=(rhs);
@@ -65,6 +75,35 @@ image_data_reader& image_data_reader::operator=(const image_data_reader& rhs) {
   return (*this);
 }

+void image_data_reader::copy_members(const image_data_reader &rhs, const std::vector<int>& ds_sample_move_list) {
+  if (is_master()) std::cout << "XX image_data_reader::copy_members; role: " << get_role() << " ds_sample_move_list.size: " << ds_sample_move_list.size() << "\n";
+
+  if(rhs.m_data_store != nullptr) {
+    if(ds_sample_move_list.size() == 0) {
+      m_data_store = new data_store_conduit(rhs.get_data_store());
+    } else {
+      m_data_store = new data_store_conduit(rhs.get_data_store(), ds_sample_move_list);
+    }
+    m_data_store->set_data_reader_ptr(this);
+  }
+
+  if(m_data_store != nullptr) {
+    if (m_data_store->m_output) {
+      m_data_store->m_output << "image_data_reader::copy_members; role: " << get_role() << " ds_sample_move_list size: " << ds_sample_move_list.size() << "\n";
+    }
+  }
+  m_image_dir = rhs.m_image_dir;
+  m_image_list = rhs.m_image_list;
+  m_image_width = rhs.m_image_width;
+  m_image_height = rhs.m_image_height;
+  m_image_num_channels = rhs.m_image_num_channels;
+  m_image_linearized_size = rhs.m_image_linearized_size;
+  m_num_labels = rhs.m_num_labels;
+  //m_thread_cv_buffer = rhs.m_thread_cv_buffer
+if (is_master()) std::cout << "XX image_data_reader::copy_members; role: " << get_role() << "\n";
+}
+
+
 void image_data_reader::set_linearized_image_size() {
   m_image_linearized_size = m_image_width * m_image_height * m_image_num_channels;
 }
@@ -160,14 +199,14 @@ void image_data_reader::load() {
   m_shuffled_indices.resize(m_image_list.size());
   std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);

+  opts->set_option("node_sizes_vary", 1);
+  opts->set_option("super_node", 1);
   instantiate_data_store(local_list_sizes);
-  if (opts->get_bool("preload_data_store") || opts->get_bool("use_data_store")) {
-    m_data_store->set_super_node_mode();
-  }

   select_subset_of_data();
 }

+//void read_raw_data(const std::string &filename, std::vector<unsigned char> &data) {
 void read_raw_data(const std::string &filename, std::vector<char> &data) {
   data.clear();
   std::ifstream in(filename.c_str());
@@ -186,26 +225,16 @@ void image_data_reader::preload_data_store() {
   double tm1 = get_time();
   m_data_store->set_preload();

+  conduit::Node node;
   int rank = m_comm->get_rank_in_trainer();
   for (size_t data_id=0; data_id<m_image_list.size(); data_id++) {
     if (m_data_store->get_index_owner(data_id) != rank) {
       continue;
     }
+    load_conduit_node_from_file(data_id, node);
+    m_data_store->set_preloaded_conduit_node(data_id, node);
   }

   if (is_master()) {
     std::cout << "image_data_reader::preload_data_store time: " << (get_time() - tm1) << "\n";
   }
@@ -228,4 +257,18 @@ std::vector image_data_reader::get_image_list_of_cu
   return ret;
 }

+void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node &node) {
+  node.reset();
+  const std::string filename = get_file_dir() + m_image_list[data_id].first;
+  int label = m_image_list[data_id].second;
+  //std::vector<unsigned char> data;
+  std::vector<char> data;
+  read_raw_data(filename, data);
+  node[LBANN_DATA_ID_STR(data_id) + "/label"].set(label);
+  node[LBANN_DATA_ID_STR(data_id) + "/buffer"].set_char_ptr(data.data(), data.size());
+  node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"] = data.size();
+}
+
+
 } // namespace lbann

diff --git a/src/data_readers/data_reader_imagenet.cpp b/src/data_readers/data_reader_imagenet.cpp
index b796b61ab55..21a2eaf96bb 100644
--- a/src/data_readers/data_reader_imagenet.cpp
+++ b/src/data_readers/data_reader_imagenet.cpp
@@ -42,6 +42,8 @@ imagenet_reader::imagenet_reader(const std::shared_ptr<cv_process>& pp, bool shu
   }

   m_master_pps = lbann::make_unique<cv_process>(*pp);
+
+  if (is_master()) std::cout << "XX imagenet_reader ctor, pp, shuffle\n";
 }

 imagenet_reader::imagenet_reader(const imagenet_reader& rhs)
@@ -50,9 +52,33 @@ imagenet_reader::imagenet_reader(const imagenet_reader& rhs)
     LBANN_ERROR("construction error: no image processor");
   }
   m_master_pps = lbann::make_unique<cv_process>(*rhs.m_master_pps);
+  if (is_master()) std::cout << "XX imagenet_reader copy ctor\n";
+}
+
+
+imagenet_reader::imagenet_reader(const imagenet_reader& rhs, const std::vector<int>& ds_sample_move_list, std::string role)
+  : image_data_reader(rhs, ds_sample_move_list) {
+  if (!rhs.m_master_pps) {
+    LBANN_ERROR("construction error: no image processor");
+  }
+  m_master_pps = lbann::make_unique<cv_process>(*rhs.m_master_pps);
+  set_role(role);
+
+  if (is_master()) std::cout << "XX imagenet_reader copy ctor, ds_sample_list size: " << ds_sample_move_list.size() << "\n";
+}
+
+imagenet_reader::imagenet_reader(const imagenet_reader& rhs, const std::vector<int>& ds_sample_move_list)
+  : image_data_reader(rhs, ds_sample_move_list) {
+  if (!rhs.m_master_pps) {
+    LBANN_ERROR("construction error: no image processor");
+  }
+  m_master_pps = lbann::make_unique<cv_process>(*rhs.m_master_pps);
+
+  if (is_master()) std::cout << "XX imagenet_reader copy ctor, ds_sample_list size: " << ds_sample_move_list.size() << "\n";
 }

 imagenet_reader& imagenet_reader::operator=(const imagenet_reader& rhs) {
+  if (is_master()) std::cout << "XX imagenet_reader operator=\n";
   // check for self-assignment
   if (this == &rhs) {
     return (*this);
@@ -122,20 +148,47 @@ bool imagenet_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) {
   bool ret;
   const std::string imagepath = get_file_dir() + m_image_list[data_id].first;

+  bool have_node = true;
   if (m_data_store != nullptr) {
     conduit::Node node;
-    if (data_store_active()) {
+    if (m_data_store->is_local_cache()) {
+      if (m_data_store->has_conduit_node(data_id)) {
+        const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id);
+        node.set_external(ds_node);
+      } else {
+        load_conduit_node_from_file(data_id, node);
+        m_data_store->set_conduit_node(data_id, node);
+      }
+    } else if (data_store_active()) {
       const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id);
       node.set_external(ds_node);
     } else if (priming_data_store()) {
       load_conduit_node_from_file(data_id, node);
       m_data_store->set_conduit_node(data_id, node);
     } else {
-      LBANN_ERROR("you shouldn't be here; please contact Dave Hysom");
+      if (get_role() != "test") {
+        LBANN_ERROR("you shouldn't be here; please contact Dave Hysom");
+      }
+      if (m_issue_warning) {
+        if (is_master()) {
+          LBANN_WARNING("m_data_store != nullptr, but we are not retrieving a node from the store; role: " + get_role() + "; this is probably OK for test mode, but may be an error for train or validate modes");
+        }
+        m_issue_warning = false;
+      }
+      ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid], &m_thread_cv_buffer[tid]);
+      have_node = false;
     }

-    char *buf = node[DATA_ID_STR(data_id) + "/buffer"].value();
-    ret = lbann::image_utils::load_image(buf, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid], &m_thread_cv_buffer[tid]);
+    if (have_node) {
+      char *buf = node[LBANN_DATA_ID_STR(data_id) + "/buffer"].value();
+      size_t size = node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"].value();
+      std::vector<char> v2(size);
+      for (size_t j=0; j<size; j++) {
+        v2[j] = buf[j];
+      }
+      ret = lbann::image_utils::load_image(v2, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid], &m_thread_cv_buffer[tid]);
+    }
+  }

From: "David A. Hysom"
Date: Mon, 13 May 2019 10:26:57 -0700
Subject: [PATCH 012/634] ongoing development. Compiles and runs in all three
 modes.
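
The "three modes" are the train, validate, and test readers. For a test
reader that has no primed data store, fetch_datum now warns once rather
than erroring out; the one-time-warning pattern (from the imagenet
reader changes in the previous patch, message elided):

    if (m_issue_warning) {
      if (is_master()) {
        LBANN_WARNING(...);
      }
      m_issue_warning = false;
    }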
---
 include/lbann/data_readers/data_reader.hpp    |  19 +-
 .../data_readers/data_reader_imagenet.hpp     |   3 +-
 .../lbann/data_store/data_store_conduit.hpp   |   4 +-
 src/data_readers/data_reader.cpp              |  18 ++
 src/data_readers/data_reader_image.cpp        |  11 --
 src/data_readers/data_reader_imagenet.cpp     |   8 -
 src/data_store/data_store_conduit.cpp         | 170 +++++++++++++++---
 src/proto/proto_common.cpp                    |   3 +-
 8 files changed, 179 insertions(+), 57 deletions(-)

diff --git a/include/lbann/data_readers/data_reader.hpp b/include/lbann/data_readers/data_reader.hpp
index 219fc84793f..f63237b0401 100644
--- a/include/lbann/data_readers/data_reader.hpp
+++ b/include/lbann/data_readers/data_reader.hpp
@@ -99,7 +99,8 @@ class generic_data_reader : public lbann_image_preprocessor {
     m_procs_per_partition(1),
     m_io_thread_pool(nullptr),
     m_jag_partitioned(false),
-    m_model(nullptr)
+    m_model(nullptr),
+    m_issue_warning(true)
   {}
   generic_data_reader(const generic_data_reader&) = default;
   generic_data_reader& operator=(const generic_data_reader&) = default;
@@ -249,16 +250,7 @@ class generic_data_reader : public lbann_image_preprocessor {
    * Set an identifier for the dataset.
    * The role should be one of "train", "test", or "validate".
    */
-  virtual void set_role(std::string role) {
-    m_role = role;
-    if (options::get()->has_string("jag_partitioned")
-        && get_role() == "train") {
-      m_jag_partitioned = true;
-      if (is_master()) {
-        std::cerr << "USING JAG DATA PARTITIONING\n";
-      }
-    }
-  }
+  virtual void set_role(std::string role);

   /**
   * Get the role for this dataset.
@@ -907,6 +899,11 @@
   /// etc.
   void set_jag_variables(int mb_size);
   model *m_model;
+
+  /// for use with data_store: issue a warning a single time if m_data_store != nullptr,
+  /// but we're not retrieving a conduit::Node from the store. This typically occurs
+  /// during the test phase
+  bool m_issue_warning;
 };

 template

diff --git a/include/lbann/data_readers/data_reader_imagenet.hpp b/include/lbann/data_readers/data_reader_imagenet.hpp
index 7335c918137..2c8837472a1 100644
--- a/include/lbann/data_readers/data_reader_imagenet.hpp
+++ b/include/lbann/data_readers/data_reader_imagenet.hpp
@@ -38,6 +38,8 @@ class imagenet_reader : public image_data_reader {
   imagenet_reader(bool shuffle) = delete;
   imagenet_reader(const std::shared_ptr<cv_process>& pp, bool shuffle = true);
   imagenet_reader(const imagenet_reader&);
+  imagenet_reader(const imagenet_reader&, const std::vector<int>& ds_sample_move_list);
+  imagenet_reader(const imagenet_reader&, const std::vector<int>& ds_sample_move_list, std::string role);
   imagenet_reader& operator=(const imagenet_reader&);
   ~imagenet_reader() override;
@@ -55,7 +57,6 @@ class imagenet_reader : public image_data_reader {
   virtual CPUMat create_datum_view(CPUMat& X, const int mb_idx) const;
   bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override;

- protected:
   /// preprocessor duplicated for each omp thread
   std::vector<std::unique_ptr<cv_process>> m_pps;
   std::unique_ptr<cv_process> m_master_pps;

diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp
index 992a9e652b4..22531b327e9 100644
--- a/include/lbann/data_store/data_store_conduit.hpp
+++ b/include/lbann/data_store/data_store_conduit.hpp
@@ -167,6 +167,9 @@ class data_store_conduit {
   /// for use during development and debugging
   int get_data_size() { return m_data.size(); }

+  /// made public for debugging during development
+  void copy_members(const data_store_conduit& rhs, const std::vector<int>& = std::vector<int>());
+
 protected :

   /// records the number of times exchange_mini_batch_data has been called
@@ -174,7 +177,6 @@ protected :

   bool m_is_setup;

-  void copy_members(const data_store_conduit& rhs, const std::vector<int>& = std::vector<int>());

   generic_data_reader *m_reader;

   lbann_comm *m_comm;

diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp
index df99a65be19..17873faa91b 100644
--- a/src/data_readers/data_reader.cpp
+++ b/src/data_readers/data_reader.cpp
@@ -726,6 +726,10 @@ void generic_data_reader::instantiate_data_store(const std::vector<int>& local_l
     LBANN_ERROR("shuffled_indices.size() == 0");
   }

+  if (opts->get_bool("node_sizes_vary")) {
+    m_data_store->set_node_sizes_vary();
+  }
+
  //a call to m_data_store->check_mem_capacity(...) should go here, but
 //at the moment that depends on the sample_list class, which it shouldn't
 //TODO: revisit
@@ -810,4 +814,18 @@ void generic_data_reader::set_mini_batch_size(const int s) {
   m_mini_batch_size = s;
 }

+void generic_data_reader::set_role(std::string role) {
+  m_role = role;
+  if (options::get()->has_string("jag_partitioned")
+      && get_role() == "train") {
+    m_jag_partitioned = true;
+    if (is_master()) {
+      std::cerr << "USING JAG DATA PARTITIONING\n";
+    }
+  }
+  if (m_data_store != nullptr) {
+    m_data_store->set_role(role);
+  }
+}
+
 } // namespace lbann

diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp
index 03124053fc2..4d4f58dc50b 100644
--- a/src/data_readers/data_reader_image.cpp
+++ b/src/data_readers/data_reader_image.cpp
@@ -36,14 +36,12 @@ namespace lbann {

 image_data_reader::image_data_reader(bool shuffle)
   : generic_data_reader(shuffle) {
-if (is_master()) std::cout << "XX image_data_reader(bool shuffle); role: " << get_role() << "\n";
   set_defaults();
 }

 image_data_reader::image_data_reader(const image_data_reader& rhs)
   : generic_data_reader(rhs)
 {
-if (is_master()) std::cout << "XX image_data_reader copy ctor; role: " << get_role() << "\n";
   copy_members(rhs);
 }

 image_data_reader::image_data_reader(const image_data_reader& rhs,const std::vector<int>& ds_sample_move_list, std::string role)
   : generic_data_reader(rhs)
 {
   set_role(role);
-if (is_master()) std::cout << "XX copy ctor, ds_sample_move_list: " << ds_sample_move_list.size() << "; role: " << get_role() << "\n";
   copy_members(rhs, ds_sample_move_list);
 }

 image_data_reader::image_data_reader(const image_data_reader& rhs,const std::vector<int>& ds_sample_move_list)
   : generic_data_reader(rhs)
 {
-if (is_master()) std::cout << "XX copy ctor, ds_sample_move_list: " << ds_sample_move_list.size() << "; role: " << get_role() << "\n";
   copy_members(rhs, ds_sample_move_list);
 }
@@ -76,7 +72,6 @@ image_data_reader& image_data_reader::operator=(const image_data_reader& rhs) {
 }

 void image_data_reader::copy_members(const image_data_reader &rhs, const std::vector<int>& ds_sample_move_list) {
-  if (is_master()) std::cout << "XX image_data_reader::copy_members; role: " << get_role() << " ds_sample_move_list.size: " << ds_sample_move_list.size() << "\n";

   if(rhs.m_data_store != nullptr) {
     if(ds_sample_move_list.size() == 0) {
@@ -87,11 +82,6 @@
       m_data_store->set_data_reader_ptr(this);
   }

-  if(m_data_store != nullptr) {
-    if (m_data_store->m_output) {
-      m_data_store->m_output << "image_data_reader::copy_members; role: " << get_role() << " ds_sample_move_list size: " << ds_sample_move_list.size() << "\n";
-    }
-  }
   m_image_dir = rhs.m_image_dir;
   m_image_list = rhs.m_image_list;
   m_image_width = rhs.m_image_width;
@@ -100,7 +90,6 @@
   m_image_linearized_size = rhs.m_image_linearized_size;
   m_num_labels = rhs.m_num_labels;
   //m_thread_cv_buffer = rhs.m_thread_cv_buffer
-if (is_master()) std::cout << "XX image_data_reader::copy_members; role: " << get_role() << "\n";
 }

diff --git a/src/data_readers/data_reader_imagenet.cpp b/src/data_readers/data_reader_imagenet.cpp
index 21a2eaf96bb..c3025e7390a 100644
--- a/src/data_readers/data_reader_imagenet.cpp
+++ b/src/data_readers/data_reader_imagenet.cpp
@@ -42,8 +42,6 @@ imagenet_reader::imagenet_reader(const std::shared_ptr<cv_process>& pp, bool shu
   }

   m_master_pps = lbann::make_unique<cv_process>(*pp);
-
-  if (is_master()) std::cout << "XX imagenet_reader ctor, pp, shuffle\n";
 }

 imagenet_reader::imagenet_reader(const imagenet_reader& rhs)
@@ -50,9 +52,33 @@ imagenet_reader::imagenet_reader(const imagenet_reader& rhs)
     LBANN_ERROR("construction error: no image processor");
   }
   m_master_pps = lbann::make_unique<cv_process>(*rhs.m_master_pps);
-  if (is_master()) std::cout << "XX imagenet_reader copy ctor\n";
 }


 imagenet_reader::imagenet_reader(const imagenet_reader& rhs, const std::vector<int>& ds_sample_move_list, std::string role)
   : image_data_reader(rhs, ds_sample_move_list) {
   if (!rhs.m_master_pps) {
     LBANN_ERROR("construction error: no image processor");
   }
   m_master_pps = lbann::make_unique<cv_process>(*rhs.m_master_pps);
   set_role(role);
-
-  if (is_master()) std::cout << "XX imagenet_reader copy ctor, ds_sample_list size: " << ds_sample_move_list.size() << "\n";
 }

 imagenet_reader::imagenet_reader(const imagenet_reader& rhs, const std::vector<int>& ds_sample_move_list)
   : image_data_reader(rhs, ds_sample_move_list) {
   if (!rhs.m_master_pps) {
     LBANN_ERROR("construction error: no image processor");
   }
   m_master_pps = lbann::make_unique<cv_process>(*rhs.m_master_pps);
-
-  if (is_master()) std::cout << "XX imagenet_reader copy ctor, ds_sample_list size: " << ds_sample_move_list.size() << "\n";
 }

 imagenet_reader& imagenet_reader::operator=(const imagenet_reader& rhs) {
-  if (is_master()) std::cout << "XX imagenet_reader operator=\n";
   // check for self-assignment
   if (this == &rhs) {
     return (*this);

diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp
index ff6a04dc01e..c9533a45ae8 100644
--- a/src/data_store/data_store_conduit.cpp
+++ b/src/data_store/data_store_conduit.cpp
@@ -47,7 +47,8 @@ data_store_conduit::data_store_conduit(
     m_owner_map_mb_size(0),
     m_super_node(false),
     m_compacted_sample_size(0),
-    m_is_local_cache(false) {
+    m_is_local_cache(false),
+    m_node_sizes_vary(false) {
   m_comm = m_reader->get_comm();
   if (m_comm == nullptr) {
     LBANN_ERROR(" m_comm is nullptr");
   }
@@ -57,24 +58,33 @@
   m_trainer_master = m_comm->am_trainer_master();
   m_rank_in_trainer = m_comm->get_rank_in_trainer();
   m_np_in_trainer = m_comm->get_procs_per_trainer();
-
+
   options *opts = options::get();
   m_super_node = opts->get_bool("super_node");

+  if (opts->get_bool("debug")) {
+    std::stringstream ss;
+    ss << "debug_" << m_reader->get_role() << "." << m_comm->get_rank_in_world();
+    m_output.open(ss.str().c_str());
+  }
+
   m_is_local_cache = opts->get_bool("data_store_cache");
   if (m_is_local_cache && opts->get_bool("preload_data_store")) {
     LBANN_ERROR("you cannot use both of these options: --data_store_cache --preload_data_store");
   }
 }

-data_store_conduit::~data_store_conduit() {}
+data_store_conduit::~data_store_conduit() {
+  if (m_output) {
+    m_output.close();
+  }
+}

 data_store_conduit::data_store_conduit(const data_store_conduit& rhs) {
   copy_members(rhs);
 }

 data_store_conduit::data_store_conduit(const data_store_conduit& rhs, const std::vector<int>& ds_sample_move_list) {
-
   copy_members(rhs, ds_sample_move_list);
 }

@@ -87,6 +97,15 @@ data_store_conduit& data_store_conduit::operator=(const data_store_conduit& rhs)
   return (*this);
 }

+void data_store_conduit::set_role(const std::string role) {
+
+  if (options::get()->get_bool("debug")) {
+    std::stringstream ss;
+    ss << "debug_" << m_reader->get_role() << "."
+       << m_comm->get_rank_in_world();
+    m_output.open(ss.str().c_str());
+  }
+}
+
 void data_store_conduit::copy_members(const data_store_conduit& rhs, const std::vector<int>& ds_sample_move_list) {
   m_n = rhs.m_n;
   m_is_setup = rhs.m_is_setup;
@@ -104,18 +123,34 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std::
   m_super_node = rhs.m_super_node;
   m_compacted_sample_size = rhs.m_compacted_sample_size;
   m_is_local_cache = rhs.m_is_local_cache;
+  m_node_sizes_vary = rhs.m_node_sizes_vary;
+
+  /// This block needed when carving a validation set from the training set
+  if (options::get()->get_bool("debug") && !m_output) {
+    std::stringstream ss;
+    ss << "debug_" << m_reader->get_role() << "." << m_comm->get_rank_in_world();
+  }

   if(ds_sample_move_list.size() == 0) {
     m_data = rhs.m_data;
   } else {
     /// Move indices on the list from the data and owner maps in the RHS data store to the new data store
     for(auto&& i : ds_sample_move_list) {
+
+      if (m_output) {
+        rhs.m_output << "next ds_sample_move_list index: " << i << " is it in rhs? " << (rhs.m_data.find(i) != rhs.m_data.end()) << " rhs.m_data.size: " << rhs.m_data.size() << "\n";
+      }
+
       if(rhs.m_data.find(i) != rhs.m_data.end()){
-        conduit::Node node = rhs.m_data[i]["data"];
+        if (!m_super_node) {
+          /// Repack the nodes because they don't seem to copy correctly
+          build_node_for_sending(rhs.m_data[i]["data"], m_data[i]);
+        } else {
+          m_data[i] = rhs.m_data[i];
+        }
         rhs.m_data.erase(i);
-        /// Repack the nodes because they don't seem to copy correctly
-        build_node_for_sending(node, m_data[i]);
       }
+
       /// Remove migrated nodes from the original data store's owner list
       if(rhs.m_owner.find(i) != rhs.m_owner.end()) {
         m_owner[i] = rhs.m_owner[i];
         rhs.m_owner.erase(i);
       }
     }
   }

-
   /// Clear the pointer to the data reader, this cannot be copied
   m_reader = nullptr;
   m_shuffled_indices = nullptr;
@@ -191,11 +225,14 @@ void data_store_conduit::setup_data_store_buffers() {
 // handle things ourselves. TODO: possibly modify conduit to
 // handle non-blocking comms
 void data_store_conduit::exchange_data_by_super_node(size_t current_pos, size_t mb_size) {
-
   if (! m_is_setup) {
     LBANN_ERROR("setup(mb_size) has not been called");
   }

+  if (m_output) {
+    m_output << "starting data_store_conduit::exchange_data_by_super_node; mb_size: " << mb_size << std::endl;
+  }
+
   if (m_n == 0) {
     setup_data_store_buffers();
   }
     for (auto idx : m_indices_to_send[p]) {
       m_send_buffer[p].update_external(m_data[idx]);
     }
+    if (m_output) {
+      m_output << "2. calling build_node_for_sending\n";
+    }
     build_node_for_sending(m_send_buffer[p], m_send_buffer_2[p]);
   }
@@ -256,6 +296,9 @@ void data_store_conduit::exchange_data_by_super_node(size_t current_pos, size_t
   m_minibatch_data.clear();
   for (int p=0; p<m_np_in_trainer; p++) {
     const std::vector<std::string> &names = m_reconstituted[p].child_names();
     for (auto &t : names) {
+      if (m_output) {
+        m_output << "next name: " << t << std::endl;
+      }
       m_minibatch_data[atoi(t.c_str())][t].update_external(m_reconstituted[p][t]);
     }
   }
+
+  if (m_output) {
+    m_output << "m_minibatch_data.size(): " << m_minibatch_data.size() << "; indices: ";
+    for (auto t : m_minibatch_data) {
+      m_output << t.first << " ";
+    }
+    m_output << std::endl;
+  }
 }

 void data_store_conduit::set_preloaded_conduit_node(int data_id, conduit::Node &node) {
   // note: at this point m_data[data_id] = node
   // note: if running in super_node mode, nothing to do
+  // note2: this may depend on the particular data reader
   if (!m_super_node) {
+    if (m_output) {
+      m_output << "set_preloaded_conduit_node: " << data_id << " for non-super_node mode\n";
+    }
     conduit::Node n2 = node;
+    if (m_output) {
+      m_output << "3. calling build_node_for_sending\n";
+    }
     build_node_for_sending(n2, m_data[data_id]);
     error_check_compacted_node(m_data[data_id], data_id);
-  }
+  } else {
+    if (m_data.find(data_id) == m_data.end()) {
+      m_data[data_id] = node;
+      if (m_output) {
+        m_output << "set_preloaded_conduit_node: " << data_id << " for super_node mode\n";
+      }
+    } else {
+      if (m_output) {
+        m_output << "set_preloaded_conduit_node: " << data_id << " is already in m_data\n";
+      }
+    }
+  }
 }

 void data_store_conduit::error_check_compacted_node(const conduit::Node &nd, int data_id) {
-  if(m_compacted_sample_size == 0) {
+  if (m_compacted_sample_size == 0) {
     m_compacted_sample_size = nd.total_bytes_compact();
-  } else if(m_compacted_sample_size != nd.total_bytes_compact()) {
+  } else if (m_compacted_sample_size != nd.total_bytes_compact() && !m_node_sizes_vary) {
     LBANN_ERROR("Conduit node being added data_id: " + std::to_string(data_id)
                 + " is not the same size as existing nodes in the data_store "
                 + std::to_string(m_compacted_sample_size) + " != "
                 + std::to_string(nd.total_bytes_compact())
                 + " role: " + m_reader->get_role());
   }
-  if(!nd.is_contiguous()) {
+  if (!nd.is_contiguous()) {
     LBANN_ERROR("m_data[" + std::to_string(data_id) + "] does not have a contiguous layout");
   }
-  if(nd.data_ptr() == nullptr) {
+  if (nd.data_ptr() == nullptr) {
     LBANN_ERROR("m_data[" + std::to_string(data_id) + "] does not have a valid data pointer");
   }
-  if(nd.contiguous_data_ptr() == nullptr) {
+  if (nd.contiguous_data_ptr() == nullptr) {
     LBANN_ERROR("m_data[" + std::to_string(data_id) + "] does not have a valid contiguous data pointer");
   }
 }
@@ -318,6 +390,10 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool
     LBANN_ERROR("duplicate data_id: " + std::to_string(data_id) + " in data_store_conduit::set_conduit_node");
   }

+  if (m_output) {
+    m_output << "set_conduit_node: " << data_id << std::endl;
+  }
+
   if (already_have && is_local_cache()) {
     if (m_data.find(data_id) == m_data.end()) {
       LBANN_ERROR("you claim the passed node was obtained from this data_store, but the data_id (" + std::to_string(data_id) + ") doesn't exist in m_data");
     }
     return;
   }

-  if (m_owner[data_id] != m_rank_in_trainer) {
+  if (is_local_cache()) {
+    m_data[data_id] = node;
+  }
+
+  else if (m_owner[data_id] != m_rank_in_trainer) {
     std::stringstream s;
"set_conduit_node error for data id: "<get_role() << "\n"; LBANN_ERROR(s.str()); } - if (is_local_cache()) { - m_data[data_id] = node; - } - else if (! m_super_node) { + if (m_output) { + m_output << "4. calling build_node_for_sending\n"; + } build_node_for_sending(node, m_data[data_id]); error_check_compacted_node(m_data[data_id], data_id); } @@ -350,6 +429,9 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool } const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { + if (m_output) { + m_output << "get_conduit_node: " << data_id << std::endl; + } /** * dah: commenting this out since it gives a false positive for test * case with unshuffled indices. Since we currently send samples @@ -368,7 +450,7 @@ const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { if (is_local_cache()) { std::unordered_map::const_iterator t3 = m_data.find(data_id); if (t3 == m_data.end()) { - LBANN_ERROR("failed to find data_id: " + std::to_string(data_id) + " in m_data; m_data.size: " + std::to_string(m_data.size())); + LBANN_ERROR("(local cache) failed to find data_id: " + std::to_string(data_id) + " in m_data; m_data.size: " + std::to_string(m_data.size())); } return t3->second; } @@ -382,6 +464,13 @@ const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { return t3->second["data"]; } LBANN_ERROR("failed to find data_id: " + std::to_string(data_id) + " in m_minibatch_data; m_minibatch_data.size: " + std::to_string(m_minibatch_data.size())+ " and also failed to find it in m_data; m_data.size: " + std::to_string(m_data.size()) + "; role: " + m_reader->get_role()); + if (m_output) { + m_output << "failed to find data_id: " << data_id << " in m_minibatch_data; my m_minibatch_data indices: "; + for (auto t : m_minibatch_data) { + m_output << t.first << " "; + } + m_output << std::endl; + } } return t2->second; @@ -390,6 +479,9 @@ const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { // code in the following method is a modification of code from // conduit/src/libs/relay/conduit_relay_mpi.cpp void data_store_conduit::build_node_for_sending(const conduit::Node &node_in, conduit::Node &node_out) { + if (m_output) { + m_output << "starting build_node_for_sending\n"; + } node_out.reset(); conduit::Schema s_data_compact; @@ -429,6 +521,10 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s LBANN_ERROR("setup(mb_size) has not been called"); } + if (m_output) { + m_output << "starting data_store_conduit::exchange_data_by_sample; mb_size: " << mb_size << std::endl; + } + int num_send_req = build_indices_i_will_send(current_pos, mb_size); int num_recv_req = build_indices_i_will_recv(current_pos, mb_size); @@ -534,6 +630,9 @@ int data_store_conduit::build_indices_i_will_send(int current_pos, int mb_size) m_indices_to_send.clear(); m_indices_to_send.resize(m_np_in_trainer); int k = 0; + if (m_output) { + m_output << "build_indices_i_will_send; cur pos: " << current_pos << " mb_size: " << mb_size << " m_data.size: " << m_data.size() << "\n"; + } for (int i = current_pos; i < current_pos + mb_size; i++) { auto index = (*m_shuffled_indices)[i]; /// If this rank owns the index send it to the (i%m_np)'th rank @@ -563,11 +662,14 @@ void data_store_conduit::build_preloaded_owner_map(const std::vector& per_r per_rank_list_range_start += per_rank_list_size; } m_owner[i] = owning_rank; + if (m_output) { + m_output << "m_owner[" << i << "] = " << owning_rank << std::endl; + } } } 
void data_store_conduit::build_owner_map(int mini_batch_size) { - if (m_world_master) std::cout << "starting data_store_conduit::build_owner_map for role: " << m_reader->get_role() << " with mini_batch_size: " << mini_batch_size << "\n"; + if (m_world_master) std::cout << "starting data_store_conduit::build_owner_map for role: " << m_reader->get_role() << " with mini_batch_size: " << mini_batch_size << " num indices: " << m_shuffled_indices->size() << "\n"; if (mini_batch_size == 0) { LBANN_ERROR("mini_batch_size == 0; can't build owner_map"); } @@ -609,6 +711,9 @@ conduit::Node & data_store_conduit::get_empty_node(int data_id) { } void data_store_conduit::purge_unused_samples(const std::vector& indices) { + if (m_output) { + m_output << " purge_unused_samples; indices.size(): " << indices.size() << " data.size(): " << m_data.size() << std::endl; + } /// Remove unused indices from the data and owner maps for(auto&& i : indices) { if(m_data.find(i) != m_data.end()){ @@ -621,12 +726,25 @@ void data_store_conduit::purge_unused_samples(const std::vector& indices) { } void data_store_conduit::compact_nodes() { + if (m_super_node) { + if (m_output) { + m_output << "RETURNING from data_store_conduit::compact_nodes; m_data.size(): " << m_data.size() << "\n"; + } + return; + } else { + if (m_output) { + m_output << ">> NOT RETURNING from data_store_conduit::compact_nodes\n"; + } + } for(auto&& j : *m_shuffled_indices) { if(m_data.find(j) != m_data.end()){ if(!m_data[j].is_contiguous()) { /// Repack the nodes because they don't seem to copy correctly conduit::Node node = m_data[j]["data"]; m_data.erase(j); + if (m_output) { + m_output << "5. calling build_node_for_sending\n"; + } build_node_for_sending(node, m_data[j]); } } @@ -793,9 +911,15 @@ void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string bool data_store_conduit::has_conduit_node(int data_id) const { std::unordered_map::const_iterator t = m_data.find(data_id); - return t == m_data.end(); + if (m_output) { + m_output << "has_conduit_node( " << data_id << " ) = " << (t == m_data.end()) << std::endl; + } + return t != m_data.end(); } +void data_store_conduit::set_shuffled_indices(const std::vector *indices) { + m_shuffled_indices = indices; +} } // namespace lbann diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 904cf93bf48..b9166ec27fb 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -431,8 +431,7 @@ void init_data_readers( } else if (name == "numpy_npz_conduit_reader") { reader_validation = new numpy_npz_conduit_reader(*dynamic_cast(reader)); } else if (name == "imagenet") { - reader_validation = new imagenet_reader(*dynamic_cast(reader)); - reader_validation = new numpy_npz_conduit_reader(*dynamic_cast(reader)); + reader_validation = new imagenet_reader(*dynamic_cast(reader), reader->get_unused_indices()); } else if (name == "imagenet_patches") { reader_validation = new imagenet_reader_patches(*dynamic_cast(reader)); } else if (name == "multihead_siamese") { From 509d37dbdac21e8164211c6b205dd97441fff788 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Mon, 13 May 2019 13:36:26 -0700 Subject: [PATCH 013/634] added print statement to show when operating in local cache mode --- src/data_store/data_store_conduit.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index c9533a45ae8..33f3a6b6a72 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -72,6 +72,9 @@ data_store_conduit::data_store_conduit( if (m_is_local_cache && opts->get_bool("preload_data_store")) { LBANN_ERROR("you cannot use both of these options: --data_store_cache --preload_data_store"); } + if (m_world_master) { + std::cout << "data_store_conduit is running in local_cache mode\n"; + } } data_store_conduit::~data_store_conduit() { From 711eec80b87e2d3b133904d2796189d0c1ef9ad4 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 13 May 2019 15:30:42 -0700 Subject: [PATCH 014/634] Update Weekly Develop --- bamboo/integration_tests/test_integration_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index a171184ba5e..fef3a0d267c 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -154,8 +154,8 @@ def skeleton_performance_full_alexnet(cluster, dir_name, executables, should_log = True output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - if cluster in ['catalyst', 'surface']: - command = 'salloc %s/bamboo/integration_tests/%s.sh > %s' % (dir_name, model_name, output_file_name) + if cluster in ['catalyst', 'pascal', 'surface']: + command = 'salloc --nodes 128 %s/bamboo/integration_tests/%s.sh > %s 2> %s' % (dir_name, model_name, output_file_name, error_file_name) elif cluster == 'ray': e = 'skeleton_performance_full_alexnet: Ray is unsupported for skeleton_performance_full_alexnet' print('Skip - ' + e) From 6f68f6eacac90d732cfa1b32bb8b7dae1fe7b160 Mon Sep 17 00:00:00 2001 From: Yosuke Oyama <17844184+oyamay@users.noreply.github.com> Date: Tue, 14 May 2019 00:42:24 -0700 Subject: [PATCH 015/634] Set Adam's default parameters --- model_zoo/cosmoflow/cosmoflow.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/model_zoo/cosmoflow/cosmoflow.py b/model_zoo/cosmoflow/cosmoflow.py index 58cb183dafc..64ac9b0cd51 100755 --- a/model_zoo/cosmoflow/cosmoflow.py +++ b/model_zoo/cosmoflow/cosmoflow.py @@ -273,7 +273,10 @@ def create_data_reader(train_path, val_path, test_path): callbacks=callbacks) # Setup optimizer -opt = lbann.Adam(learn_rate=args.learn_rate) +opt = lbann.Adam(learn_rate=args.learn_rate, + beta1=0.9, + beta2=0.99, + eps=1e-8) # Setup data reader data_reader_proto = create_data_reader(args.train_path, From 0b20b0a5690ebd2388fc72f898799cb1fffc3f85 Mon Sep 17 00:00:00 2001 From: Yosuke Oyama <17844184+oyamay@users.noreply.github.com> Date: Tue, 14 May 2019 01:37:22 -0700 Subject: [PATCH 016/634] Disable a data reader if data_filename path is not specified --- model_zoo/cosmoflow/cosmoflow.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/model_zoo/cosmoflow/cosmoflow.py b/model_zoo/cosmoflow/cosmoflow.py index 64ac9b0cd51..085b3359b3d 100755 --- a/model_zoo/cosmoflow/cosmoflow.py +++ 
b/model_zoo/cosmoflow/cosmoflow.py @@ -163,11 +163,12 @@ def create_dropout(x, i): return x def create_data_reader(train_path, val_path, test_path): - readerArgs = [ - {"role": "train", "data_filename": train_path}, - {"role": "validate", "data_filename": val_path}, - {"role": "test", "data_filename": test_path}, - ] + readerArgs = [] + for role, data_filename in [("train", train_path), + ("validate", val_path), + ("test", test_path)]: + if data_filename != "": + readerArgs.append({"role": role, "data_filename": data_filename}) readers = [] for readerArg in readerArgs: From 1e2fb97c1d6828cc31db87e3a955169b21ddd4c2 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Tue, 14 May 2019 09:59:23 -0700 Subject: [PATCH 017/634] Add callback for dropout perturbation. Add a flag to determine whether LTFB exchange hyperparameter --- include/lbann/callbacks/callback_ltfb.hpp | 5 + include/lbann/layers/regularizers/dropout.hpp | 9 ++ include/lbann/lbann.hpp | 1 + src/callbacks/CMakeLists.txt | 1 + src/callbacks/callback_ltfb.cpp | 50 +++++--- src/callbacks/callback_perturb_dropout.cpp | 120 ++++++++++++++++++ src/proto/factories/callback_factory.cpp | 7 + src/proto/lbann.proto | 6 + 8 files changed, 179 insertions(+), 20 deletions(-) create mode 100644 src/callbacks/callback_perturb_dropout.cpp diff --git a/include/lbann/callbacks/callback_ltfb.hpp b/include/lbann/callbacks/callback_ltfb.hpp index f5823cd3b14..75de120090a 100644 --- a/include/lbann/callbacks/callback_ltfb.hpp +++ b/include/lbann/callbacks/callback_ltfb.hpp @@ -120,6 +120,7 @@ class lbann_callback_ltfb : public lbann_callback { std::set weights_names = std::set(), bool low_score_wins = false, communication_algorithm comm_algo = communication_algorithm::sendrecv_weights, + bool exchange_hyperparameters = false, lbann_summary *summarizer = nullptr); lbann_callback_ltfb(const lbann_callback_ltfb& other); lbann_callback_ltfb& operator=(const lbann_callback_ltfb& other); @@ -154,6 +155,10 @@ class lbann_callback_ltfb : public lbann_callback { /** Inter-trainer communication scheme. */ communication_algorithm m_comm_algo; + + /** Whether to exchange training hyperparameters between trainers + */ + bool m_exchange_hyperparameters; /** Workspace weights. * diff --git a/include/lbann/layers/regularizers/dropout.hpp b/include/lbann/layers/regularizers/dropout.hpp index 3fa652b07da..f2c6ec3274e 100644 --- a/include/lbann/layers/regularizers/dropout.hpp +++ b/include/lbann/layers/regularizers/dropout.hpp @@ -29,6 +29,7 @@ #include "lbann/layers/regularizers/regularizer.hpp" #include "lbann/utils/cudnn.hpp" +#include "lbann/utils/random.hpp" namespace lbann { @@ -121,6 +122,14 @@ class dropout : public regularizer_layer { desc.add("Keep probability", m_keep_prob); return desc; } + /** @brief get prob for keep each unit. */ + EvalType get_keep_prob() const { + return m_keep_prob; + } + /** @brief set prob for keep each unit. 
*/ + void set_keep_prob(EvalType keep_prob) { + m_keep_prob = keep_prob; + } protected: diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 3a15b6b03ca..e648c750905 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -166,6 +166,7 @@ #include "lbann/callbacks/callback_check_gradients.hpp" #include "lbann/callbacks/callback_check_metric.hpp" #include "lbann/callbacks/callback_perturb_adam.hpp" +#include "lbann/callbacks/callback_perturb_dropout.hpp" /// Weights and weight initializers #include "lbann/weights/weights.hpp" diff --git a/src/callbacks/CMakeLists.txt b/src/callbacks/CMakeLists.txt index 2b29975b44b..b38b1c11107 100644 --- a/src/callbacks/CMakeLists.txt +++ b/src/callbacks/CMakeLists.txt @@ -33,6 +33,7 @@ set_full_path(THIS_DIR_SOURCES profiler.cpp callback_replace_weights.cpp callback_gpu_memory_usage.cpp + callback_perturb_dropout.cpp ) # Propagate the files up the tree diff --git a/src/callbacks/callback_ltfb.cpp b/src/callbacks/callback_ltfb.cpp index 79fbf058243..add3af46954 100644 --- a/src/callbacks/callback_ltfb.cpp +++ b/src/callbacks/callback_ltfb.cpp @@ -97,7 +97,8 @@ void exchange_models__sendrecv_weights(lbann_comm& comm, El::Int partner_trainer, const std::set& weights_names, const std::vector& send_weights, - std::vector& recv_weights) { + std::vector& recv_weights, + bool exchange_hyperparameters) { // Get partner process const El::Int rank_in_trainer = comm.get_rank_in_trainer(); @@ -127,19 +128,21 @@ void exchange_models__sendrecv_weights(lbann_comm& comm, const auto* send_sgd = dynamic_cast(send_opt); auto* recv_sgd = dynamic_cast(recv_opt); if (send_sgd != nullptr && recv_sgd != nullptr) { - using hyperparameters_type = std::tuple; - hyperparameters_type hyperparameters(send_sgd->get_learning_rate(), + if(exchange_hyperparameters) { + using hyperparameters_type = std::tuple; + hyperparameters_type hyperparameters(send_sgd->get_learning_rate(), send_sgd->get_momentum(), send_sgd->using_nesterov()); - El::mpi::SendRecv(reinterpret_cast(&hyperparameters), + El::mpi::SendRecv(reinterpret_cast(&hyperparameters), sizeof(hyperparameters_type), partner_rank_in_world, partner_rank_in_world, comm.get_world_comm(), El::SyncInfo{}); - recv_sgd->set_learning_rate(std::get<0>(hyperparameters)); - recv_sgd->set_momentum(std::get<1>(hyperparameters)); - recv_sgd->set_nesterov(std::get<2>(hyperparameters)); + recv_sgd->set_learning_rate(std::get<0>(hyperparameters)); + recv_sgd->set_momentum(std::get<1>(hyperparameters)); + recv_sgd->set_nesterov(std::get<2>(hyperparameters)); + } El::SendRecv(send_sgd->get_velocity().LockedMatrix(), recv_sgd->get_velocity().Matrix(), comm.get_world_comm(), @@ -149,30 +152,32 @@ void exchange_models__sendrecv_weights(lbann_comm& comm, const auto* send_adam = dynamic_cast(send_opt); auto* recv_adam = dynamic_cast(recv_opt); if (send_adam != nullptr && recv_adam != nullptr) { - using hyperparameters_type = std::tuple; - hyperparameters_type hyperparameters(send_adam->get_learning_rate(), + if(exchange_hyperparameters) { + using hyperparameters_type = std::tuple; + hyperparameters_type hyperparameters(send_adam->get_learning_rate(), send_adam->get_beta1(), send_adam->get_beta2(), send_adam->get_eps(), send_adam->get_current_beta1(), send_adam->get_current_beta2()); - El::mpi::SendRecv(reinterpret_cast(&hyperparameters), + El::mpi::SendRecv(reinterpret_cast(&hyperparameters), sizeof(hyperparameters_type), partner_rank_in_world, partner_rank_in_world, comm.get_world_comm(), El::SyncInfo{}); - 
recv_adam->set_learning_rate(std::get<0>(hyperparameters)); - recv_adam->set_beta1(std::get<1>(hyperparameters)); - recv_adam->set_beta2(std::get<2>(hyperparameters)); - recv_adam->set_eps(std::get<3>(hyperparameters)); - recv_adam->set_current_beta1(std::get<4>(hyperparameters)); - recv_adam->set_current_beta2(std::get<5>(hyperparameters)); - El::SendRecv(send_adam->get_moment1().LockedMatrix(), + recv_adam->set_learning_rate(std::get<0>(hyperparameters)); + recv_adam->set_beta1(std::get<1>(hyperparameters)); + recv_adam->set_beta2(std::get<2>(hyperparameters)); + recv_adam->set_eps(std::get<3>(hyperparameters)); + recv_adam->set_current_beta1(std::get<4>(hyperparameters)); + recv_adam->set_current_beta2(std::get<5>(hyperparameters)); + El::SendRecv(send_adam->get_moment1().LockedMatrix(), recv_adam->get_moment1().Matrix(), comm.get_world_comm(), partner_rank_in_world, partner_rank_in_world); + } El::SendRecv(send_adam->get_moment2().LockedMatrix(), recv_adam->get_moment2().Matrix(), comm.get_world_comm(), @@ -320,19 +325,22 @@ lbann_callback_ltfb::lbann_callback_ltfb(El::Int batch_interval, std::set weights_names, bool low_score_wins, communication_algorithm comm_algo, + bool exchange_hyperparameters, lbann_summary *summarizer) : lbann_callback(batch_interval, summarizer), m_metric_name(std::move(metric_name)), m_weights_names(std::move(weights_names)), m_low_score_wins(low_score_wins), - m_comm_algo(comm_algo) {} + m_comm_algo(comm_algo), + m_exchange_hyperparameters(exchange_hyperparameters) {} lbann_callback_ltfb::lbann_callback_ltfb(const lbann_callback_ltfb& other) : lbann_callback(other), m_metric_name(other.m_metric_name), m_weights_names(other.m_weights_names), m_low_score_wins(other.m_low_score_wins), - m_comm_algo(other.m_comm_algo) { + m_comm_algo(other.m_comm_algo), + m_exchange_hyperparameters(other.m_exchange_hyperparameters) { // Deep copy m_workspace_weights.clear(); @@ -351,6 +359,7 @@ lbann_callback_ltfb& lbann_callback_ltfb::operator=(const lbann_callback_ltfb& o m_weights_names = other.m_weights_names; m_low_score_wins = other.m_low_score_wins; m_comm_algo = other.m_comm_algo; + m_exchange_hyperparameters = other.m_exchange_hyperparameters; // Deep copy m_workspace_weights.clear(); @@ -442,7 +451,8 @@ void lbann_callback_ltfb::on_batch_begin(model *m) { partner_trainer, m_weights_names, local_weights, - model_weights); + model_weights, + m_exchange_hyperparameters); break; case communication_algorithm::checkpoint_file: exchange_models__checkpoint_file(comm, diff --git a/src/callbacks/callback_perturb_dropout.cpp b/src/callbacks/callback_perturb_dropout.cpp new file mode 100644 index 00000000000..aa064be3954 --- /dev/null +++ b/src/callbacks/callback_perturb_dropout.cpp @@ -0,0 +1,120 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/callbacks/callback_perturb_dropout.hpp" +#include "lbann/utils/random.hpp" + +namespace lbann { + +lbann_callback_perturb_dropout::lbann_callback_perturb_dropout(EvalType keep_prob_factor, + std::set layer_names) + : lbann_callback(1), + m_keep_prob_factor(keep_prob_factor), + m_layer_names(std::move(layer_names)) {} + +void lbann_callback_perturb_dropout::setup(model* m) { + perturb(*m); +} + +template +dropout* lbann_callback_perturb_dropout::get_dropout_layer(Layer* l) { + if(auto d_layer = dynamic_cast*>(l)) return d_layer; + else return nullptr; +} + +void lbann_callback_perturb_dropout::perturb(model& m) { + auto* comm = m.get_comm(); + for (auto* l : m.get_layers()) { + if (l == nullptr) { + std::stringstream err; + err << "callback \"" << name() << "\" " + << "got a layer pointer that is a null pointer"; + LBANN_ERROR(err.str()); + } + if (m_layer_names.empty() + || m_layer_names.count(l->get_name()) > 0) { + + auto d_dp_cpu = get_dropout_layer(l); + auto d_mp_cpu = get_dropout_layer(l); + #ifdef LBANN_HAS_GPU + auto d_dp_gpu = get_dropout_layer(l); + auto d_mp_gpu = get_dropout_layer(l); + #endif + // Perturb dropout layer + if(d_dp_cpu != nullptr || d_mp_cpu != nullptr + #ifdef LBANN_HAS_GPU + || d_dp_gpu != nullptr || d_mp_gpu != nullptr + #endif + ) { + EvalType new_keep_prob; + if (comm->am_trainer_master()) { + + // Useful constants + constexpr EvalType zero = 0; + constexpr EvalType one = 1; + constexpr EvalType min_val = std::numeric_limits::min(); + + // RNG + auto& gen = get_generator(); + std::normal_distribution dist(zero, one); + + // Perturb log(keep_prob) + EvalType old_keep_prob = 0; + if (d_dp_cpu) old_keep_prob = d_dp_cpu->get_keep_prob(); + if (d_mp_cpu) old_keep_prob = d_mp_cpu->get_keep_prob(); + #ifdef LBANN_HAS_GPU + if (d_dp_gpu) old_keep_prob = d_dp_gpu->get_keep_prob(); + if (d_mp_gpu) old_keep_prob = d_mp_gpu->get_keep_prob(); + #endif + if (m_keep_prob_factor != zero && old_keep_prob >= zero) { + auto log_val = std::log(std::max(old_keep_prob, min_val)); + log_val += m_keep_prob_factor * dist(gen); + new_keep_prob = std::max(EvalType(0.5), std::min(std::exp(log_val),EvalType(1.0))); + std::cout << "Trainer [ " << comm->get_trainer_rank() << " ] keep prob changed from " + << old_keep_prob << " to " << new_keep_prob << std::endl; + } + + } + + // Communicate new keep prob from trainer master processes + comm->trainer_broadcast(comm->get_trainer_master(), new_keep_prob); + + // Update keep prob + if (d_dp_cpu) d_dp_cpu->set_keep_prob(new_keep_prob); + if (d_mp_cpu) d_mp_cpu->set_keep_prob(new_keep_prob); + #ifdef LBANN_HAS_GPU + if (d_dp_gpu) d_dp_gpu->set_keep_prob(new_keep_prob); + if (d_mp_gpu) d_mp_gpu->set_keep_prob(new_keep_prob); + #endif + + } + + } + } +} + + +} // namespace lbann diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index afd1f7ec370..d4f9fc774f6 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -104,6 +104,7 @@ 
lbann_callback* construct_callback(lbann_comm* comm, parse_set(params.weights()), params.low_score_wins(), lbann_callback_ltfb::string_to_comm_algo(params.communication_algorithm()), + params.exchange_hyperparameters(), summarizer); } /// @todo @@ -420,6 +421,12 @@ lbann_callback* construct_callback(lbann_comm* comm, parse_set(params.weights())); } + if (proto_cb.has_perturb_dropout()) { + const auto& params = proto_cb.perturb_dropout(); + return new lbann_callback_perturb_dropout( + params.keep_dropout_factor(), + parse_set(params.layers())); + } return nullptr; } diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 4e344af5330..d78f74ecef0 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -412,6 +412,7 @@ message Callback { CallbackConfusionMatrix confusion_matrix = 36; CallbackCheckMetric check_metric = 37; CallbackPerturbAdam perturb_adam = 38; + CallbackPerturbDropout perturb_dropout = 39; } message CallbackLTFB { @@ -420,6 +421,7 @@ message CallbackLTFB { string weights = 3; // default: all weights bool low_score_wins = 4; string communication_algorithm = 5; // default: "sendrecv_weights" + bool exchange_hyperparameters = 6; } message CallbackStepLearningRate { @@ -653,6 +655,10 @@ message CallbackPerturbAdam { string weights = 7; // Weights with Adam optimizer } +message CallbackPerturbDropout { + float keep_dropout_factor = 1; //Keep dropout prob perturbation (in log space) + string layers = 2; // dropout layers to perturb keep prob, all dropout layers by default +} //======================================================================== // Weights //======================================================================== From e9f0af537c611716724e3d4b65ab22c1791ad279 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Tue, 14 May 2019 14:23:10 -0700 Subject: [PATCH 018/634] Missing header --- .../callbacks/callback_perturb_dropout.hpp | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 include/lbann/callbacks/callback_perturb_dropout.hpp diff --git a/include/lbann/callbacks/callback_perturb_dropout.hpp b/include/lbann/callbacks/callback_perturb_dropout.hpp new file mode 100644 index 00000000000..05ef0402362 --- /dev/null +++ b/include/lbann/callbacks/callback_perturb_dropout.hpp @@ -0,0 +1,81 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_PERTURB_DROPOUT_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_PERTURB_DROPOUT_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include "lbann/layers/regularizers/dropout.hpp" +#include + +namespace lbann { + +/** @brief Hyperparameter exploration with dropouts. + * + * Goes through the dropout layers in a model and perturbs keep probability + */ +class lbann_callback_perturb_dropout : public lbann_callback { +public: + + /** @param keep_prob_factor Standard deviation of learning rate + * perturbation (in log space). + * @param layer_names Names of layers with dropout keep prob to perturb. If + * empty, all dropout layers in the model are + * perturbed. + */ + lbann_callback_perturb_dropout(EvalType keep_prob_factor, + std::set layer_names + = std::set()); + lbann_callback_perturb_dropout* copy() const override { return new lbann_callback_perturb_dropout(*this); } + std::string name() const override { return "perturb dropout"; } + + void setup(model* m); + +private: + + /** Standard deviation of keep probability perturbation. + * + * In log space. + */ + EvalType m_keep_prob_factor; + + /** Keep prob for these layers will be perturbed. + * + * If empty, all dropout layers in the model will be perturbed. + */ + std::set m_layer_names; + + template + dropout* get_dropout_layer(Layer* l); + + /** Perturb dropout keep prob in model. */ + void perturb(model& m); + +}; + +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_PERTURB_DROPOUT_HPP_INCLUDED From a465612e755d674dabf71f7ab86f34b039b321f2 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Tue, 14 May 2019 14:35:18 -0700 Subject: [PATCH 019/634] lbann_inf gracefully exits if checkpoint dir is not given --- model_zoo/lbann_inf.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/model_zoo/lbann_inf.cpp b/model_zoo/lbann_inf.cpp index fdc7161b272..daa4ac54458 100644 --- a/model_zoo/lbann_inf.cpp +++ b/model_zoo/lbann_inf.cpp @@ -60,11 +60,16 @@ int main(int argc, char *argv[]) { build_model_from_prototext(argc, argv, *pb_model, comm.get(), io_thread_pool, models.size() == 0)); } + for(auto&& m : models) { + bool loaded = lbann_callback_save_model::load_model_weights(opts->get_string("ckpt_dir"), m.get()); + if(!loaded) LBANN_ERROR("Unable to reload model"); + } // Load layer weights from checkpoint if checkpoint directory given if(opts->has_string("ckpt_dir")){ for(auto&& m : models) { - lbann_callback_save_model::load_model_weights(opts->get_string("ckpt_dir"), m.get()); + bool loaded = lbann_callback_save_model::load_model_weights(opts->get_string("ckpt_dir"), m.get()); + if(!loaded) LBANN_ERROR("Unable to reload model"); } }else { LBANN_ERROR("Unable to reload model"); From 6a92a6cbe8d7d6bbd96cf18ae259ea558c7bdd72 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Tue, 14 May 2019 19:20:08 -0700 Subject: [PATCH 020/634] Add flag that allow user specify full path to load trained model --- .../lbann/callbacks/callback_save_model.hpp | 6 ++++- model_zoo/lbann_inf.cpp | 8 +++--- src/callbacks/callback_save_model.cpp | 25 +++++++++++-------- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/include/lbann/callbacks/callback_save_model.hpp b/include/lbann/callbacks/callback_save_model.hpp index 8e6d8b6cfea..4061966b3c7 100644 --- a/include/lbann/callbacks/callback_save_model.hpp +++ b/include/lbann/callbacks/callback_save_model.hpp @@ -63,7 +63,11 @@ class 
lbann_callback_save_model : public lbann_callback { void on_train_end(model *m) override; bool save_model(model *m); bool save_model_weights(model *m); - static bool load_model_weights(std::string ckpt_dir, model *m); + /* ckptdir_is_fullpath flag if true + * allow user to specify full path to model weights to load + * and allow system to ignore appending trainer id, num of epochs/steps + * to default ckpt_dir*/ + static bool load_model_weights(std::string ckpt_dir, model *m, bool ckptdir_is_fullpath=false); std::string name() const override { return "save model"; } private: diff --git a/model_zoo/lbann_inf.cpp b/model_zoo/lbann_inf.cpp index daa4ac54458..919fd1f47b2 100644 --- a/model_zoo/lbann_inf.cpp +++ b/model_zoo/lbann_inf.cpp @@ -60,15 +60,13 @@ int main(int argc, char *argv[]) { build_model_from_prototext(argc, argv, *pb_model, comm.get(), io_thread_pool, models.size() == 0)); } - for(auto&& m : models) { - bool loaded = lbann_callback_save_model::load_model_weights(opts->get_string("ckpt_dir"), m.get()); - if(!loaded) LBANN_ERROR("Unable to reload model"); - } // Load layer weights from checkpoint if checkpoint directory given if(opts->has_string("ckpt_dir")){ for(auto&& m : models) { - bool loaded = lbann_callback_save_model::load_model_weights(opts->get_string("ckpt_dir"), m.get()); + bool loaded = lbann_callback_save_model::load_model_weights(opts->get_string("ckpt_dir"), + m.get(), + opts->get_bool("ckptdir_is_fullpath")); if(!loaded) LBANN_ERROR("Unable to reload model"); } }else { diff --git a/src/callbacks/callback_save_model.cpp b/src/callbacks/callback_save_model.cpp index cf6e90e8717..50f03174087 100644 --- a/src/callbacks/callback_save_model.cpp +++ b/src/callbacks/callback_save_model.cpp @@ -133,18 +133,23 @@ bool lbann_callback_save_model::save_model_weights(model *m) { return true; } -bool lbann_callback_save_model::load_model_weights(std::string ckpt_dir, model * m) { +bool lbann_callback_save_model::load_model_weights(std::string ckpt_dir, model * m, bool ckptdir_is_fullpath) { std::vector weight_list = std::vector(); - int epochLast = -1; - int stepLast = -1; - std::string active_ckpt_dir = get_last_shared_checkpoint_filename(m, ckpt_dir); - - // get last epoch and step saved. - int success = read_latest(active_ckpt_dir, &epochLast, &stepLast); - if(!success) { - return false; + std::string active_ckpt_dir; + if(ckptdir_is_fullpath) { + active_ckpt_dir = ckpt_dir; + }else { + int epochLast = -1; + int stepLast = -1; + active_ckpt_dir = get_last_shared_checkpoint_filename(m, ckpt_dir); + + // get last epoch and step saved. 
+ int success = read_latest(active_ckpt_dir, &epochLast, &stepLast); + if(!success) { + return false; + } + active_ckpt_dir = get_shared_checkpoint_dirname(m, ckpt_dir, epochLast, stepLast); } - active_ckpt_dir = get_shared_checkpoint_dirname(m, ckpt_dir, epochLast, stepLast); lbann_comm *comm = m->get_comm(); if(comm->am_trainer_master()) { std::cout << "Loading model weights from " << active_ckpt_dir << std::endl; From 121db1ee5aff1e4f567210fd199aa16200f14477 Mon Sep 17 00:00:00 2001 From: Daniel Matthew Merl Date: Wed, 15 May 2019 11:35:48 -0700 Subject: [PATCH 021/634] include protobuf in the spack environment --- spack_environments/developer_release_ppc64le_cuda_spack.yaml | 1 + spack_environments/developer_release_x86_64_cuda_spack.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/spack_environments/developer_release_ppc64le_cuda_spack.yaml b/spack_environments/developer_release_ppc64le_cuda_spack.yaml index 5326a77bbd3..3462e2eb90c 100644 --- a/spack_environments/developer_release_ppc64le_cuda_spack.yaml +++ b/spack_environments/developer_release_ppc64le_cuda_spack.yaml @@ -34,6 +34,7 @@ spack: - py-matplotlib - py-onnx - py-pandas + - protobuf - py-protobuf+cpp - py-setuptools - py-texttable diff --git a/spack_environments/developer_release_x86_64_cuda_spack.yaml b/spack_environments/developer_release_x86_64_cuda_spack.yaml index 6349b29ab09..801ff296d06 100644 --- a/spack_environments/developer_release_x86_64_cuda_spack.yaml +++ b/spack_environments/developer_release_x86_64_cuda_spack.yaml @@ -34,6 +34,7 @@ spack: - py-matplotlib - py-onnx - py-pandas + - protobuf - py-protobuf+cpp - py-setuptools - py-texttable From bfdf2d6be3e890ce9414fb37e9e49b81e0c25cf3 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Wed, 15 May 2019 14:51:31 -0700 Subject: [PATCH 022/634] Not related to this PR but the change is so minor that I am putting it here. This should not hurt but rather help: dumping JAG 1M test data and losses is no fun for debugging purpose --- include/lbann/callbacks/callback_dump_outputs.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/lbann/callbacks/callback_dump_outputs.hpp b/include/lbann/callbacks/callback_dump_outputs.hpp index 4df49686d87..09e6ab1fd1f 100644 --- a/include/lbann/callbacks/callback_dump_outputs.hpp +++ b/include/lbann/callbacks/callback_dump_outputs.hpp @@ -79,7 +79,11 @@ class lbann_callback_dump_outputs : public lbann_callback { std::string name() const override { return "dump outputs"; } void on_forward_prop_end(model* m, Layer* l) override { dump_outputs(*m, *l); } - void on_evaluate_forward_prop_end(model* m, Layer* l) override { dump_outputs(*m, *l); } + void on_evaluate_forward_prop_end(model* m, Layer* l) override { + if(m->get_step() % m_batch_interval == 0) { + dump_outputs(*m, *l); + } + } private: From ca6d4f92c1a1b82763b9433a9e2c8e9a3f446798 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Wed, 15 May 2019 15:56:04 -0700 Subject: [PATCH 023/634] Add support for specifying a random seed for ResNet models. 
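A usage sketch (hypothetical invocation: --random-seed is the flag added below, while the seed value and the launch command are illustrative, and the data-reader flags the script already requires are elided):

    python3 model_zoo/vision/resnet.py --random-seed 42

If the flag is omitted, the argparse default of 0 is forwarded to lbann.Model unchanged.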
--- model_zoo/vision/resnet.py | 6 +++++- python/lbann/model.py | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/model_zoo/vision/resnet.py b/model_zoo/vision/resnet.py index adda3b98c50..e8f21c89924 100755 --- a/model_zoo/vision/resnet.py +++ b/model_zoo/vision/resnet.py @@ -52,6 +52,9 @@ parser.add_argument( '--num-labels', action='store', default=1000, type=int, help='number of data classes (default: 1000)', metavar='NUM') +parser.add_argument( + '--random-seed', action='store', default=0, type=int, + help='random seed for LBANN RNGs', metavar='NUM') lbann.contrib.args.add_optimizer_arguments(parser, default_learning_rate=0.1) parser.add_argument( '--data-reader', action='store', @@ -145,7 +148,8 @@ layers=layers, objective_function=obj, metrics=metrics, - callbacks=callbacks) + callbacks=callbacks, + random_seed=args.random_seed) # Setup optimizer opt = lbann.contrib.args.create_optimizer(args) diff --git a/python/lbann/model.py b/python/lbann/model.py index 19e3f79e248..ac22b840928 100644 --- a/python/lbann/model.py +++ b/python/lbann/model.py @@ -10,7 +10,7 @@ class Model: def __init__(self, mini_batch_size, epochs, layers=[], weights=[], objective_function=None, - metrics=[], callbacks=[]): + metrics=[], callbacks=[], random_seed=None): # Scalar fields self.mini_batch_size = mini_batch_size @@ -18,6 +18,7 @@ def __init__(self, mini_batch_size, epochs, self.block_size = 256 # TODO: Make configurable self.num_parallel_readers = 0 # TODO: Make configurable self.procs_per_trainer = 0 # TODO: Make configurable + self.random_seed = random_seed # Get connected layers self.layers = list(lbann.layer.traverse_layer_graph(layers)) @@ -49,6 +50,8 @@ def export_proto(self): model.block_size = self.block_size model.num_parallel_readers = self.num_parallel_readers model.procs_per_trainer = self.procs_per_trainer + if self.random_seed is not None: + model.random_seed = self.random_seed # Add model components model.layer.extend([l.export_proto() for l in self.layers]) From 7efdc45f1231108f715b3c53d7c110af047af3ab Mon Sep 17 00:00:00 2001 From: Yosuke Oyama <17844184+oyamay@users.noreply.github.com> Date: Wed, 15 May 2019 19:28:47 -0700 Subject: [PATCH 024/634] Set +scaling_factor_int16 to 1 --- model_zoo/cosmoflow/cosmoflow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/model_zoo/cosmoflow/cosmoflow.py b/model_zoo/cosmoflow/cosmoflow.py index 085b3359b3d..b6cdd5720cf 100755 --- a/model_zoo/cosmoflow/cosmoflow.py +++ b/model_zoo/cosmoflow/cosmoflow.py @@ -178,6 +178,7 @@ def create_data_reader(train_path, val_path, test_path): validation_percent=0, absolute_sample_count=0, percent_of_data_to_use=1.0, + scaling_factor_int16=1.0, **readerArg) readers.append(reader) From 58fb6ae6959445e3db8616d937ab29119f4ac041 Mon Sep 17 00:00:00 2001 From: Yosuke Oyama <17844184+oyamay@users.noreply.github.com> Date: Wed, 15 May 2019 20:51:12 -0700 Subject: [PATCH 025/634] Update default parameters of the cosmoflow script --- model_zoo/cosmoflow/cosmoflow.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/model_zoo/cosmoflow/cosmoflow.py b/model_zoo/cosmoflow/cosmoflow.py index b6cdd5720cf..1df51412411 100755 --- a/model_zoo/cosmoflow/cosmoflow.py +++ 
b/model_zoo/cosmoflow/cosmoflow.py @@ -163,11 +163,12 @@ def create_dropout(x, i): return x def create_data_reader(train_path, val_path, test_path): - readerArgs = [ - {"role": "train", "data_filename": train_path}, - {"role": "validate", "data_filename": val_path}, - {"role": "test", "data_filename": test_path}, - ] + readerArgs = [] + for role, data_filename in [("train", train_path), + ("validate", val_path), + ("test", test_path)]: + if not data_filename is None: + readerArgs.append({"role": role, "data_filename": data_filename}) readers = [] @@ -205,10 +205,10 @@ def create_data_reader(train_path, val_path, test_path): "--learn-rate", action="store", default=0.0005, type=float, help="The initial learning-rate") parser.add_argument( - "--nodes", action="store", default=8, type=int, + "--nodes", action="store", default=32, type=int, help="The number of nodes") parser.add_argument( - "--mini-batch-size", action="store", default=32, type=int, + "--mini-batch-size", action="store", default=128, type=int, help="The mini-batch size") parser.add_argument( "--epochs", action="store", default=130, type=int, @@ -219,10 +219,12 @@ def create_data_reader(train_path, val_path, test_path): "--input-width", action="store", default=256, type=int, help="Width of input tensor") -for role, label in [("train", "training"), ("val", "validation"), ("test", "test")]: +for role, label, required in [("train", "training", True), + ("val", "validation", False), + ("test", "test", False)]: parser.add_argument( - "--{}-path".format(role), action="store", default="", type=str, - help="Path to {} dataset".format(label)) + "--{}-path".format(role), type=str, required=required, + help="Path to {} dataset".format(label), default=None) args = parser.parse_args() # ---------------------------------- From 56c37601be668dce8f8e94f8a29c44446c3e1740 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Thu, 16 May 2019 11:39:10 -0700 Subject: [PATCH 026/634] Fix how perturbation is computed as suggested in Tim's review --- src/callbacks/callback_perturb_dropout.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/callbacks/callback_perturb_dropout.cpp b/src/callbacks/callback_perturb_dropout.cpp index aa064be3954..8cb0ab17ed2 100644 --- a/src/callbacks/callback_perturb_dropout.cpp +++ b/src/callbacks/callback_perturb_dropout.cpp @@ -89,10 +89,10 @@ void lbann_callback_perturb_dropout::perturb(model& m) { if (d_dp_gpu) old_keep_prob = d_dp_gpu->get_keep_prob(); if (d_mp_gpu) old_keep_prob = d_mp_gpu->get_keep_prob(); #endif - if (m_keep_prob_factor != zero && old_keep_prob >= zero) { - auto log_val = std::log(std::max(old_keep_prob, min_val)); + if (m_keep_prob_factor > zero) { + auto log_val = std::log(one - std::max(old_keep_prob, min_val)); log_val += m_keep_prob_factor * dist(gen); - new_keep_prob = std::max(EvalType(0.5), std::min(std::exp(log_val),EvalType(1.0))); + new_keep_prob = std::max(EvalType(0.5), std::min(std::exp(one - log_val),one)); std::cout << "Trainer [ " << comm->get_trainer_rank() << " ] keep prob changed from " << old_keep_prob << " to " << new_keep_prob << std::endl; } From 32ed8125df7916d27d3d2ac5fc9cae43fd6bb1c8 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Thu, 16 May 2019 12:05:51 -0700 Subject: [PATCH 027/634] working version with imagenet. Run with one of the following: --preload_data_store --data_store_cache --use_data_store Currently data exchanges are performed using super_node mode; you do not need to specify this. 
--- .../lbann/data_store/data_store_conduit.hpp | 13 +- src/data_store/data_store_conduit.cpp | 128 ++++++++++++++++-- 2 files changed, 129 insertions(+), 12 deletions(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index d58d0a0ec17..b8dbce9982b 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -220,6 +220,7 @@ protected : /// Contains the list of data IDs that will be received std::vector m_recv_data_ids; + std::unordered_map m_recv_sample_sizes; /// contains the Nodes that this processor owns; /// maps data_id to conduit::Node @@ -235,6 +236,8 @@ protected : std::vector> m_send_requests; std::vector> m_recv_requests; std::vector m_recv_buffer; + std::vector m_recv_buffer_sample_sizes; + std::vector m_send_buffer_sample_sizes; std::vector m_outgoing_msg_sizes; std::vector m_incoming_msg_sizes; @@ -253,8 +256,13 @@ protected : /// fills in m_owner, which maps index -> owning processor void build_owner_map(int mini_batch_size); + /// for use when conduit Nodes have non-uniform size, e.g, imagenet, + /// and when running in non-super_node mode + void exchange_sample_sizes(int num_to_send, int num_to_receive); + /// maps processor id -> set of indices (whose associated samples) - /// this proc needs to send. (formerly called "proc_to_indices) + /// this proc needs to send. (formerly called "proc_to_indices); + /// this is filled in by build_indices_i_will_send() std::vector> m_indices_to_send; /// fills in m_indices_to_send and returns the number of samples @@ -274,6 +282,9 @@ protected : bool m_is_local_cache; bool m_node_sizes_vary; + + /// for use when conduit Nodes have non-uniform size, e.g, imagenet + std::unordered_map m_sample_sizes; }; } // namespace lbann diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 09eaa84075f..081eab30098 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -72,8 +72,15 @@ data_store_conduit::data_store_conduit( if (m_is_local_cache && opts->get_bool("preload_data_store")) { LBANN_ERROR("you cannot use both of these options: --data_store_cache --preload_data_store"); } + if (m_world_master) { - std::cout << "data_store_conduit is running in local_cache mode\n"; + if (m_is_local_cache) { + std::cout << "data_store_conduit is running in local_cache mode\n"; + } else if (m_super_node) { + std::cout << "data_store_conduit is running in super_node mode\n"; + } else { + std::cout << "data_store_conduit is running in multi-message mode\n"; + } } } @@ -140,11 +147,11 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: /// Move indices on the list from the data and owner maps in the RHS data store to the new data store for(auto&& i : ds_sample_move_list) { - if (m_output) { - rhs.m_output << "next ds_sample_move_list index: " << i << " is it in rhs? " << (rhs.m_data.find(i) != rhs.m_data.end()) << " rhs.m_data.size: " << rhs.m_data.size() << "\n"; - } - if(rhs.m_data.find(i) != rhs.m_data.end()){ + if (m_output) { + rhs.m_output << "moving index: " << i << " from other to myself\n"; + } + if (!m_super_node) { /// Repack the nodes because they don't seem to copy correctly build_node_for_sending(rhs.m_data[i]["data"], m_data[i]); @@ -351,7 +358,11 @@ void data_store_conduit::set_preloaded_conduit_node(int data_id, conduit::Node & m_output << "3. 
calling build_node_for_sending\n"; } build_node_for_sending(n2, m_data[data_id]); - error_check_compacted_node(m_data[data_id], data_id); + if (!m_node_sizes_vary) { + error_check_compacted_node(m_data[data_id], data_id); + } else { + m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); + } } else { if (m_data.find(data_id) == m_data.end()) { m_data[data_id] = node; @@ -415,11 +426,9 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool } else if (! m_super_node) { - if (m_output) { - m_output << "4. calling build_node_for_sending\n"; - } build_node_for_sending(node, m_data[data_id]); error_check_compacted_node(m_data[data_id], data_id); + m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); } else { @@ -536,6 +545,12 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s m_recv_buffer.resize(num_recv_req); m_recv_data_ids.resize(num_recv_req); +/* XX for ruture development + if (m_node_sizes_vary) { + exchange_sample_sizes(num_send_req, num_recv_req); + } +*/ + //======================================================================== //part 2: exchange the actual data @@ -558,7 +573,15 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s if(n.contiguous_data_ptr() == nullptr) { LBANN_ERROR("data_id: " + std::to_string(index) + " does not have a valid contiguous data pointer"); } - m_comm->nb_tagged_send(s, m_compacted_sample_size, p, index, m_send_requests[ss++], m_comm->get_trainer_comm()); + + int sz = m_compacted_sample_size; + if (m_node_sizes_vary) { + if (m_sample_sizes.find(index) == m_sample_sizes.end()) { + LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: " + std::to_string(index)); + } + sz = m_sample_sizes[index]; + } + m_comm->nb_tagged_send(s, sz, p, index, m_send_requests[ss++], m_comm->get_trainer_comm()); } } @@ -715,7 +738,7 @@ conduit::Node & data_store_conduit::get_empty_node(int data_id) { void data_store_conduit::purge_unused_samples(const std::vector& indices) { if (m_output) { - m_output << " purge_unused_samples; indices.size(): " << indices.size() << " data.size(): " << m_data.size() << std::endl; + m_output << " starting purge_unused_samples; indices.size(): " << indices.size() << " data.size(): " << m_data.size() << std::endl; } /// Remove unused indices from the data and owner maps for(auto&& i : indices) { @@ -726,6 +749,9 @@ void data_store_conduit::purge_unused_samples(const std::vector& indices) { m_owner.erase(i); } } + if (m_output) { + m_output << " leaving purge_unused_samples; indices.size(): " << indices.size() << " data.size(): " << m_data.size() << std::endl; + } } void data_store_conduit::compact_nodes() { @@ -924,6 +950,86 @@ void data_store_conduit::set_shuffled_indices(const std::vector *indices) { m_shuffled_indices = indices; } +void data_store_conduit::exchange_sample_sizes(int num_send_req, int num_recv_req) { +//for future development +#if 0 + + m_send_requests.resize(m_np_in_trainer); + m_recv_requests.resize(num_recv_req); + m_recv_buffer_sample_sizes.resize(num_recv_req); + m_recv_data_ids.resize(num_recv_req); + + // start sends for outgoing sample sizes + std::vector> outgoing(m_np_in_trainer); + size_t request_idx = 0; + for (int p=0; p &indices = m_indices_to_send[p]; + outgoing[p].reserve(m_indices_to_send.size()+1); + outgoing[p].push_back(0); + for (auto data_id : indices) { + if (m_data.find(data_id) == m_data.end()) { + LBANN_ERROR("failed to find data_id= " + std::to_string(data_id) + " in 
m_data"); + } + if (m_sample_sizes.find(data_id) == m_sample_sizes.end()) { + LBANN_ERROR("failed to find data_id= " + std::to_string(data_id) + " in m_sample_sizes"); + } + outgoing[p].push_back(m_sample_sizes[data_id]); + } + + if (m_output) { + m_output << "XX sending num samples: " << outgoing[p].size() << " to " << p << std::endl; + } + + const El::byte *s = reinterpret_cast(outgoing[p].data()); + int tag = (p+1) * -1; + m_comm->nb_tagged_send(s, sizeof(int)*outgoing[p].size(), p, tag, m_send_requests[request_idx++], m_comm->get_trainer_comm()); + } + + // sanity checks + if (request_idx!= m_send_requests.size()) { + LBANN_ERROR("request_idx!= m_send_requests.size"); + } + +m_output.close(); +MPI_Barrier(MPI_COMM_WORLD); +exit(0); + + // start recvs for incoming sample sizes + request_idx = 0; + for (int p=0; p &indices = m_indices_to_recv[p]; + for (auto index : indices) { + El::byte *s = reinterpret_cast(&m_recv_buffer_sample_sizes[request_idx]); + m_comm->nb_tagged_recv(s, sizeof(int), p, index, m_recv_requests[request_idx], m_comm->get_trainer_comm()); + m_recv_data_ids[request_idx] = index; + m_recv_sample_sizes[index] = index; + ++request_idx; + } + } + + // sanity checks + if (request_idx != m_recv_buffer.size()) { + LBANN_ERROR("request_idx != m_recv_buffer.size; request_idx: " + std::to_string(request_idx) + " m_recv_buffer.size: " + std::to_string(m_recv_buffer.size())); + } + if (m_recv_requests.size() != m_recv_buffer.size()) { + LBANN_ERROR("m_recv_requests.size != m_recv_buffer.size; m_recv_requests: " + std::to_string(m_recv_requests.size()) + " m_recv_buffer.size: " + std::to_string(m_recv_buffer.size())); + } + + // wait for all msgs to complete + m_comm->wait_all(m_send_requests); + m_comm->wait_all(m_recv_requests); + + if (m_output) { + m_output << "my incoming Node sizes (data_id, size):\n"; + for (size_t j=0; j Date: Thu, 16 May 2019 12:46:43 -0700 Subject: [PATCH 028/634] Update callback_perturb_dropout.cpp --- src/callbacks/callback_perturb_dropout.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/callbacks/callback_perturb_dropout.cpp b/src/callbacks/callback_perturb_dropout.cpp index 8cb0ab17ed2..36390729e6d 100644 --- a/src/callbacks/callback_perturb_dropout.cpp +++ b/src/callbacks/callback_perturb_dropout.cpp @@ -92,7 +92,7 @@ void lbann_callback_perturb_dropout::perturb(model& m) { if (m_keep_prob_factor > zero) { auto log_val = std::log(one - std::max(old_keep_prob, min_val)); log_val += m_keep_prob_factor * dist(gen); - new_keep_prob = std::max(EvalType(0.5), std::min(std::exp(one - log_val),one)); + new_keep_prob = std::max(EvalType(0.5), std::min(one - std::exp(log_val),one)); std::cout << "Trainer [ " << comm->get_trainer_rank() << " ] keep prob changed from " << old_keep_prob << " to " << new_keep_prob << std::endl; } From 4f06d9d67ee2a662a8f019dc56325dbb88d33bbe Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Fri, 17 May 2019 08:06:28 -0700 Subject: [PATCH 029/634] added mutex to guard against multiple threads writing to unordered_map in set_conduit_node(). 
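For comparison, the same guard expressed with RAII, as a minimal sketch (not part of the patch; std::string stands in for conduit::Node and the class is a stripped-down stand-in for data_store_conduit). The diff below instead calls m_mutex.lock()/m_mutex.unlock() explicitly, one unlock per return path:

    #include <mutex>
    #include <string>
    #include <unordered_map>

    class store_sketch {
    public:
      void set_conduit_node(int data_id, const std::string &node) {
        // lock_guard releases the mutex on every return path automatically
        std::lock_guard<std::mutex> guard(m_mutex);
        if (m_data.find(data_id) != m_data.end()) {
          return; // duplicate id; guard still unlocks here
        }
        m_data[data_id] = node;
      }
    private:
      std::mutex m_mutex;
      std::unordered_map<int, std::string> m_data;
    };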
--- include/lbann/data_store/data_store_conduit.hpp | 3 +++ src/data_store/data_store_conduit.cpp | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index b8dbce9982b..611af9fe8f0 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -37,6 +37,7 @@ #include "conduit/conduit_node.hpp" #include #include +#include <mutex> namespace lbann { @@ -285,6 +286,8 @@ protected : /// for use when conduit Nodes have non-uniform size, e.g, imagenet std::unordered_map m_sample_sizes; + + std::mutex m_mutex; }; } // namespace lbann diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 081eab30098..8598434ea7e 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -400,6 +400,7 @@ void data_store_conduit::error_check_compacted_node(const conduit::Node &nd, int void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool already_have) { + m_mutex.lock(); if (already_have == false && m_data.find(data_id) != m_data.end()) { LBANN_ERROR("duplicate data_id: " + std::to_string(data_id) + " in data_store_conduit::set_conduit_node"); } @@ -412,11 +413,13 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool if (m_data.find(data_id) == m_data.end()) { LBANN_ERROR("you claim the passed node was obtained from this data_store, but the data_id (" + std::to_string(data_id) + ") doesn't exist in m_data"); } + m_mutex.unlock(); return; } if (is_local_cache()) { m_data[data_id] = node; + m_mutex.unlock(); } else if (m_owner[data_id] != m_rank_in_trainer) { @@ -429,10 +432,12 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool build_node_for_sending(node, m_data[data_id]); error_check_compacted_node(m_data[data_id], data_id); m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); + m_mutex.unlock(); } else { m_data[data_id] = node; + m_mutex.unlock(); // @TODO would like to do: m_data[data_id].set_external(node); but since // (as of now) 'node' is a local variable in a data_reader+jag_conduit, // we need to do a deep copy. If the data_store furnishes a node to the From 52fe58f154732ba67bb3d8eab6f458e734a6fb80 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Fri, 17 May 2019 16:11:30 -0700 Subject: [PATCH 030/634] modification to optionally use either exchange_data_by_sample() or exchange_data_by_super_node(). The default is now exchange_data_by_sample(). 
You can optionally add to the cmd line: --super_node --- include/lbann/data_readers/data_reader.hpp | 2 + .../lbann/data_store/data_store_conduit.hpp | 8 +- src/data_readers/data_reader_image.cpp | 1 - src/data_store/data_store_conduit.cpp | 153 ++++++++---------- 4 files changed, 71 insertions(+), 93 deletions(-) diff --git a/include/lbann/data_readers/data_reader.hpp b/include/lbann/data_readers/data_reader.hpp index 2b1e9f16f74..f5a8a4fe359 100644 --- a/include/lbann/data_readers/data_reader.hpp +++ b/include/lbann/data_readers/data_reader.hpp @@ -720,6 +720,8 @@ class generic_data_reader : public lbann_image_preprocessor { void set_model(model *m) { m_model = m; } + model * get_model() const { return m_model; } + /// experimental; used to ensure all readers for jag_conduit_hdf5 /// have identical shuffled indices virtual void post_update() {} diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 611af9fe8f0..f2fbf810ea8 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -113,7 +113,7 @@ class data_store_conduit { /// As of this writing, will be called if cmd line includes: --preload_data_store /// This may change in the future; TODO revisit - void set_preload() { m_preload = true; } + void set_preload(); bool is_preloaded() { return m_preload; } @@ -259,7 +259,7 @@ protected : /// for use when conduit Nodes have non-uniform size, e.g, imagenet, /// and when running in non-super_node mode - void exchange_sample_sizes(int num_to_send, int num_to_receive); + void exchange_sample_sizes(); /// maps processor id -> set of indices (whose associated samples) /// this proc needs to send. (formerly called "proc_to_indices); @@ -287,7 +287,11 @@ protected : /// for use when conduit Nodes have non-uniform size, e.g, imagenet std::unordered_map m_sample_sizes; + /// used in set_conduit_node(...) std::mutex m_mutex; + + /// used in exchange_data_by_sample, when sample sizes are non-uniform + bool m_have_sample_sizes; }; } // namespace lbann diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index 0935e37f4df..ea2a904625b 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -189,7 +189,6 @@ void image_data_reader::load() { std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); opts->set_option("node_sizes_vary", 1); - opts->set_option("super_node", 1); instantiate_data_store(local_list_sizes); select_subset_of_data(); diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 8598434ea7e..5eceb26a769 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -48,7 +48,8 @@ data_store_conduit::data_store_conduit( m_super_node(false), m_compacted_sample_size(0), m_is_local_cache(false), - m_node_sizes_vary(false) { + m_node_sizes_vary(false), + m_have_sample_sizes(false) { m_comm = m_reader->get_comm(); if (m_comm == nullptr) { LBANN_ERROR(" m_comm is nullptr"); @@ -108,7 +109,6 @@ data_store_conduit& data_store_conduit::operator=(const data_store_conduit& rhs) } void data_store_conduit::set_role(const std::string role) { - if (options::get()->get_bool("debug")) { std::stringstream ss; ss << "debug_" << m_reader->get_role() << "." 
<< m_comm->get_rank_in_world(); @@ -134,6 +134,7 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: m_compacted_sample_size = rhs.m_compacted_sample_size; m_is_local_cache = rhs.m_is_local_cache; m_node_sizes_vary = rhs.m_node_sizes_vary; + m_sample_sizes = rhs.m_sample_sizes; /// This block needed when carving a validation set from the training set if (options::get()->get_bool("debug") && !m_output) { @@ -260,9 +261,6 @@ void data_store_conduit::exchange_data_by_super_node(size_t current_pos, size_t for (auto idx : m_indices_to_send[p]) { m_send_buffer[p].update_external(m_data[idx]); } - if (m_output) { - m_output << "2. calling build_node_for_sending\n"; - } build_node_for_sending(m_send_buffer[p], m_send_buffer_2[p]); } @@ -354,9 +352,6 @@ void data_store_conduit::set_preloaded_conduit_node(int data_id, conduit::Node & m_output << "set_preloaded_conduit_node: " << data_id << " for non-super_node mode\n"; } conduit::Node n2 = node; - if (m_output) { - m_output << "3. calling build_node_for_sending\n"; - } build_node_for_sending(n2, m_data[data_id]); if (!m_node_sizes_vary) { error_check_compacted_node(m_data[data_id], data_id); @@ -538,6 +533,15 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s LBANN_ERROR("setup(mb_size) has not been called"); } + /// exchange sample sizes if they are non-uniform (imagenet); + /// this will only be called once, during the first call to + /// exchange_data_by_sample at the beginning of the 2nd epoch, + /// or during the first call to exchange_data_by_sample() during + /// the first epoch if preloading + if (m_node_sizes_vary && !m_have_sample_sizes) { + exchange_sample_sizes(); + } + if (m_output) { m_output << "starting data_store_conduit::exchange_data_by_sample; mb_size: " << mb_size << std::endl; } @@ -550,12 +554,6 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s m_recv_buffer.resize(num_recv_req); m_recv_data_ids.resize(num_recv_req); -/* XX for ruture development - if (m_node_sizes_vary) { - exchange_sample_sizes(num_send_req, num_recv_req); - } -*/ - //======================================================================== //part 2: exchange the actual data @@ -580,12 +578,18 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s } int sz = m_compacted_sample_size; + if (m_node_sizes_vary) { if (m_sample_sizes.find(index) == m_sample_sizes.end()) { - LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: " + std::to_string(index)); + LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: " + std::to_string(index) + "; m_sample_sizes.size: " + std::to_string(m_sample_sizes.size())); } sz = m_sample_sizes[index]; } + + if (m_output) { + m_output << "sending " << index << " size: " << sz << " to " << p << std::endl; + } + m_comm->nb_tagged_send(s, sz, p, index, m_send_requests[ss++], m_comm->get_trainer_comm()); } } @@ -597,12 +601,22 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s // start recvs for incoming data ss = 0; + for (int p=0; p<m_np_in_trainer; p++) { const std::unordered_set<int> &indices = m_indices_to_recv[p]; for (auto index : indices) { - m_recv_buffer[ss].set(conduit::DataType::uint8(m_compacted_sample_size)); + + int sz = m_compacted_sample_size; + if (m_node_sizes_vary) { + if (m_sample_sizes.find(index) == m_sample_sizes.end()) { + LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: " + std::to_string(index) + "; m_sample_sizes.size(): " + 
std::to_string(m_sample_sizes.size()) + " role: " + m_reader->get_role()); + } + sz = m_sample_sizes[index]; + } + + m_recv_buffer[ss].set(conduit::DataType::uint8(sz)); El::byte *r = reinterpret_cast(m_recv_buffer[ss].data_ptr()); - m_comm->nb_tagged_recv(r, m_compacted_sample_size, p, index, m_recv_requests[ss], m_comm->get_trainer_comm()); + m_comm->nb_tagged_recv(r, sz, p, index, m_recv_requests[ss], m_comm->get_trainer_comm()); m_recv_data_ids[ss] = index; ++ss; } @@ -693,9 +707,6 @@ void data_store_conduit::build_preloaded_owner_map(const std::vector& per_r per_rank_list_range_start += per_rank_list_size; } m_owner[i] = owning_rank; - if (m_output) { - m_output << "m_owner[" << i << "] = " << owning_rank << std::endl; - } } } @@ -776,9 +787,6 @@ void data_store_conduit::compact_nodes() { /// Repack the nodes because they don't seem to copy correctly conduit::Node node = m_data[j]["data"]; m_data.erase(j); - if (m_output) { - m_output << "5. calling build_node_for_sending\n"; - } build_node_for_sending(node, m_data[j]); } } @@ -955,84 +963,49 @@ void data_store_conduit::set_shuffled_indices(const std::vector *indices) { m_shuffled_indices = indices; } -void data_store_conduit::exchange_sample_sizes(int num_send_req, int num_recv_req) { -//for future development -#if 0 - - m_send_requests.resize(m_np_in_trainer); - m_recv_requests.resize(num_recv_req); - m_recv_buffer_sample_sizes.resize(num_recv_req); - m_recv_data_ids.resize(num_recv_req); +void data_store_conduit::exchange_sample_sizes() { + if (m_output) { + m_output << "starting data_store_conduit::exchange_sample_sizes" << std::endl; + } - // start sends for outgoing sample sizes - std::vector> outgoing(m_np_in_trainer); - size_t request_idx = 0; - for (int p=0; p &indices = m_indices_to_send[p]; - outgoing[p].reserve(m_indices_to_send.size()+1); - outgoing[p].push_back(0); - for (auto data_id : indices) { - if (m_data.find(data_id) == m_data.end()) { - LBANN_ERROR("failed to find data_id= " + std::to_string(data_id) + " in m_data"); - } - if (m_sample_sizes.find(data_id) == m_sample_sizes.end()) { - LBANN_ERROR("failed to find data_id= " + std::to_string(data_id) + " in m_sample_sizes"); - } - outgoing[p].push_back(m_sample_sizes[data_id]); - } + int my_count = m_sample_sizes.size(); + std::vector all_counts(m_np_in_trainer); + m_comm->all_gather(&my_count, 1, all_counts.data(), 1, m_comm->get_trainer_comm()); - if (m_output) { - m_output << "XX sending num samples: " << outgoing[p].size() << " to " << p << std::endl; + if (m_output) { + for (size_t h=0; h(outgoing[p].data()); - int tag = (p+1) * -1; - m_comm->nb_tagged_send(s, sizeof(int)*outgoing[p].size(), p, tag, m_send_requests[request_idx++], m_comm->get_trainer_comm()); } - // sanity checks - if (request_idx!= m_send_requests.size()) { - LBANN_ERROR("request_idx!= m_send_requests.size"); + std::vector my_sizes(m_sample_sizes.size()*2); + size_t j = 0; + for (auto t : m_sample_sizes) { + my_sizes[j++] = t.first; + my_sizes[j++] = t.second; } -m_output.close(); -MPI_Barrier(MPI_COMM_WORLD); -exit(0); - - // start recvs for incoming sample sizes - request_idx = 0; - for (int p=0; p &indices = m_indices_to_recv[p]; - for (auto index : indices) { - El::byte *s = reinterpret_cast(&m_recv_buffer_sample_sizes[request_idx]); - m_comm->nb_tagged_recv(s, sizeof(int), p, index, m_recv_requests[request_idx], m_comm->get_trainer_comm()); - m_recv_data_ids[request_idx] = index; - m_recv_sample_sizes[index] = index; - ++request_idx; + std::vector other_sizes; + for (int k=0; 
k<m_np_in_trainer; k++) { + other_sizes.resize(all_counts[k]*2); + if (m_rank_in_trainer == k) { + m_comm->broadcast(k, my_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); + } else { + m_comm->broadcast(k, other_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); + for (size_t i=0; i<other_sizes.size(); i += 2) { + m_sample_sizes[other_sizes[i]] = other_sizes[i+1]; + } + } + } - m_comm->wait_all(m_send_requests); - m_comm->wait_all(m_recv_requests); + m_have_sample_sizes = true; +} - if (m_output) { - m_output << "my incoming Node sizes (data_id, size):\n"; - for (size_t j=0; j<m_recv_sample_sizes.size(); j++) { - m_output << "(" << m_recv_data_ids[j] << "," << m_recv_sample_sizes[j] << ")\n"; - } - } -#endif -} From: "David A. Hysom" Date: Tue, 21 May 2019 09:27:18 -0700 Subject: [PATCH 031/634] removing "#ifdef LBANN_HAS_CONDUIT" guards, since conduit is now required for building lbann. --- include/lbann/data_readers/data_reader_jag_conduit.hpp | 2 -- include/lbann/data_store/data_store_conduit.hpp | 3 --- model_zoo/tests/conduit_timing_test.cpp | 3 --- src/data_readers/data_reader_jag_conduit.cpp | 2 -- src/data_store/data_store_conduit.cpp | 3 --- src/proto/factories/layer_factory.cpp | 6 ------ src/proto/init_image_data_readers.cpp | 2 -- src/proto/proto_common.cpp | 10 ---------- 8 files changed, 31 deletions(-) diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index 0938fa79438..d947ae0a766 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -29,7 +29,6 @@ #include "lbann_config.hpp" // may define LBANN_HAS_CONDUIT -#ifdef LBANN_HAS_CONDUIT #include "lbann/data_readers/opencv.hpp" #include "data_reader.hpp" #include "conduit/conduit.hpp" @@ -602,5 +601,4 @@ inline size_t data_reader_jag_conduit::add_val(const std::string key, const cond } } // end of namespace lbann -#endif // LBANN_HAS_CONDUIT #endif // _DATA_READER_JAG_CONDUIT_HPP_ diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index f2fbf810ea8..97e7e5cd997 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -30,8 +30,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "lbann/base.hpp" #include "lbann/comm.hpp" #include "conduit/conduit_node.hpp" @@ -296,6 +294,5 @@ protected : } // namespace lbann -#endif //#ifdef LBANN_HAS_CONDUIT #endif // __DATA_STORE_JAG_HPP__ diff --git a/model_zoo/tests/conduit_timing_test.cpp b/model_zoo/tests/conduit_timing_test.cpp index a37cee28f54..a8b4b397a6a 100644 --- a/model_zoo/tests/conduit_timing_test.cpp +++ b/model_zoo/tests/conduit_timing_test.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -274,4 +272,3 @@ void test_conduit_3(int from, int to, std::vector<std::string> filenames) { std::cerr << " time to load entire (images) sample: " << n1 << "\n"; std::cerr << " time to access image values: " << n2 << "\n"; } -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index b9214a680aa..a58256ee250 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -31,7 +31,6 @@ #include "lbann/models/model.hpp" #include "lbann/utils/lbann_library.hpp" -#ifdef LBANN_HAS_CONDUIT #include "lbann/utils/file_utils.hpp" // for add_delimiter() in load() #include "lbann/data_readers/opencv_extensions.hpp" #include <limits> // numeric_limits @@ -1648,4 +1647,3 @@ void data_reader_jag_conduit::add_input_normalization_param(const data_reader_ja } // end of namespace lbann #undef _CN_ -#endif // 
LBANN_HAS_CONDUIT diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 5eceb26a769..8bb95e29772 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -27,8 +27,6 @@ #include "lbann/data_store/data_store_conduit.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "lbann/data_readers/data_reader_jag_conduit.hpp" #include "lbann/utils/exception.hpp" #include "lbann/utils/options.hpp" @@ -1010,4 +1008,3 @@ void data_store_conduit::set_preload() { } // namespace lbann -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 2521314d2ae..8f73ac23c2e 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -80,7 +80,6 @@ std::unique_ptr construct_layer( if (params.get_num_neurons_of_slice_from_reader_size() > 0) { num_neurons_method_name = "get_num_neurons_of_slice_from_reader"; - #if defined(LBANN_HAS_CONDUIT) const auto dr_generic = lbann::peek_map(data_readers, execution_mode::training); const int num_slice_indices = params.get_num_neurons_of_slice_from_reader_size(); if (dynamic_cast(dr_generic) != nullptr) { @@ -97,7 +96,6 @@ std::unique_ptr construct_layer( num_neurons += diff; } } - #endif // defined(LBANN_HAS_CONDUIT) } else { num_neurons_method_name = "num_neurons"; num_neurons = params.num_neurons(); @@ -240,11 +238,9 @@ std::unique_ptr construct_layer( if (params.get_slice_points_from_reader() != "") { slice_point_method_name = "'get_slice_points_from_reader'"; - #if defined(LBANN_HAS_CONDUIT) const auto dr_generic = lbann::peek_map(data_readers, execution_mode::training); const std::string& var = params.get_slice_points_from_reader(); slice_points = get_slice_points_from_reader(dr_generic, var, is_supported); - #endif // defined(LBANN_HAS_CONDUIT) } else { slice_point_method_name = "'slice_points'"; slice_points = parse_list(params.slice_points()); @@ -647,7 +643,6 @@ std::vector get_slice_points_from_reader(const generic_data_reader* dr_ bool& is_supported) { std::vector slice_points; is_supported = false; -#if defined(LBANN_HAS_CONDUIT) // TODO: remove the dynamic cast when this feature gets merged into the base class const auto dr = dynamic_cast(dr_generic); @@ -662,7 +657,6 @@ std::vector get_slice_points_from_reader(const generic_data_reader* dr_ + "\". 
Must be either \"independent\" or \"dependent\"."); } } -#endif return slice_points; } diff --git a/src/proto/init_image_data_readers.cpp b/src/proto/init_image_data_readers.cpp index 65a13c52d80..511ddaab32e 100644 --- a/src/proto/init_image_data_readers.cpp +++ b/src/proto/init_image_data_readers.cpp @@ -353,7 +353,6 @@ void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_dat reader = new data_reader_multi_images(pp, shuffle); } else if (name == "moving_mnist") { reader = new moving_mnist_reader(7, 40, 40, 2); -#ifdef LBANN_HAS_CONDUIT } else if (name =="jag_conduit") { data_reader_jag_conduit* reader_jag = new data_reader_jag_conduit(pp, shuffle); const lbann_data::DataSetMetaData::Schema& pb_schema = pb_metadata.schema(); @@ -513,7 +512,6 @@ void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_dat reader = reader_jag; if (master) std::cout << reader->get_type() << " is set" << std::endl; return; -#endif // LBANN_HAS_CONDUIT } if (channels == 0) { diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index b9166ec27fb..e88f2f5b648 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -55,9 +55,7 @@ void init_data_readers( bool is_shareable_testing_data_reader, bool is_shareable_validation_data_reader) { -#ifdef LBANN_HAS_CONDUIT static std::unordered_map leading_reader_jag_conduit; -#endif const bool master = comm->am_world_master(); std::ostringstream err; @@ -141,7 +139,6 @@ void init_data_readers( reader_jag->set_normalization_mode(pb_preproc.early_normalization()); reader = reader_jag; set_up_generic_preprocessor = false; -#ifdef LBANN_HAS_CONDUIT } else if (name == "jag_conduit") { init_image_data_reader(readme, pb_metadata, master, reader); auto reader_jag_conduit = dynamic_cast(reader); @@ -181,7 +178,6 @@ void init_data_readers( } else if (name == "jag_conduit_hdf5") { init_image_data_reader(readme, pb_metadata, master, reader); set_up_generic_preprocessor = false; -#endif // LBANN_HAS_CONDUIT } else if (name == "nci") { reader = new data_reader_nci(shuffle); } else if (name == "csv") { @@ -242,12 +238,10 @@ void init_data_readers( reader_numpy_npz->set_has_responses(!readme.disable_responses()); reader_numpy_npz->set_scaling_factor_int16(readme.scaling_factor_int16()); npy_readers.push_back(reader_numpy_npz); -#ifdef LBANN_HAS_CONDUIT } else if (readme.format() == "jag_conduit") { init_image_data_reader(readme, pb_metadata, master, reader); set_up_generic_preprocessor = false; npy_readers.push_back(reader); -#endif } else if (readme.format() == "pilot2_molecular_reader") { pilot2_molecular_reader* reader_pilot2_molecular = new pilot2_molecular_reader(readme.num_neighbors(), readme.max_neighborhood(), shuffle); reader_pilot2_molecular->set_data_filename(path); @@ -443,7 +437,6 @@ void init_data_readers( } else if (name == "jag") { reader_validation = new data_reader_jag(shuffle); *dynamic_cast(reader_validation) = *dynamic_cast(reader); -#ifdef LBANN_HAS_CONDUIT } else if (name == "jag_conduit") { /// If the training data reader was shared and the validate reader is split from it, then the validation data reader /// is also shared @@ -473,7 +466,6 @@ void init_data_readers( reader_jag_conduit->set_role(role); leading_reader_jag_conduit[role] = reader_jag_conduit; } -#endif // LBANN_HAS_CONDUIT } else if (name == "nci") { reader_validation = new data_reader_nci(shuffle); (*(data_reader_nci *)reader_validation) = (*(data_reader_nci *)reader); @@ -531,12 +523,10 @@ void init_data_readers( double 
train_percent = ((double) num_train / (double) (num_train+num_validate))*100.0; std::cout << "Training using " << train_percent << "% of the training data set, which is " << reader->get_num_data() << " samples." << std::endl << "Validating training using " << validate_percent << "% of the training data set, which is " << reader_validation->get_num_data() << " samples."; -#ifdef LBANN_HAS_CONDUIT if (name == "jag_conduit") { std::cout << " jag conduit leading reader " << dynamic_cast(reader)->get_leading_reader() << " of " << (is_shareable_training_data_reader? "shared" : "unshared") << " reader " << reader << " for " << reader->get_role() << std::endl; } -#endif // LBANN_HAS_CONDUIT std::cout << std::endl; } From 4b230f430a0a5ad02224be79f5d72cb1f18404de Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Tue, 21 May 2019 10:00:16 -0700 Subject: [PATCH 032/634] removing additional HAS_CONDUIT guards. --- CMakeLists.txt | 2 -- cmake/configure_files/LBANNConfig.cmake.in | 1 - cmake/configure_files/lbann_config.hpp.in | 1 - cmake/configure_files/lbann_module.lua.in | 2 -- include/lbann/data_readers/data_reader_jag_conduit.hpp | 2 +- model_zoo/jag_utils/CMakeLists.txt | 3 --- model_zoo/jag_utils/build_index.cpp | 3 --- model_zoo/jag_utils/check_for_duplicate_samples.cpp | 3 --- model_zoo/jag_utils/check_images.cpp | 3 --- model_zoo/jag_utils/compute_min_max_images.cpp | 4 ---- .../jag_utils/compute_per_channel_image_avg_min_max.cpp | 5 ----- model_zoo/jag_utils/detect_corruption.cpp | 3 --- model_zoo/jag_utils/dump_bundle.cpp | 3 --- model_zoo/jag_utils/extract_random_samples.cpp | 3 --- model_zoo/jag_utils/generate_corrupt_samples.cpp | 3 --- model_zoo/jag_utils/load_balance.cpp | 3 --- model_zoo/jag_utils/load_bundle2raw.cpp | 4 ---- model_zoo/jag_utils/test_conduit_hdf5.cpp | 3 --- model_zoo/jag_utils/test_conduit_with_mpi.cpp | 3 --- model_zoo/jag_utils/test_mpi.cpp | 3 --- 20 files changed, 1 insertion(+), 56 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4dfb77a0e19..72062f30c9a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -430,7 +430,6 @@ if (LBANN_WITH_CONDUIT) "${_conduit_interface_link_libs}") set(CONDUIT_LIBRARIES conduit::conduit) - set(LBANN_HAS_CONDUIT ${Conduit_FOUND}) endif (LBANN_WITH_CONDUIT) if (LBANN_WITH_UNIT_TESTING) @@ -747,7 +746,6 @@ append_str_tf(_str LBANN_HAS_DOXYGEN LBANN_HAS_LBANN_PROTO LBANN_HAS_ALUMINUM - LBANN_HAS_CONDUIT LBANN_HAS_PYTHON) string(APPEND _str "\n== End LBANN Configuration Summary ==\n") diff --git a/cmake/configure_files/LBANNConfig.cmake.in b/cmake/configure_files/LBANNConfig.cmake.in index 2ac6ed91a9f..e55515437d3 100644 --- a/cmake/configure_files/LBANNConfig.cmake.in +++ b/cmake/configure_files/LBANNConfig.cmake.in @@ -37,7 +37,6 @@ set(LBANN_GNU_LINUX @LBANN_GNU_LINUX@) set(LBANN_HAS_ALUMINUM @LBANN_HAS_ALUMINUM@) set(LBANN_HAS_CEREAL @LBANN_HAS_CEREAL@) set(LBANN_HAS_CNPY @LBANN_HAS_CNPY@) -set(LBANN_HAS_CONDUIT @LBANN_HAS_CONDUIT@) set(LBANN_HAS_CUDA @LBANN_HAS_CUDA@) set(LBANN_HAS_CUDNN @LBANN_HAS_CUDNN@) set(LBANN_HAS_DOXYGEN @LBANN_HAS_DOXYGEN@) diff --git a/cmake/configure_files/lbann_config.hpp.in b/cmake/configure_files/lbann_config.hpp.in index 76b50bc920c..08306a8d10f 100644 --- a/cmake/configure_files/lbann_config.hpp.in +++ b/cmake/configure_files/lbann_config.hpp.in @@ -30,7 +30,6 @@ #cmakedefine LBANN_HAS_VTUNE #cmakedefine LBANN_HAS_ALUMINUM #cmakedefine LBANN_ALUMINUM_MPI_PASSTHROUGH -#cmakedefine LBANN_HAS_CONDUIT #cmakedefine LBANN_HAS_PYTHON #cmakedefine LBANN_DETERMINISTIC diff --git 
a/cmake/configure_files/lbann_module.lua.in b/cmake/configure_files/lbann_module.lua.in index 754d2c6106d..e6ea77453ae 100644 --- a/cmake/configure_files/lbann_module.lua.in +++ b/cmake/configure_files/lbann_module.lua.in @@ -22,7 +22,6 @@ -- LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@ -- LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@ -- LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@ --- LBANN_HAS_CONDUIT: @LBANN_HAS_CONDUIT@ -- LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@ help( @@ -58,7 +57,6 @@ whatis("LBANN_NVPROF: @LBANN_NVPROF@") whatis("LBANN_HAS_DOXYGEN: @LBANN_HAS_DOXYGEN@") whatis("LBANN_HAS_LBANN_PROTO: @LBANN_HAS_LBANN_PROTO@") whatis("LBANN_HAS_ALUMINUM: @LBANN_HAS_ALUMINUM@") -whatis("LBANN_HAS_CONDUIT: @LBANN_HAS_CONDUIT@") whatis("LBANN_HAS_PYTHON: @LBANN_HAS_PYTHON@") prepend_path("PATH","@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@") diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index d947ae0a766..7d91c89c300 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -27,7 +27,7 @@ #ifndef _DATA_READER_JAG_CONDUIT_HPP_ #define _DATA_READER_JAG_CONDUIT_HPP_ -#include "lbann_config.hpp" // may define LBANN_HAS_CONDUIT +#include "lbann_config.hpp" #include "lbann/data_readers/opencv.hpp" #include "data_reader.hpp" diff --git a/model_zoo/jag_utils/CMakeLists.txt b/model_zoo/jag_utils/CMakeLists.txt index 115b5d06ff0..9030bde2243 100644 --- a/model_zoo/jag_utils/CMakeLists.txt +++ b/model_zoo/jag_utils/CMakeLists.txt @@ -1,5 +1,3 @@ -if (LBANN_HAS_CONDUIT) - add_executable( build_index-bin build_index.cpp ) target_link_libraries(build_index-bin lbann ) set_target_properties(build_index-bin PROPERTIES OUTPUT_NAME build_index) @@ -56,4 +54,3 @@ if (LBANN_HAS_CONDUIT) target_link_libraries(generate_corrupt_samples-bin lbann ) set_target_properties(generate_corrupt_samples-bin PROPERTIES OUTPUT_NAME generate_corrupt_samples) -endif () diff --git a/model_zoo/jag_utils/build_index.cpp b/model_zoo/jag_utils/build_index.cpp index 9a92153b940..9af7a2f6d6e 100644 --- a/model_zoo/jag_utils/build_index.cpp +++ b/model_zoo/jag_utils/build_index.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -210,4 +208,3 @@ if (j >= 400) break; return EXIT_SUCCESS; } -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/check_for_duplicate_samples.cpp b/model_zoo/jag_utils/check_for_duplicate_samples.cpp index 553f7aaa4be..fc33898a401 100644 --- a/model_zoo/jag_utils/check_for_duplicate_samples.cpp +++ b/model_zoo/jag_utils/check_for_duplicate_samples.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -159,4 +157,3 @@ void get_input_names(std::unordered_set &s) { s.insert("shape_model_initial_modes:(1,0)"); } -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/check_images.cpp b/model_zoo/jag_utils/check_images.cpp index 29dc779fdf3..436df059b7e 100644 --- a/model_zoo/jag_utils/check_images.cpp +++ b/model_zoo/jag_utils/check_images.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -136,4 +134,3 @@ int main(int argc, 
char *argv[]) { // Clean up return EXIT_SUCCESS; } -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/compute_min_max_images.cpp b/model_zoo/jag_utils/compute_min_max_images.cpp index 42167e082ac..080c01110f0 100644 --- a/model_zoo/jag_utils/compute_min_max_images.cpp +++ b/model_zoo/jag_utils/compute_min_max_images.cpp @@ -27,7 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" @@ -237,6 +236,3 @@ std::cerr << rank << " :: opening for reading: " << files[j] << "\n"; return EXIT_SUCCESS; } - - -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/compute_per_channel_image_avg_min_max.cpp b/model_zoo/jag_utils/compute_per_channel_image_avg_min_max.cpp index 8a5745c1a29..0fcbea00804 100644 --- a/model_zoo/jag_utils/compute_per_channel_image_avg_min_max.cpp +++ b/model_zoo/jag_utils/compute_per_channel_image_avg_min_max.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -243,6 +241,3 @@ std::cerr << rank << " :: opening for reading: " << files[j] << "\n"; return EXIT_SUCCESS; } - - -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/detect_corruption.cpp b/model_zoo/jag_utils/detect_corruption.cpp index b42b67271b3..0f1d8b02590 100644 --- a/model_zoo/jag_utils/detect_corruption.cpp +++ b/model_zoo/jag_utils/detect_corruption.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -259,4 +257,3 @@ void print_errs(world_comm_ptr &comm, int np, int rank, std::ostringstream &s, c } comm->global_barrier(); } -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/dump_bundle.cpp b/model_zoo/jag_utils/dump_bundle.cpp index 7191a65fa96..aa62c2510e7 100644 --- a/model_zoo/jag_utils/dump_bundle.cpp +++ b/model_zoo/jag_utils/dump_bundle.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -61,4 +59,3 @@ int main(int argc, char *argv[]) { return EXIT_SUCCESS; } -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/extract_random_samples.cpp b/model_zoo/jag_utils/extract_random_samples.cpp index 7937183aa68..ef636db2741 100644 --- a/model_zoo/jag_utils/extract_random_samples.cpp +++ b/model_zoo/jag_utils/extract_random_samples.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -476,4 +474,3 @@ void print_sample_ids( } std::cerr << "\n==========================================\n"; } -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/generate_corrupt_samples.cpp b/model_zoo/jag_utils/generate_corrupt_samples.cpp index 3a2181ea6d8..e145e1cfb40 100644 --- a/model_zoo/jag_utils/generate_corrupt_samples.cpp +++ b/model_zoo/jag_utils/generate_corrupt_samples.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay_io_handle.hpp" #include @@ -161,4 +159,3 @@ int main(int argc, char *argv[]) { out.close(); std::cout << "\nMade directory 'corrupt_jag_samples/' and wrote files in that directory\n\n"; } -#endif 
//#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/load_balance.cpp b/model_zoo/jag_utils/load_balance.cpp index 5a3403bae68..c025c918bce 100644 --- a/model_zoo/jag_utils/load_balance.cpp +++ b/model_zoo/jag_utils/load_balance.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -213,4 +211,3 @@ int main(int argc, char *argv[]) { return EXIT_SUCCESS; } -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/load_bundle2raw.cpp b/model_zoo/jag_utils/load_bundle2raw.cpp index 703a39edafd..772afefa385 100644 --- a/model_zoo/jag_utils/load_bundle2raw.cpp +++ b/model_zoo/jag_utils/load_bundle2raw.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -256,5 +254,3 @@ void get_scalar_names(std::vector &s) { s.push_back("MINradius"); } - -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/test_conduit_hdf5.cpp b/model_zoo/jag_utils/test_conduit_hdf5.cpp index 3c2d6955bfa..cd157f8ca7e 100644 --- a/model_zoo/jag_utils/test_conduit_hdf5.cpp +++ b/model_zoo/jag_utils/test_conduit_hdf5.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -161,4 +159,3 @@ void get_image_names(std::unordered_set &s) { s.insert("(90.0, 78.0)//0.0/emi"); } -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/test_conduit_with_mpi.cpp b/model_zoo/jag_utils/test_conduit_with_mpi.cpp index bb86c0708f9..d7c7caf34d8 100644 --- a/model_zoo/jag_utils/test_conduit_with_mpi.cpp +++ b/model_zoo/jag_utils/test_conduit_with_mpi.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_hdf5.hpp" @@ -92,4 +90,3 @@ int main(int argc, char *argv[]) { #endif //if 0 } -#endif //#ifdef LBANN_HAS_CONDUIT diff --git a/model_zoo/jag_utils/test_mpi.cpp b/model_zoo/jag_utils/test_mpi.cpp index 92a78ef4f1d..a3425eafd95 100644 --- a/model_zoo/jag_utils/test_mpi.cpp +++ b/model_zoo/jag_utils/test_mpi.cpp @@ -27,8 +27,6 @@ #include "lbann_config.hpp" -#ifdef LBANN_HAS_CONDUIT - #include "conduit/conduit.hpp" #include "conduit/conduit_relay.hpp" #include "conduit/conduit_relay_io_hdf5.hpp" @@ -94,4 +92,3 @@ int main(int argc, char *argv[]) { #endif //if 0 } -#endif //#ifdef LBANN_HAS_CONDUIT From d449d06de563f226b9648902ea4b84d3b279e84d Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Tue, 21 May 2019 12:17:30 -0700 Subject: [PATCH 033/634] added method: get_image_sizes() and associated data structures. This method will be used when the data store is being used as a local cache, and all ranks on a node read/write images to a shared memory segment. 
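The bookkeeping this message describes amounts to: each rank stats a strided subset of the image files, the per-file byte counts are all-gathered across the trainer, and offsets into the shared memory segment fall out as a running (exclusive prefix) sum of the sizes. A minimal self-contained sketch of those two pieces, shown before the diff; every name other than m_image_sizes / m_image_offsets is invented here, and the all-gather itself is omitted.

    // Sketch only: per-rank file-size measurement plus offset computation.
    #include <fstream>
    #include <string>
    #include <vector>

    // Rank r measures files r, r+np, r+2*np, ... (round-robin assignment).
    std::vector<size_t> measure_my_files(const std::vector<std::string> &names,
                                         int rank, int np) {
      std::vector<size_t> sizes;
      for (size_t h = rank; h < names.size(); h += np) {
        // Open at end-of-file; tellg() then reports the size in bytes.
        std::ifstream in(names[h], std::ifstream::ate | std::ifstream::binary);
        sizes.push_back(static_cast<size_t>(in.tellg()));
      }
      return sizes;
    }

    // Once the per-image sizes have been gathered into global order, the
    // offset of each raw image in the shared segment is a prefix sum.
    std::vector<size_t> compute_offsets(const std::vector<size_t> &sizes) {
      std::vector<size_t> offsets(sizes.size() + 1, 0);
      for (size_t i = 0; i < sizes.size(); ++i) {
        offsets[i + 1] = offsets[i] + sizes[i]; // exclusive prefix sum
      }
      return offsets;
    }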
--- .../lbann/data_store/data_store_conduit.hpp | 9 ++ src/data_store/data_store_conduit.cpp | 85 +++++++++++++++++++ 2 files changed, 94 insertions(+) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 97e7e5cd997..9d73e3f8f54 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -290,6 +290,15 @@ protected : /// used in exchange_data_by_sample, when sample sizes are non-uniform bool m_have_sample_sizes; + + /// fills in m_image_name_to_index, m_image_sizes, and m_image_offsets + void get_image_sizes(); + + /// number of bytes in each image + std::vector m_image_sizes; + + /// offset at which the raw image will be stored in a shared memory segment + std::vector m_image_offsets; }; } // namespace lbann diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 8bb95e29772..27ee844a08f 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -72,6 +72,10 @@ data_store_conduit::data_store_conduit( LBANN_ERROR("you cannot use both of these options: --data_store_cache --preload_data_store"); } + if (m_is_local_cache) { + get_image_sizes(); + } + if (m_world_master) { if (m_is_local_cache) { std::cout << "data_store_conduit is running in local_cache mode\n"; @@ -1006,5 +1010,86 @@ void data_store_conduit::set_preload() { m_preload = true; } +void data_store_conduit::get_image_sizes() { + options *opts = options::get(); + /// this block fires if image sizes have been precomputed + if (opts->has_string("image_sizes_filename")) { + LBANN_ERROR("not yet implemented"); + } + + else { + // get list of image file names + const std::string image_list_file = m_reader->get_data_filename(); + const std::string image_dir = m_reader->get_file_dir(); + FILE *fplist = fopen(image_list_file.c_str(), "rt"); + std::vector image_file_names; + int imagelabel; + while (!feof(fplist)) { + char imagepath[512]; + if (fscanf(fplist, "%s%d", imagepath, &imagelabel) <= 1) { + break; + } + image_file_names.emplace_back(imagepath); + } + fclose(fplist); + + // get sizes of files for which I'm responsible + // TODO: should add threading to reduce computation time + std::vector my_sizes; + for (size_t h=m_rank_in_trainer; h counts(m_np_in_trainer); + m_comm->all_gather(&my_count, 1, counts.data(), 1, m_comm->get_trainer_comm()); + size_t g_count = std::accumulate(counts.begin(), counts.end(), 0); + if (g_count != image_file_names.size()) { + LBANN_ERROR("g_count != image_file_names.size()"); + } + std::vector work(image_file_names.size()); + std::vector disp(m_np_in_trainer); + disp[0] = 0; + for (size_t h=0; htrainer_all_gather(my_sizes, work, counts, disp); + + // fill in m_image_sizes and m_image_offsets + m_image_sizes.resize(image_file_names.size()); + for (int rank = 0; rank < m_np_in_trainer; rank++) { + size_t offset = disp[rank]; + size_t count = counts[rank]; + size_t i = rank; + for (size_t j=offset; j Date: Tue, 9 Apr 2019 16:53:29 -0700 Subject: [PATCH 034/634] DenseNet Python --- .../densenet/generated_densenet.prototext | 4522 +++++++++++++++++ model_zoo/vision/densenet.py | 544 ++ 2 files changed, 5066 insertions(+) create mode 100644 model_zoo/models/densenet/generated_densenet.prototext create mode 100755 model_zoo/vision/densenet.py diff --git a/model_zoo/models/densenet/generated_densenet.prototext b/model_zoo/models/densenet/generated_densenet.prototext new file mode 100644 index 
00000000000..07e4423f5c9 --- /dev/null +++ b/model_zoo/models/densenet/generated_densenet.prototext @@ -0,0 +1,4522 @@ +data_reader { + reader { + name: "imagenet" + role: "train" + shuffle: true + data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/" + data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels/train.txt" + validation_percent: 0.01 + percent_of_data_to_use: 1.0 + image_preprocessor { + cropper { + crop_randomly: true + crop_width: 224 + crop_height: 224 + resized_width: 256 + resized_height: 256 + } + augmenter { + horizontal_flip: true + } + colorizer { + } + subtractor { + disable: true + image_to_sub: "mean-256x256x3-6.bin" + } + normalizer { + z_score: true + } + } + num_labels: 1000 + } + reader { + name: "imagenet" + role: "test" + shuffle: true + data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/" + data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels/val.txt" + percent_of_data_to_use: 1.0 + image_preprocessor { + cropper { + crop_width: 224 + crop_height: 224 + resized_width: 256 + resized_height: 256 + } + augmenter { + disable: true + } + colorizer { + } + subtractor { + disable: true + image_to_sub: "mean-256x256x3-6.bin" + } + normalizer { + z_score: true + } + } + num_labels: 1000 + } +} +model { + objective_function { + layer_term { + scale_factor: 1.0 + layer: "layer434" + } + l2_weight_regularization { + scale_factor: 0.0001 + } + } + metric { + layer_metric { + layer: "layer435" + name: "top-1 accuracy" + unit: "%" + } + } + metric { + layer_metric { + layer: "layer436" + name: "top-5 accuracy" + unit: "%" + } + } + mini_batch_size: 256 + num_epochs: 90 + block_size: 256 + layer { + name: "layer1" + children: "layer2 layer3" + data_layout: "data_parallel" + input { + } + } + layer { + name: "layer3" + parents: "layer1" + children: "layer434 layer435 layer436" + data_layout: "data_parallel" + identity { + } + } + layer { + name: "layer2" + parents: "layer1" + children: "layer4" + data_layout: "data_parallel" + identity { + } + } + layer { + name: "layer4" + parents: "layer2" + children: "layer5" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 64 + conv_dims_i: 7 + conv_pads_i: 3 + conv_strides_i: 2 + } + } + layer { + name: "layer5" + parents: "layer4" + children: "layer6" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer6" + parents: "layer5" + children: "layer7" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer7" + parents: "layer6" + children: "layer8 layer15 layer22 layer29 layer36 layer43 layer50" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims_i: 3 + pool_pads_i: 1 + pool_strides_i: 2 + pool_mode: "max" + } + } + layer { + name: "layer8" + parents: "layer7" + children: "layer9" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer9" + parents: "layer8" + children: "layer10" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer10" + parents: "layer9" + children: "layer11" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer11" + parents: "layer10" + children: "layer12" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer12" + parents: "layer11" + children: "layer13" + data_layout: 
"data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer13" + parents: "layer12" + children: "layer14" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer14" + parents: "layer13" + children: "layer15 layer22 layer29 layer36 layer43 layer50" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer15" + parents: "layer7 layer14" + children: "layer16" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer16" + parents: "layer15" + children: "layer17" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer17" + parents: "layer16" + children: "layer18" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer18" + parents: "layer17" + children: "layer19" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer19" + parents: "layer18" + children: "layer20" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer20" + parents: "layer19" + children: "layer21" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer21" + parents: "layer20" + children: "layer22 layer29 layer36 layer43 layer50" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer22" + parents: "layer7 layer14 layer21" + children: "layer23" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer23" + parents: "layer22" + children: "layer24" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer24" + parents: "layer23" + children: "layer25" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer25" + parents: "layer24" + children: "layer26" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer26" + parents: "layer25" + children: "layer27" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer27" + parents: "layer26" + children: "layer28" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer28" + parents: "layer27" + children: "layer29 layer36 layer43 layer50" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer29" + parents: "layer7 layer14 layer21 layer28" + children: "layer30" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer30" + parents: "layer29" + children: "layer31" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer31" + parents: "layer30" + children: "layer32" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer32" + parents: "layer31" + children: "layer33" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer33" + 
parents: "layer32" + children: "layer34" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer34" + parents: "layer33" + children: "layer35" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer35" + parents: "layer34" + children: "layer36 layer43 layer50" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer36" + parents: "layer7 layer14 layer21 layer28 layer35" + children: "layer37" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer37" + parents: "layer36" + children: "layer38" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer38" + parents: "layer37" + children: "layer39" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer39" + parents: "layer38" + children: "layer40" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer40" + parents: "layer39" + children: "layer41" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer41" + parents: "layer40" + children: "layer42" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer42" + parents: "layer41" + children: "layer43 layer50" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer43" + parents: "layer7 layer14 layer21 layer28 layer35 layer42" + children: "layer44" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer44" + parents: "layer43" + children: "layer45" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer45" + parents: "layer44" + children: "layer46" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer46" + parents: "layer45" + children: "layer47" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer47" + parents: "layer46" + children: "layer48" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer48" + parents: "layer47" + children: "layer49" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer49" + parents: "layer48" + children: "layer50" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer50" + parents: "layer7 layer14 layer21 layer28 layer35 layer42 layer49" + children: "layer51" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer51" + parents: "layer50" + children: "layer52" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer52" + parents: "layer51" + children: "layer53" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer53" + parents: "layer52" + children: "layer54" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + 
conv_strides_i: 1 + } + } + layer { + name: "layer54" + parents: "layer53" + children: "layer55 layer62 layer69 layer76 layer83 layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims_i: 2 + pool_strides_i: 2 + pool_mode: "average" + } + } + layer { + name: "layer55" + parents: "layer54" + children: "layer56" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer56" + parents: "layer55" + children: "layer57" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer57" + parents: "layer56" + children: "layer58" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer58" + parents: "layer57" + children: "layer59" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer59" + parents: "layer58" + children: "layer60" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer60" + parents: "layer59" + children: "layer61" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer61" + parents: "layer60" + children: "layer62 layer69 layer76 layer83 layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer62" + parents: "layer54 layer61" + children: "layer63" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer63" + parents: "layer62" + children: "layer64" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer64" + parents: "layer63" + children: "layer65" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer65" + parents: "layer64" + children: "layer66" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer66" + parents: "layer65" + children: "layer67" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer67" + parents: "layer66" + children: "layer68" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer68" + parents: "layer67" + children: "layer69 layer76 layer83 layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer69" + parents: "layer54 layer61 layer68" + children: "layer70" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer70" + parents: "layer69" + children: "layer71" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer71" + parents: "layer70" + children: "layer72" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer72" + parents: "layer71" + children: "layer73" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer73" + parents: "layer72" + children: "layer74" + data_layout: 
"data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer74" + parents: "layer73" + children: "layer75" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer75" + parents: "layer74" + children: "layer76 layer83 layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer76" + parents: "layer54 layer61 layer68 layer75" + children: "layer77" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer77" + parents: "layer76" + children: "layer78" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer78" + parents: "layer77" + children: "layer79" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer79" + parents: "layer78" + children: "layer80" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer80" + parents: "layer79" + children: "layer81" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer81" + parents: "layer80" + children: "layer82" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer82" + parents: "layer81" + children: "layer83 layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer83" + parents: "layer54 layer61 layer68 layer75 layer82" + children: "layer84" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer84" + parents: "layer83" + children: "layer85" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer85" + parents: "layer84" + children: "layer86" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer86" + parents: "layer85" + children: "layer87" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer87" + parents: "layer86" + children: "layer88" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer88" + parents: "layer87" + children: "layer89" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer89" + parents: "layer88" + children: "layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer90" + parents: "layer54 layer61 layer68 layer75 layer82 layer89" + children: "layer91" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer91" + parents: "layer90" + children: "layer92" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer92" + parents: "layer91" + children: "layer93" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer93" + parents: "layer92" + children: "layer94" + 
data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer94" + parents: "layer93" + children: "layer95" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer95" + parents: "layer94" + children: "layer96" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer96" + parents: "layer95" + children: "layer97 layer104 layer111 layer118 layer125 layer132 layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer97" + parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96" + children: "layer98" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer98" + parents: "layer97" + children: "layer99" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer99" + parents: "layer98" + children: "layer100" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer100" + parents: "layer99" + children: "layer101" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer101" + parents: "layer100" + children: "layer102" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer102" + parents: "layer101" + children: "layer103" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer103" + parents: "layer102" + children: "layer104 layer111 layer118 layer125 layer132 layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer104" + parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96 layer103" + children: "layer105" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer105" + parents: "layer104" + children: "layer106" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer106" + parents: "layer105" + children: "layer107" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer107" + parents: "layer106" + children: "layer108" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer108" + parents: "layer107" + children: "layer109" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer109" + parents: "layer108" + children: "layer110" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer110" + parents: "layer109" + children: "layer111 layer118 layer125 layer132 layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer111" + parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96 layer103 layer110" + children: "layer112" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer112" + parents: "layer111" + children: "layer113" + data_layout: "data_parallel" + batch_normalization 
{ + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer113" + parents: "layer112" + children: "layer114" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer114" + parents: "layer113" + children: "layer115" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer115" + parents: "layer114" + children: "layer116" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer116" + parents: "layer115" + children: "layer117" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer117" + parents: "layer116" + children: "layer118 layer125 layer132 layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer118" + parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96 layer103 layer110 layer117" + children: "layer119" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer119" + parents: "layer118" + children: "layer120" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer120" + parents: "layer119" + children: "layer121" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer121" + parents: "layer120" + children: "layer122" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer122" + parents: "layer121" + children: "layer123" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer123" + parents: "layer122" + children: "layer124" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer124" + parents: "layer123" + children: "layer125 layer132 layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer125" + parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96 layer103 layer110 layer117 layer124" + children: "layer126" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer126" + parents: "layer125" + children: "layer127" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer127" + parents: "layer126" + children: "layer128" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer128" + parents: "layer127" + children: "layer129" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer129" + parents: "layer128" + children: "layer130" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer130" + parents: "layer129" + children: "layer131" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer131" + parents: "layer130" + children: "layer132 layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer132" + parents: "layer54 layer61 layer68 layer75 
layer82 layer89 layer96 layer103 layer110 layer117 layer124 layer131" + children: "layer133" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer133" + parents: "layer132" + children: "layer134" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer134" + parents: "layer133" + children: "layer135" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer135" + parents: "layer134" + children: "layer136" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer136" + parents: "layer135" + children: "layer137" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer137" + parents: "layer136" + children: "layer138" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer138" + parents: "layer137" + children: "layer139" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer139" + parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96 layer103 layer110 layer117 layer124 layer131 layer138" + children: "layer140" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer140" + parents: "layer139" + children: "layer141" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer141" + parents: "layer140" + children: "layer142" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer142" + parents: "layer141" + children: "layer143" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 256 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer143" + parents: "layer142" + children: "layer144 layer151 layer158 layer165 layer172 layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims_i: 2 + pool_strides_i: 2 + pool_mode: "average" + } + } + layer { + name: "layer144" + parents: "layer143" + children: "layer145" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer145" + parents: "layer144" + children: "layer146" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer146" + parents: "layer145" + children: "layer147" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer147" + parents: "layer146" + children: "layer148" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer148" + parents: "layer147" + children: "layer149" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer149" + parents: "layer148" + children: "layer150" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer150" + parents: "layer149" + children: "layer151 layer158 layer165 layer172 layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 
layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer151" + parents: "layer143 layer150" + children: "layer152" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer152" + parents: "layer151" + children: "layer153" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer153" + parents: "layer152" + children: "layer154" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer154" + parents: "layer153" + children: "layer155" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer155" + parents: "layer154" + children: "layer156" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer156" + parents: "layer155" + children: "layer157" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer157" + parents: "layer156" + children: "layer158 layer165 layer172 layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer158" + parents: "layer143 layer150 layer157" + children: "layer159" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer159" + parents: "layer158" + children: "layer160" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer160" + parents: "layer159" + children: "layer161" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer161" + parents: "layer160" + children: "layer162" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer162" + parents: "layer161" + children: "layer163" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer163" + parents: "layer162" + children: "layer164" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer164" + parents: "layer163" + children: "layer165 layer172 layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer165" + parents: "layer143 layer150 layer157 layer164" + children: "layer166" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer166" + parents: "layer165" + children: "layer167" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer167" + parents: "layer166" + children: "layer168" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer168" + parents: "layer167" + children: "layer169" + data_layout: "data_parallel" + convolution { + num_dims: 2 + 
num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer169" + parents: "layer168" + children: "layer170" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer170" + parents: "layer169" + children: "layer171" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer171" + parents: "layer170" + children: "layer172 layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer172" + parents: "layer143 layer150 layer157 layer164 layer171" + children: "layer173" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer173" + parents: "layer172" + children: "layer174" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer174" + parents: "layer173" + children: "layer175" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer175" + parents: "layer174" + children: "layer176" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer176" + parents: "layer175" + children: "layer177" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer177" + parents: "layer176" + children: "layer178" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer178" + parents: "layer177" + children: "layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer179" + parents: "layer143 layer150 layer157 layer164 layer171 layer178" + children: "layer180" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer180" + parents: "layer179" + children: "layer181" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer181" + parents: "layer180" + children: "layer182" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer182" + parents: "layer181" + children: "layer183" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer183" + parents: "layer182" + children: "layer184" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer184" + parents: "layer183" + children: "layer185" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer185" + parents: "layer184" + children: "layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { 
+ name: "layer186" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185" + children: "layer187" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer187" + parents: "layer186" + children: "layer188" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer188" + parents: "layer187" + children: "layer189" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer189" + parents: "layer188" + children: "layer190" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer190" + parents: "layer189" + children: "layer191" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer191" + parents: "layer190" + children: "layer192" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer192" + parents: "layer191" + children: "layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer193" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192" + children: "layer194" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer194" + parents: "layer193" + children: "layer195" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer195" + parents: "layer194" + children: "layer196" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer196" + parents: "layer195" + children: "layer197" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer197" + parents: "layer196" + children: "layer198" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer198" + parents: "layer197" + children: "layer199" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer199" + parents: "layer198" + children: "layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer200" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199" + children: "layer201" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer201" + parents: "layer200" + children: "layer202" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer202" + parents: "layer201" + children: "layer203" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer203" + parents: "layer202" + children: "layer204" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer204" + parents: "layer203" + children: "layer205" + 
data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer205" + parents: "layer204" + children: "layer206" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer206" + parents: "layer205" + children: "layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer207" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206" + children: "layer208" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer208" + parents: "layer207" + children: "layer209" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer209" + parents: "layer208" + children: "layer210" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer210" + parents: "layer209" + children: "layer211" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer211" + parents: "layer210" + children: "layer212" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer212" + parents: "layer211" + children: "layer213" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer213" + parents: "layer212" + children: "layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer214" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213" + children: "layer215" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer215" + parents: "layer214" + children: "layer216" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer216" + parents: "layer215" + children: "layer217" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer217" + parents: "layer216" + children: "layer218" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer218" + parents: "layer217" + children: "layer219" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer219" + parents: "layer218" + children: "layer220" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer220" + parents: "layer219" + children: "layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer221" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220" + children: "layer222" + data_layout: 
"data_parallel" + concatenation { + } + } + layer { + name: "layer222" + parents: "layer221" + children: "layer223" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer223" + parents: "layer222" + children: "layer224" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer224" + parents: "layer223" + children: "layer225" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer225" + parents: "layer224" + children: "layer226" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer226" + parents: "layer225" + children: "layer227" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer227" + parents: "layer226" + children: "layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer228" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227" + children: "layer229" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer229" + parents: "layer228" + children: "layer230" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer230" + parents: "layer229" + children: "layer231" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer231" + parents: "layer230" + children: "layer232" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer232" + parents: "layer231" + children: "layer233" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer233" + parents: "layer232" + children: "layer234" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer234" + parents: "layer233" + children: "layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer235" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234" + children: "layer236" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer236" + parents: "layer235" + children: "layer237" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer237" + parents: "layer236" + children: "layer238" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer238" + parents: "layer237" + children: "layer239" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer239" + parents: "layer238" + children: "layer240" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer240" + 
parents: "layer239" + children: "layer241" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer241" + parents: "layer240" + children: "layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer242" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241" + children: "layer243" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer243" + parents: "layer242" + children: "layer244" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer244" + parents: "layer243" + children: "layer245" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer245" + parents: "layer244" + children: "layer246" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer246" + parents: "layer245" + children: "layer247" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer247" + parents: "layer246" + children: "layer248" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer248" + parents: "layer247" + children: "layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer249" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248" + children: "layer250" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer250" + parents: "layer249" + children: "layer251" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer251" + parents: "layer250" + children: "layer252" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer252" + parents: "layer251" + children: "layer253" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer253" + parents: "layer252" + children: "layer254" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer254" + parents: "layer253" + children: "layer255" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer255" + parents: "layer254" + children: "layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer256" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255" + children: "layer257" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer257" + parents: "layer256" + children: "layer258" + data_layout: "data_parallel" + 
batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer258" + parents: "layer257" + children: "layer259" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer259" + parents: "layer258" + children: "layer260" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer260" + parents: "layer259" + children: "layer261" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer261" + parents: "layer260" + children: "layer262" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer262" + parents: "layer261" + children: "layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer263" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262" + children: "layer264" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer264" + parents: "layer263" + children: "layer265" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer265" + parents: "layer264" + children: "layer266" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer266" + parents: "layer265" + children: "layer267" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer267" + parents: "layer266" + children: "layer268" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer268" + parents: "layer267" + children: "layer269" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer269" + parents: "layer268" + children: "layer270 layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer270" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269" + children: "layer271" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer271" + parents: "layer270" + children: "layer272" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer272" + parents: "layer271" + children: "layer273" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer273" + parents: "layer272" + children: "layer274" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer274" + parents: "layer273" + children: "layer275" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer275" + parents: "layer274" + children: "layer276" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer276" + parents: "layer275" + 
children: "layer277 layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer277" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276" + children: "layer278" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer278" + parents: "layer277" + children: "layer279" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer279" + parents: "layer278" + children: "layer280" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer280" + parents: "layer279" + children: "layer281" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer281" + parents: "layer280" + children: "layer282" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer282" + parents: "layer281" + children: "layer283" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer283" + parents: "layer282" + children: "layer284 layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer284" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276 layer283" + children: "layer285" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer285" + parents: "layer284" + children: "layer286" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer286" + parents: "layer285" + children: "layer287" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer287" + parents: "layer286" + children: "layer288" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer288" + parents: "layer287" + children: "layer289" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer289" + parents: "layer288" + children: "layer290" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer290" + parents: "layer289" + children: "layer291 layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer291" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276 layer283 layer290" + children: "layer292" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer292" + parents: "layer291" + children: "layer293" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer293" + parents: "layer292" + children: 
"layer294" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer294" + parents: "layer293" + children: "layer295" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer295" + parents: "layer294" + children: "layer296" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer296" + parents: "layer295" + children: "layer297" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer297" + parents: "layer296" + children: "layer298 layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer298" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276 layer283 layer290 layer297" + children: "layer299" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer299" + parents: "layer298" + children: "layer300" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer300" + parents: "layer299" + children: "layer301" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer301" + parents: "layer300" + children: "layer302" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer302" + parents: "layer301" + children: "layer303" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer303" + parents: "layer302" + children: "layer304" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer304" + parents: "layer303" + children: "layer305 layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer305" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276 layer283 layer290 layer297 layer304" + children: "layer306" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer306" + parents: "layer305" + children: "layer307" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer307" + parents: "layer306" + children: "layer308" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer308" + parents: "layer307" + children: "layer309" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer309" + parents: "layer308" + children: "layer310" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer310" + parents: "layer309" + children: "layer311" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer311" + parents: "layer310" + children: "layer312" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + 
conv_strides_i: 1 + } + } + layer { + name: "layer312" + parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276 layer283 layer290 layer297 layer304 layer311" + children: "layer313" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer313" + parents: "layer312" + children: "layer314" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer314" + parents: "layer313" + children: "layer315" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer315" + parents: "layer314" + children: "layer316" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 512 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer316" + parents: "layer315" + children: "layer317 layer324 layer331 layer338 layer345 layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + pooling { + num_dims: 2 + pool_dims_i: 2 + pool_strides_i: 2 + pool_mode: "average" + } + } + layer { + name: "layer317" + parents: "layer316" + children: "layer318" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer318" + parents: "layer317" + children: "layer319" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer319" + parents: "layer318" + children: "layer320" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer320" + parents: "layer319" + children: "layer321" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer321" + parents: "layer320" + children: "layer322" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer322" + parents: "layer321" + children: "layer323" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer323" + parents: "layer322" + children: "layer324 layer331 layer338 layer345 layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer324" + parents: "layer316 layer323" + children: "layer325" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer325" + parents: "layer324" + children: "layer326" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer326" + parents: "layer325" + children: "layer327" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer327" + parents: "layer326" + children: "layer328" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer328" + parents: "layer327" + children: "layer329" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer329" + parents: "layer328" + children: "layer330" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer330" + parents: "layer329" + 
children: "layer331 layer338 layer345 layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer331" + parents: "layer316 layer323 layer330" + children: "layer332" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer332" + parents: "layer331" + children: "layer333" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer333" + parents: "layer332" + children: "layer334" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer334" + parents: "layer333" + children: "layer335" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer335" + parents: "layer334" + children: "layer336" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer336" + parents: "layer335" + children: "layer337" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer337" + parents: "layer336" + children: "layer338 layer345 layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer338" + parents: "layer316 layer323 layer330 layer337" + children: "layer339" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer339" + parents: "layer338" + children: "layer340" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer340" + parents: "layer339" + children: "layer341" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer341" + parents: "layer340" + children: "layer342" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer342" + parents: "layer341" + children: "layer343" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer343" + parents: "layer342" + children: "layer344" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer344" + parents: "layer343" + children: "layer345 layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer345" + parents: "layer316 layer323 layer330 layer337 layer344" + children: "layer346" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer346" + parents: "layer345" + children: "layer347" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer347" + parents: "layer346" + children: "layer348" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer348" + parents: "layer347" + children: "layer349" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + 
conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer349" + parents: "layer348" + children: "layer350" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer350" + parents: "layer349" + children: "layer351" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer351" + parents: "layer350" + children: "layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer352" + parents: "layer316 layer323 layer330 layer337 layer344 layer351" + children: "layer353" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer353" + parents: "layer352" + children: "layer354" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer354" + parents: "layer353" + children: "layer355" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer355" + parents: "layer354" + children: "layer356" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer356" + parents: "layer355" + children: "layer357" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer357" + parents: "layer356" + children: "layer358" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer358" + parents: "layer357" + children: "layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer359" + parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358" + children: "layer360" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer360" + parents: "layer359" + children: "layer361" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer361" + parents: "layer360" + children: "layer362" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer362" + parents: "layer361" + children: "layer363" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer363" + parents: "layer362" + children: "layer364" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer364" + parents: "layer363" + children: "layer365" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer365" + parents: "layer364" + children: "layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer366" + parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365" + children: "layer367" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer367" + parents: "layer366" + children: 
"layer368" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer368" + parents: "layer367" + children: "layer369" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer369" + parents: "layer368" + children: "layer370" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer370" + parents: "layer369" + children: "layer371" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer371" + parents: "layer370" + children: "layer372" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer372" + parents: "layer371" + children: "layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer373" + parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372" + children: "layer374" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer374" + parents: "layer373" + children: "layer375" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer375" + parents: "layer374" + children: "layer376" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer376" + parents: "layer375" + children: "layer377" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer377" + parents: "layer376" + children: "layer378" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer378" + parents: "layer377" + children: "layer379" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer379" + parents: "layer378" + children: "layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer380" + parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379" + children: "layer381" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer381" + parents: "layer380" + children: "layer382" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer382" + parents: "layer381" + children: "layer383" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer383" + parents: "layer382" + children: "layer384" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer384" + parents: "layer383" + children: "layer385" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer385" + parents: "layer384" + children: "layer386" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer386" + parents: "layer385" + children: "layer387 layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" 
+ convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer387" + parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386" + children: "layer388" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer388" + parents: "layer387" + children: "layer389" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer389" + parents: "layer388" + children: "layer390" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer390" + parents: "layer389" + children: "layer391" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer391" + parents: "layer390" + children: "layer392" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer392" + parents: "layer391" + children: "layer393" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer393" + parents: "layer392" + children: "layer394 layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer394" + parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393" + children: "layer395" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer395" + parents: "layer394" + children: "layer396" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer396" + parents: "layer395" + children: "layer397" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer397" + parents: "layer396" + children: "layer398" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer398" + parents: "layer397" + children: "layer399" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer399" + parents: "layer398" + children: "layer400" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer400" + parents: "layer399" + children: "layer401 layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer401" + parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393 layer400" + children: "layer402" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer402" + parents: "layer401" + children: "layer403" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer403" + parents: "layer402" + children: "layer404" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer404" + parents: "layer403" + children: "layer405" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer405" + parents: "layer404" + children: 
"layer406" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer406" + parents: "layer405" + children: "layer407" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer407" + parents: "layer406" + children: "layer408 layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer408" + parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393 layer400 layer407" + children: "layer409" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer409" + parents: "layer408" + children: "layer410" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer410" + parents: "layer409" + children: "layer411" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer411" + parents: "layer410" + children: "layer412" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer412" + parents: "layer411" + children: "layer413" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer413" + parents: "layer412" + children: "layer414" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer414" + parents: "layer413" + children: "layer415 layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer415" + parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393 layer400 layer407 layer414" + children: "layer416" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer416" + parents: "layer415" + children: "layer417" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer417" + parents: "layer416" + children: "layer418" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer418" + parents: "layer417" + children: "layer419" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 128 + conv_dims_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer419" + parents: "layer418" + children: "layer420" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 1e-05 + } + } + layer { + name: "layer420" + parents: "layer419" + children: "layer421" + data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer421" + parents: "layer420" + children: "layer422 layer429" + data_layout: "data_parallel" + convolution { + num_dims: 2 + num_output_channels: 32 + conv_dims_i: 3 + conv_pads_i: 1 + conv_strides_i: 1 + } + } + layer { + name: "layer422" + parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393 layer400 layer407 layer414 layer421" + children: "layer423" + data_layout: "data_parallel" + concatenation { + } + } + layer { + name: "layer423" + parents: "layer422" + children: "layer424" + data_layout: "data_parallel" + batch_normalization { + decay: 0.9 + scale_init: 1.0 + epsilon: 
1e-05
+    }
+  }
+  layer {
+    name: "layer424"
+    parents: "layer423"
+    children: "layer425"
+    data_layout: "data_parallel"
+    relu {
+    }
+  }
+  layer {
+    name: "layer425"
+    parents: "layer424"
+    children: "layer426"
+    data_layout: "data_parallel"
+    convolution {
+      num_dims: 2
+      num_output_channels: 128
+      conv_dims_i: 1
+      conv_strides_i: 1
+    }
+  }
+  layer {
+    name: "layer426"
+    parents: "layer425"
+    children: "layer427"
+    data_layout: "data_parallel"
+    batch_normalization {
+      decay: 0.9
+      scale_init: 1.0
+      epsilon: 1e-05
+    }
+  }
+  layer {
+    name: "layer427"
+    parents: "layer426"
+    children: "layer428"
+    data_layout: "data_parallel"
+    relu {
+    }
+  }
+  layer {
+    name: "layer428"
+    parents: "layer427"
+    children: "layer429"
+    data_layout: "data_parallel"
+    convolution {
+      num_dims: 2
+      num_output_channels: 32
+      conv_dims_i: 3
+      conv_pads_i: 1
+      conv_strides_i: 1
+    }
+  }
+  layer {
+    name: "layer429"
+    parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393 layer400 layer407 layer414 layer421 layer428"
+    children: "layer430"
+    data_layout: "data_parallel"
+    concatenation {
+    }
+  }
+  layer {
+    name: "layer430"
+    parents: "layer429"
+    children: "layer431"
+    data_layout: "data_parallel"
+    batch_normalization {
+      decay: 0.9
+      scale_init: 1.0
+      epsilon: 1e-05
+    }
+  }
+  layer {
+    name: "layer431"
+    parents: "layer430"
+    children: "layer432"
+    data_layout: "data_parallel"
+    pooling {
+      num_dims: 2
+      pool_dims_i: 7
+      pool_pads_i: 1
+      pool_strides_i: 1
+      pool_mode: "average"
+    }
+  }
+  layer {
+    name: "layer432"
+    parents: "layer431"
+    children: "layer433"
+    data_layout: "data_parallel"
+    fully_connected {
+      num_neurons: 1000
+    }
+  }
+  layer {
+    name: "layer433"
+    parents: "layer432"
+    children: "layer434 layer435 layer436"
+    data_layout: "data_parallel"
+    softmax {
+    }
+  }
+  layer {
+    name: "layer436"
+    parents: "layer433 layer3"
+    data_layout: "data_parallel"
+    top_k_categorical_accuracy {
+      k: 5
+    }
+  }
+  layer {
+    name: "layer435"
+    parents: "layer433 layer3"
+    data_layout: "data_parallel"
+    categorical_accuracy {
+    }
+  }
+  layer {
+    name: "layer434"
+    parents: "layer433 layer3"
+    data_layout: "data_parallel"
+    cross_entropy {
+    }
+  }
+  callback {
+    print {
+    }
+  }
+  callback {
+    timer {
+    }
+  }
+  callback {
+    drop_fixed_learning_rate {
+      drop_epoch: 30
+      drop_epoch: 60
+      amt: 0.1
+    }
+  }
+}
+optimizer {
+  sgd {
+    learn_rate: 0.1
+    momentum: 0.9
+    nesterov: true
+  }
+}
diff --git a/model_zoo/vision/densenet.py b/model_zoo/vision/densenet.py
new file mode 100755
index 00000000000..08487045d63
--- /dev/null
+++ b/model_zoo/vision/densenet.py
@@ -0,0 +1,544 @@
+#!/usr/bin/env python3
+import argparse
+import os.path
+import subprocess
+import google.protobuf.text_format as txtf
+import lbann
+import lbann.contrib.args
+import lbann.contrib.lc.launcher
+
+# TODO: Add trainer argument after PR #916 merges
+
+LOG = True
+
+
+def log(string):
+    if LOG:
+        print(string)
+
+# Commands to run ##############################################################
+
+# Allocate nodes on Pascal over SSH:
+# salloc --nodes=16 --partition=pbatch --time=180
+
+# From lbann/model_zoo/vision:
+# ./densenet.py
+#   --disable-run          (if the experiment should not be run)
+#   --mini-batch-size 128  (for a mini-batch size other than the default 256)
+#   --nodes 16             (if more than one node is to be used; 16 is optimal)
+#   --procs-per-node 2
+
+# To run the full 90 epochs over SSH:
+# ./densenet.py --nodes 16 --procs-per-node 2 > /usr/workspace/wsb//lbann/model_zoo/vision/output.txt
+#   mini-batch-size default => 256, num-epochs default => 90
+
+# To run a 10-epoch test over SSH:
+# ./densenet.py --nodes 16 --procs-per-node 2 --mini-batch-size 256 --num-epochs 10 > /usr/workspace/wsb//lbann/model_zoo/vision/output.txt
+
+# To avoid needing to stay logged in over SSH, create a script
+# densenet_batch_job.cmd such as:
+# #!/bin/bash
+# #SBATCH --nodes 16
+# #SBATCH --partition pbatch
+# #SBATCH --time 240
+# ./densenet.py --nodes 16 --procs-per-node 2 --mini-batch-size 256 --num-epochs 10 > /usr/workspace/wsb//lbann/model_zoo/vision/output.txt
+
+# and from lbann/model_zoo/vision run:
+# sbatch densenet_batch_job.cmd
+
+# To generate a visualization, from lbann run:
+# scripts/viz.py model_zoo/models/densenet/generated_densenet.prototext
+
+# Copy the output file, experiment directory, and visualization
+# from LC to your computer by running the following commands from your computer:
+# scp @pascal.llnl.gov:/usr/workspace/wsb//lbann/model_zoo/vision/output.txt .
+# scp -r @pascal.llnl.gov:/usr/workspace/wsb//lbann/experiments/_lbann_densenet/ .
+# scp @pascal.llnl.gov:/usr/workspace/wsb//lbann/graph.pdf .
+
+
+# DenseNet #####################################################################
+# See src/proto/lbann.proto for possible functions to call.
+# See PyTorch DenseNet:
+# https://github.com/pytorch/vision/blob/master/torchvision/models/densenet.py
+# See "Densely Connected Convolutional Networks" by Huang et al., p. 4.
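+# Channel bookkeeping (a worked check, assuming the DenseNet-121
+# configuration used below: growth rate k=32, blocks of (6, 12, 24, 16)
+# dense layers, 64 initial features, and transition layers that halve the
+# feature count):
+#
+#   after the initial 7x7 conv:                64
+#   after dense block 1:    64 +  6*32  =     256
+#   after transition 1:     256 // 2    =     128
+#   after dense block 2:    128 + 12*32 =     512
+#   after transition 2:     512 // 2    =     256
+#   after dense block 3:    256 + 24*32 =    1024
+#   after transition 3:     1024 // 2   =     512
+#   after dense block 4:    512 + 16*32 =    1024
+#
+# The final batch normalization therefore sees 1024 feature maps, matching
+# Table 1 of the paper.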
+# mini-batch-size default => 256, num-epochs => 90
+
+# To run 10 epoch test from ssh:
+# ./densenet.py --nodes 16 --procs-per-node 2 --mini-batch-size 256 --num-epochs 10 > /usr/workspace/wsb/<username>/lbann/model_zoo/vision/output.txt
+
+# To avoid needing to stay logged into ssh, create a script
+# densenet_batch_job.cmd such as:
+# #!/bin/bash
+# #SBATCH --nodes 16
+# #SBATCH --partition pbatch
+# #SBATCH --time 240
+# ./densenet.py --nodes 16 --procs-per-node 2 --mini-batch-size 256 --num-epochs 10 > /usr/workspace/wsb/<username>/lbann/model_zoo/vision/output.txt
+
+# and from lbann/model_zoo/vision run:
+# sbatch densenet_batch_job.cmd
+
+# To generate visualization, from lbann run:
+# scripts/viz.py model_zoo/models/densenet/generated_densenet.prototext
+
+# Copy the output file, experiment directory, and visualization
+# from LC to your computer by running the following commands from your computer:
+# scp <username>@pascal.llnl.gov:/usr/workspace/wsb/<username>/lbann/model_zoo/vision/output.txt .
+# scp -r <username>@pascal.llnl.gov:/usr/workspace/wsb/<username>/lbann/experiments/<timestamp>_lbann_densenet/ .
+# scp <username>@pascal.llnl.gov:/usr/workspace/wsb/<username>/lbann/graph.pdf .
+
+
+# DenseNet #####################################################################
+# See src/proto/lbann.proto for possible functions to call.
+# See PyTorch DenseNet:
+# https://github.com/pytorch/vision/blob/master/torchvision/models/densenet.py
+# See "Densely Connected Convolutional Networks" by Huang et al., p. 4
+def densenet(version,
+             cumulative_layer_num,
+             images_node
+             ):
+    if version == 121:
+        growth_rate = 32  # k in the paper
+        layers_per_block = (6, 12, 24, 16)
+        num_initial_features = 64
+    elif version == 161:
+        growth_rate = 48  # k in the paper
+        layers_per_block = (6, 12, 36, 24)
+        num_initial_features = 96
+    else:
+        raise Exception('Invalid version={v}.'.format(v=version))
+    batch_norm_size = 4
+
+    parent_node, cumulative_layer_num = initial_layer(
+        cumulative_layer_num, images_node,
+        num_initial_features)
+    num_features = num_initial_features
+    # Start counting dense blocks at 1.
+    for current_block_num, num_layers in enumerate(layers_per_block, 1):
+        parent_nodes, cumulative_layer_num = dense_block(
+            cumulative_layer_num,
+            parent_node,
+            batch_norm_size=batch_norm_size,
+            current_block_num=current_block_num,
+            growth_rate=growth_rate,
+            num_layers=num_layers,
+            num_initial_channels=num_initial_features
+        )
+        # num_features += num_layers * growth_rate
+        for node in parent_nodes[1:]:
+            num_features += node.num_output_channels
+        parent_node = lbann.Concatenation(parent_nodes)
+        cumulative_layer_num += 1
+        log('densenet Concatenation. cumulative_layer_num={n}'.format(
+            b=current_block_num, n=cumulative_layer_num))
+        if current_block_num != len(layers_per_block):
+            parent_node, cumulative_layer_num = transition_layer(
+                current_block_num,
+                cumulative_layer_num,
+                parent_node,
+                # In Python 3, this is integer division.
+                num_output_channels=num_features//2,
+            )
+            num_features //= 2
+
+    batch_normalization_node = standard_batchnorm(parent_node)
+    cumulative_layer_num += 1
+    log('densenet BatchNormalization. 
cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + probs = classification_layer( + cumulative_layer_num, + batch_normalization_node + ) + return probs + + +def initial_layer(cumulative_layer_num, + images_node, + num_initial_channels + ): + # 7x7 conv, stride 2 + convolution_node = lbann.Convolution( + images_node, + conv_dims_i=7, + conv_pads_i=3, + conv_strides_i=2, + has_bias=False, + num_dims=2, + num_output_channels=num_initial_channels + ) + cumulative_layer_num += 1 + log('initial_layer Convolution. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + batch_normalization_node = standard_batchnorm(convolution_node) + cumulative_layer_num += 1 + log('initial_layer BatchNormalization. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + relu_node = lbann.Relu(batch_normalization_node) + cumulative_layer_num += 1 + log('initial_layer Relu. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + # 3x3 max pool, stride 2 + pooling_node = lbann.Pooling( + relu_node, + num_dims=2, + pool_dims_i=3, + pool_mode='max', + pool_pads_i=1, + pool_strides_i=2 + ) + cumulative_layer_num += 1 + log('initial_layer Pooling. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + return pooling_node, cumulative_layer_num + + +def standard_batchnorm(parent_node): + return lbann.BatchNormalization( + parent_node, + bias_init=0.0, + decay=0.9, + epsilon=1e-5, + scale_init=1.0 + ) + + +def dense_block(cumulative_layer_num, + parent_node, + batch_norm_size, + current_block_num, + growth_rate, + num_layers, + num_initial_channels + ): + parent_nodes = [parent_node] + # Start counting dense layers at 1. + for current_layer_num in range(1, num_layers + 1): + # channels from before block + (each dense layer has k=growth_rate channels) + num_input_channels = num_initial_channels + (current_layer_num - 1) * growth_rate + print('num_input_channels={c}'.format(c=num_input_channels)) + parent_node, cumulative_layer_num = dense_layer( + current_block_num, + current_layer_num, + cumulative_layer_num, + parent_nodes, + batch_norm_size=batch_norm_size, + growth_rate=growth_rate + ) + parent_nodes.append(parent_node) + return parent_nodes, cumulative_layer_num + + +def dense_layer(current_block_num, + current_layer_num, + cumulative_layer_num, + parent_nodes, + batch_norm_size, + growth_rate + ): + concatenation_node = lbann.Concatenation(parent_nodes) + cumulative_layer_num += 1 + log('dense_block={b} dense_layer={l} Concatenation. cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + conv_block_1_node, cumulative_layer_num = conv_block( + current_block_num, + current_layer_num, + cumulative_layer_num, + concatenation_node, + conv_dims_i=1, + conv_pads_i=0, + num_output_channels=batch_norm_size * growth_rate + ) + conv_block_2_node, cumulative_layer_num = conv_block( + current_block_num, + current_layer_num, + cumulative_layer_num, + conv_block_1_node, + conv_dims_i=3, + conv_pads_i=1, + num_output_channels=growth_rate + ) + return conv_block_2_node, cumulative_layer_num + + +def conv_block(current_block_num, + current_layer_num, + cumulative_layer_num, + parent_node, + conv_dims_i, + conv_pads_i, + num_output_channels + ): + batch_normalization_node = standard_batchnorm(parent_node) + cumulative_layer_num += 1 + log('dense_block={b} dense_layer={l} BatchNormalization. 
cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + + relu_node = lbann.Relu(batch_normalization_node) + cumulative_layer_num += 1 + log( + 'dense_block={b} dense_layer={l} Relu. cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + + convolution_node = lbann.Convolution( + relu_node, + conv_dims_i=conv_dims_i, + conv_pads_i=conv_pads_i, + conv_strides_i=1, + has_bias=False, + num_dims=2, + num_output_channels=num_output_channels + ) + cumulative_layer_num += 1 + log('dense_block={b} dense_layer={l} Convolution. cumulative_layer_num={n}'.format( + b=current_block_num, l=current_layer_num, n=cumulative_layer_num)) + + return convolution_node, cumulative_layer_num + + +def transition_layer(current_block_num, + cumulative_layer_num, + parent_node, + num_output_channels + ): + batch_normalization_node = standard_batchnorm(parent_node) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer BatchNormalization. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + relu_node = lbann.Relu(batch_normalization_node) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer Relu. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + convolution_node = lbann.Convolution( + relu_node, + conv_dims_i=1, + conv_pads_i=0, + conv_strides_i=1, + has_bias=False, + num_dims=2, + num_output_channels=num_output_channels + ) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer Convolution. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + # 2x2 average pool, stride 2 + pooling_node = lbann.Pooling( + convolution_node, + num_dims=2, + pool_dims_i=2, + pool_mode='average', + pool_pads_i=0, + pool_strides_i=2 + ) + cumulative_layer_num += 1 + log('dense_block={b} > transition_layer Pooling. cumulative_layer_num={n}'.format( + b=current_block_num, n=cumulative_layer_num)) + + return pooling_node, cumulative_layer_num + + +def classification_layer(cumulative_layer_num, + parent_node): + # 7x7 global average pool + pooling_node = lbann.Pooling( + parent_node, + num_dims=2, + pool_dims_i=7, + pool_mode='average', + pool_pads_i=1, + pool_strides_i=1 + ) + cumulative_layer_num += 1 + log('classification_layer Pooling. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + fully_connected_node = lbann.FullyConnected( + pooling_node, + num_neurons=1000, + has_bias=False + ) + cumulative_layer_num += 1 + log('classification_layer FullyConnected. cumulative_layer_num={n}'.format( + n=cumulative_layer_num)) + + probabilities = lbann.Softmax(fully_connected_node) + return probabilities + + +# Helpful Functions ############################################################ +def get_args(): + desc = ('Construct and run DenseNet on ImageNet data. 
' + 'Running the experiment is only supported on LC systems.') + parser = argparse.ArgumentParser(description=desc) + lbann.contrib.args.add_scheduler_arguments(parser) + parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') + parser.add_argument( + '--num-epochs', action='store', default=90, type=int, + help='number of epochs (default: 90)', metavar='NUM') + parser.add_argument( + '--num-labels', action='store', default=1000, type=int, + help='number of data classes (default: 1000)', metavar='NUM') + lbann.contrib.args.add_optimizer_arguments( + parser, + default_optimizer='sgd', + default_learning_rate=0.1 + ) + lbann_dir = subprocess.check_output( + 'git rev-parse --show-toplevel'.split()).strip() + # https://stackoverflow.com/questions/606191/convert-bytes-to-a-string + lbann_dir = lbann_dir.decode("utf-8") + data_reader_prototext = os.path.join(lbann_dir, + 'model_zoo', + 'data_readers', + 'data_reader_imagenet.prototext') + parser.add_argument( + '--data-reader', action='store', + default=data_reader_prototext, type=str, + help='data reader prototext file (default: ' + data_reader_prototext + ')', + metavar='FILE') + parser.add_argument( + '--imagenet-classes', action='store', type=int, + help='number of ImageNet-1K classes (availability of subsampled datasets may vary by system)', + metavar='NUM') + generated_prototext = os.path.join(lbann_dir, + 'model_zoo', + 'models', + 'densenet', + 'generated_densenet.prototext') + parser.add_argument( + '--prototext', action='store', + default=generated_prototext, type=str, + help='exported prototext file', metavar='FILE') + parser.add_argument( + '--disable-run', action='store_true', + help='do not run experiment (e.g. if only the prototext is desired)') + args = parser.parse_args() + return args + + +def construct_layer_graph( + version, + cumulative_layer_num, + input_node): + # Input data + images_node = lbann.Identity(input_node) + cumulative_layer_num += 1 + log('Identity. cumulative_layer_num={n}'.format(n=cumulative_layer_num)) + + # Use input_node, not images_node. + image_labels_node = lbann.Identity(input_node) + cumulative_layer_num += 1 + log('Identity. cumulative_layer_num={n}'.format(n=cumulative_layer_num)) + + # Use images_node, not image_labels_node. 
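+    # (Both Identity layers attach to the Input layer, which supplies the
+    # image data to its first child and the labels to its second; hence the
+    # reminders above about which node to use at each step.)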
+ probabilities = densenet(version, cumulative_layer_num, images_node) + + return probabilities, image_labels_node + + +def set_up_experiment(args, + input_, + probs, + labels): + # Set up objective function + cross_entropy = lbann.CrossEntropy([probs, labels]) + layers = list(lbann.traverse_layer_graph(input_)) + weights = set() + for l in layers: + weights.update(l.weights) + # scale = weight decay + l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4) + objective_function = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Set up model + top1 = lbann.CategoricalAccuracy([probs, labels]) + top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5) + metrics = [lbann.Metric(top1, name='top-1 accuracy', unit='%'), + lbann.Metric(top5, name='top-5 accuracy', unit='%')] + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDropFixedLearningRate( + drop_epoch=[30, 60], amt=0.1)] + model = lbann.Model(args.mini_batch_size, + args.num_epochs, + layers=layers, + weights=weights, + objective_function=objective_function, + metrics=metrics, + callbacks=callbacks) + + # Load data reader from prototext + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(args.data_reader, 'r') as f: + txtf.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + # Set up optimizer + if args.optimizer == 'sgd': + print('Creating sgd optimizer') + optimizer = lbann.optimizer.SGD( + learn_rate=args.optimizer_learning_rate, + momentum=0.9, + nesterov=True + ) + else: + optimizer = lbann.contrib.args.create_optimizer(args) + + # Save prototext to args.prototext + if args.prototext: + lbann.proto.save_prototext(args.prototext, + model=model, + optimizer=optimizer, + data_reader=data_reader_proto) + + return model, data_reader_proto, optimizer + + +def run_experiment(args, + model, + data_reader_proto, + optimizer): + # Run experiment + if not args.disable_run: + from lbann.contrib.lc.paths import imagenet_dir, imagenet_labels + import lbann.contrib.lc.launcher + kwargs = {} + if args.nodes: + kwargs['nodes'] = args.nodes + if args.procs_per_node: + kwargs['procs_per_node'] = args.procs_per_node + if args.partition: + kwargs['partition'] = args.partition + if args.account: + kwargs['account'] = args.account + if args.time_limit: + kwargs['time_limit'] = args.time_limit + if args.imagenet_classes: + classes = args.imagenet_classes + kwargs['lbann_args'] = ( + '--data_filedir_train={} --data_filename_train={} ' + '--data_filedir_test={} --data_filename_test={}' + .format(imagenet_dir(data_set='train', num_classes=classes), + imagenet_labels(data_set='train', + num_classes=classes), + imagenet_dir(data_set='val', num_classes=classes), + imagenet_labels(data_set='val', + num_classes=classes))) + lbann.contrib.lc.launcher.run(model, + data_reader_proto, + optimizer, + job_name='lbann_densenet', + **kwargs) + + +# Main function ################################################################ +def main(): + # ---------------------------------- + # Command-line arguments + # ---------------------------------- + args = get_args() + + # ---------------------------------- + # Construct layer graph + # ---------------------------------- + input_node = lbann.Input() + # Start counting cumulative layers at 1. + cumulative_layer_num = 1 + log('Input. 
cumulative_layer_num={n}'.format(n=cumulative_layer_num)) + (probs, labels) = construct_layer_graph( + 121, cumulative_layer_num, input_node) + + # ---------------------------------- + # Setup experiment + # ---------------------------------- + + (model, data_reader_proto, optimizer) = set_up_experiment( + args, input_node, probs, labels) + + # ---------------------------------- + # Run experiment + # ---------------------------------- + # Note: Use `lbann.run` instead for non-LC systems. + + run_experiment(args, model, data_reader_proto, optimizer) + + +if __name__ == '__main__': + main() From 350adbe8596e19eae6a8a401fd9e59ad91f1d1ae Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 21 May 2019 17:12:06 -0700 Subject: [PATCH 035/634] Reduce node allocation queueing --- bamboo/allocate_and_run.sh | 9 ++++++++ bamboo/common_python/tools.py | 8 +++---- .../catalyst/clang4/expected_performance.csv | 4 ++-- .../catalyst/gcc7/expected_performance.csv | 8 +++---- .../pascal/gcc7/expected_performance.csv | 2 +- bamboo/run.sh | 22 +++++++++++++++++++ 6 files changed, 41 insertions(+), 12 deletions(-) create mode 100755 bamboo/allocate_and_run.sh create mode 100755 bamboo/run.sh diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh new file mode 100755 index 00000000000..a2af7dd5a18 --- /dev/null +++ b/bamboo/allocate_and_run.sh @@ -0,0 +1,9 @@ +CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') + +if [ "${CLUSTER}" = 'catalyst' ]; then + salloc -N16 -t 600 ./run.sh +fi + +if [ "${CLUSTER}" = 'pascal' ]; then + salloc -N16 -t 600 ./run.sh +fi diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 7110ddc9a67..044cf6add02 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -77,9 +77,8 @@ def get_command(cluster, if scheduler == 'slurm': # Create allocate command command_allocate = '' - # Allocate a node if we don't have one already - # Running the tests manually allows for already having a node allocated - if os.getenv('SLURM_JOB_NUM_NODES') == None: + # Allocate nodes only if we don't already have an allocation. + if os.getenv('SLURM_JOB_NUM_NODES') is None: command_allocate = 'salloc' option_num_nodes = '' option_partition = '' @@ -122,8 +121,7 @@ def get_command(cluster, elif scheduler == 'lsf': # Create allocate command command_allocate = '' - # Allocate a node if we don't have one already - # Running the tests manually allows for already having a node allocated + # Allocate nodes only if we don't already have an allocation. 
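+        # (LSF exports LSB_HOSTS to processes running inside an existing bsub
+        # allocation, analogous to SLURM_JOB_NUM_NODES in the slurm branch.)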
if os.getenv('LSB_HOSTS') is None: command_allocate = 'bsub' # x => Puts the host running your job into exclusive execution diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv index 32551e8e70b..6d2d59bafb7 100644 --- a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 56.00, 1.20, 5.00, 0.80, 0.40, 0.00 +alexnet_nightly, 117.00, 2.80, 9.00, 1.20, 2.00, 0.00 alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 88.00, 0.12, 0.40, 0.10, 0.09, 98.40 +lenet_mnist, 100.00, 0.12, 0.40, 0.10, 0.09, 98.40 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv index d3ac7caa6b4..654db1a99af 100644 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 57.00, 1.11, 4.80, 0.37, 1.20, 0.00 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 64.00, 0.10, 0.40, 0.08, 0.04, 98.92 +alexnet_nightly, 65.00, 1.50, 8.30, 0.37, 1.70, 0.00 +alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +lenet_mnist, 137.00, 0.18, 0.40, 0.15, 0.04, 98.92 diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv index cca3451efd2..25e04fb92b7 100644 --- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv @@ -2,4 +2,4 @@ Model_name, training_run_time, training_mean, training_max, training_min, t alexnet_nightly, 51.00, 1.20, 4.00, 0.50, 0.40, 0.17 alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 9.00, 0.01, 6.00, 0.01, 0.40, 98.40 +lenet_mnist, 12.00, 0.01, 6.00, 0.01, 0.40, 98.40 diff --git a/bamboo/run.sh b/bamboo/run.sh new file mode 100755 index 00000000000..30d993c9a84 --- /dev/null +++ b/bamboo/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash -l + +echo "Task: Cleaning" +./clean.sh + +echo "Task: Compiler Tests" +cd compiler_tests +module load cmake/3.9.2 +python -m pytest -s --junitxml=results.xml +cd .. + +echo "Task: Integration Tests" +cd integration_tests +python -m pytest -s --junitxml=results.xml +cd .. + +echo "Task: Unit Tests" +cd unit_tests +python -m pytest -s --junitxml=results.xml +cd .. 
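+
+# Each pytest run above wrote a results.xml file, which Bamboo's JUnit
+# parser picks up to render test results in the web UI.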
+ +echo "Task: Finished" From 4c8dbe171e1d87512f292e09905b02504711b0d9 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Wed, 22 May 2019 15:56:32 -0700 Subject: [PATCH 036/634] Script updates for Pascal --- bamboo/allocate_and_run.sh | 1 + .../pascal/gcc7/expected_performance.csv | 2 +- bamboo/run.sh | 14 +++++++++++--- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index a2af7dd5a18..d0018260e7c 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -5,5 +5,6 @@ if [ "${CLUSTER}" = 'catalyst' ]; then fi if [ "${CLUSTER}" = 'pascal' ]; then + export MV2_USE_CUDA=1 salloc -N16 -t 600 ./run.sh fi diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv index 25e04fb92b7..12770f3b9fc 100644 --- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv @@ -2,4 +2,4 @@ Model_name, training_run_time, training_mean, training_max, training_min, t alexnet_nightly, 51.00, 1.20, 4.00, 0.50, 0.40, 0.17 alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 12.00, 0.01, 6.00, 0.01, 0.40, 98.40 +lenet_mnist, 12.00, 0.04, 6.00, 0.01, 0.40, 98.40 diff --git a/bamboo/run.sh b/bamboo/run.sh index 30d993c9a84..614b15235bd 100755 --- a/bamboo/run.sh +++ b/bamboo/run.sh @@ -1,22 +1,30 @@ #!/bin/bash -l +if [ "${CLUSTER}" = 'catalyst' ]; then + PYTHON=python +fi + +if [ "${CLUSTER}" = 'pascal' ]; then + PYTHON=$bamboo_PYTHON_x86_gpu/python +fi + echo "Task: Cleaning" ./clean.sh echo "Task: Compiler Tests" cd compiler_tests module load cmake/3.9.2 -python -m pytest -s --junitxml=results.xml +$PYTHON -m pytest -s --junitxml=results.xml cd .. echo "Task: Integration Tests" cd integration_tests -python -m pytest -s --junitxml=results.xml +$PYTHON -m pytest -s --junitxml=results.xml cd .. echo "Task: Unit Tests" cd unit_tests -python -m pytest -s --junitxml=results.xml +$PYTHON -m pytest -s --junitxml=results.xml cd .. echo "Task: Finished" From 56dda76c0e8d42c1457e2db7df03fff0aedce3ee Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Fri, 24 May 2019 06:04:09 -0700 Subject: [PATCH 037/634] ongoing development. 
---
 .../lbann/data_store/data_store_conduit.hpp | 18 +++++---
 src/data_store/data_store_conduit.cpp       | 42 ++++++++++++++-----
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp
index 9d73e3f8f54..e3a8284416a 100644
--- a/include/lbann/data_store/data_store_conduit.hpp
+++ b/include/lbann/data_store/data_store_conduit.hpp
@@ -91,6 +91,8 @@ class data_store_conduit {
   }
 */
 
+  void preload_local_cache();
+
   void check_mem_capacity(lbann_comm *comm, const std::string sample_list_file, size_t stride, size_t offset);
 
   /// returns the conduit node
@@ -291,14 +293,20 @@ protected :
   /// used in exchange_data_by_sample, when sample sizes are non-uniform
   bool m_have_sample_sizes;
 
-  /// fills in m_image_name_to_index, m_image_sizes, and m_image_offsets
-  void get_image_sizes();
-
-  /// number of bytes in each image
-  std::vector<size_t> m_image_sizes;
+  /// fills in m_image_offsets; returns the segment size (which is the
+  /// sum of the file sizes)
+  int get_image_offsets();
 
   /// offset at which the raw image will be stored in a shared memory segment
   std::vector<size_t> m_image_offsets;
+
+  void allocate_shared_segment(int size);
+
+  std::string m_image_base_dir;
+  std::vector<std::string> m_my_files;
+  std::vector<size_t> m_my_sizes;
+
+  void load_files();
 };
 
 } // namespace lbann

diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp
index 27ee844a08f..ee2e23a3aae 100644
--- a/src/data_store/data_store_conduit.cpp
+++ b/src/data_store/data_store_conduit.cpp
@@ -1010,7 +1010,8 @@ void data_store_conduit::set_preload() {
   m_preload = true;
 }
 
-void data_store_conduit::get_image_sizes() {
+int data_store_conduit::get_image_offsets() {
+  int segment_length = 0;
   options *opts = options::get();
   /// this block fires if image sizes have been precomputed
   if (opts->has_string("image_sizes_filename")) {
@@ -1020,7 +1021,7 @@
   else {
     // get list of image file names
     const std::string image_list_file = m_reader->get_data_filename();
-    const std::string image_dir = m_reader->get_file_dir();
+    m_image_base_dir = m_reader->get_file_dir();
     FILE *fplist = fopen(image_list_file.c_str(), "rt");
     std::vector<std::string> image_file_names;
     int imagelabel;
@@ -1034,8 +1035,6 @@ int data_store_conduit::get_image_offsets() {
     fclose(fplist);
 
     // get sizes of files for which I'm responsible
-    // TODO: should add threading to reduce computation time
-    std::vector<size_t> my_sizes;
    for (size_t h=m_rank_in_trainer; htrainer_all_gather(my_sizes, work, counts, disp);
 
-  // fill in m_image_sizes and m_image_offsets
-  m_image_sizes.resize(image_file_names.size());
+  // fill in m_image_offsets
+  m_image_offsets.resize(image_file_names.size()+1);
+  m_image_offsets[0] = 0;
   for (int rank = 0; rank < m_np_in_trainer; rank++) {
     size_t offset = disp[rank];
     size_t count = counts[rank];
     size_t i = rank;
    for (size_t j=offset; jget_rank_in_node();
+  if (node_id == 0) {
+  }
+  m_comm->barrier(m_comm->get_node_comm());
+}
+
+void data_store_conduit::preload_local_cache() {
+  int segment_size = get_image_offsets();
+  allocate_shared_segment(segment_size);
+  load_files();
+}
+
+void data_store_conduit::load_files() {
+}
 
 } // namespace lbann

From 0de6d6d37520ea8f7fdf9ec9c21590180102d1c9 Mon Sep 17 00:00:00 2001
From: Katie Graham
Date: Fri, 24 May 2019 14:02:57 -0700
Subject: [PATCH 038/634] Fixed typo

---
 docs/index.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/index.rst b/docs/index.rst
index
b07bfae9bb9..988b14b217e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,7 +11,7 @@ LBANN provides model-parallel acceleration through domain decomposition to optimize for strong scaling of network training. It also allows for composition of model-parallelism with both data parallelism and ensemble training methods for training large neural -networks with massive amounts of data. LBANN is able to advantage of +networks with massive amounts of data. LBANN is able to take advantage of tightly-coupled accelerators, low-latency high-bandwidth networking, and high-bandwidth parallel file systems. From ccfead73c0a78fd430daef5997bd0187d25dde16 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Fri, 24 May 2019 14:04:25 -0700 Subject: [PATCH 039/634] Clarified spack user build instructions --- docs/building_lbann.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/building_lbann.rst b/docs/building_lbann.rst index b374170bb07..9ddd24bc478 100644 --- a/docs/building_lbann.rst +++ b/docs/building_lbann.rst @@ -125,7 +125,7 @@ Setup Spack and local base tools package names prepended with a dash, e.g.: :bash:`ml -intel`. To unload all currently loaded modules, use :bash:`ml purge`. -3. Optionally, setup your spack environment to take advantages of +3. Optionally, setup your spack environment to take advantage of locally installed tools. Note that unless your spack environment is explicitly told about tools such as cmake, python, mpi, etc. it will install everything that LBANN and all of its dependencies @@ -165,7 +165,8 @@ Here are three easy ways to install LBANN: cd /spack_environments/users/llnl_lc/_gpu/ # where = x86_64 | ppc64le spack install - ml load lbann + spack env loads + source ./loads - Building with the latest released versions and GPU support (use the user's defaults for specifying the compiler, MPI library, etc.): From b6e7f8a0f8507fd46f4826a0017738d3c0f6dc22 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Fri, 24 May 2019 14:19:29 -0700 Subject: [PATCH 040/634] clarify command; fix directory name --- docs/building_lbann.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/building_lbann.rst b/docs/building_lbann.rst index 9ddd24bc478..88cf1cffd0c 100644 --- a/docs/building_lbann.rst +++ b/docs/building_lbann.rst @@ -104,7 +104,7 @@ Setup Spack and local base tools .. code-block:: bash - . ${SPACK_ROOT}/share/spack/setup-env.sh + source ${SPACK_ROOT}/share/spack/setup-env.sh 2. Setup your compiler and external software environment. For example, @@ -163,7 +163,7 @@ Here are three easy ways to install LBANN: .. 
code-block:: bash - cd /spack_environments/users/llnl_lc/_gpu/ # where = x86_64 | ppc64le + cd /spack_environments/users/llnl_lc/_cuda/ # where = x86_64 | ppc64le spack install spack env loads source ./loads From db8c6d0b7aaac6b872e4f06da54cf5021bce3e92 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Fri, 24 May 2019 14:20:12 -0700 Subject: [PATCH 041/634] move dependency information away from spack build --- docs/build_with_cmake.rst | 79 ++++++++++++++++++++++++++++++++++++++- docs/building_lbann.rst | 76 ------------------------------------- 2 files changed, 78 insertions(+), 77 deletions(-) diff --git a/docs/build_with_cmake.rst b/docs/build_with_cmake.rst index f6f49ebd305..9e092014f3d 100644 --- a/docs/build_with_cmake.rst +++ b/docs/build_with_cmake.rst @@ -8,7 +8,7 @@ Building LBANN with `CMake `_ ================================================== LBANN uses `CMake `_ for its build system and a -version newer than or equal to 3.9.0 is required. LBANN development is +version newer than or equal to 3.12.0 is required. LBANN development is done primarily on UNIX-based platforms. As such, the build is tested regularly on Linux-based machines, occasionally on OSX, and never on Windows machines. @@ -22,6 +22,83 @@ is missing, please `open an issue `_. It is required that LBANN be built out-of-source. That is, CMake must not be invoked in a directory containing a CMakeLists. +-------------------- +Dependencies +-------------------- + +The following packages and tools are required to build LBANN. All +packages listed below may be installed using `Spack +`_. See :ref:`the Spack installation +instructions ` for more details on using Spack to +build a complete LBANN environment. + +The following basic tools are **required**. + ++ A C++11-compliant compiler. + ++ OpenMP, version 3.0 or newer. + ++ An MPI-3.0 implementation. + ++ `CEREAL `_ is used to handle + complex serialization tasks. + ++ `CMake `_, version 3.9 or newer. + +The following LLNL-maintained packages are **required**. + ++ `Hydrogen `_ is a fork of the + `Elemental `_ distributed + dense linear-algebra library and it may be installed via + `Spack `_ using the package name + "hydrogen". If CUDA support is enabled in Hydrogen, LBANN will + inherit this support. + +The following third-party packages are **required**. + ++ `CNPY `_ is used to ingest data + in NumPy format. In principle this should be optional, but at time + of writing, LBANN will not build without it. + ++ `OpenCV `_ is used to preprocess + image data. For performance reasons, it is recommend to build OpenCV + with `JPEG-turbo `_ + for JPEG format support. + ++ `ProtoBuf `_ is used to + express models in a portable format. + +The following LLNL-maintained packages are **optional**. + ++ `Aluminum `_ is a + communication library optimized for machine learning and interaction + with GPUs. We cannot recommend its use strongly enough. It can be + built using `Spack `_. + ++ `CONDUIT `_ is used to ingest + structured data produced by scientific simulations. + +The following third-party packages are **optional**. + ++ `CUDA `_. The development + team currently uses CUDA version 9.2. Building with CUDA support + requires that Hydrogen has been built with CUDA support (see below). + ++ `cuDNN `_ is required if + building LBANN with CUDA support. It is freely available as a binary + distribution from NVIDIA. + ++ `HWLOC `_. HWLOC enables + LBANN to make certain optimizations based on the hardware + topology. Its use is strongly recommended. + ++ NVTX. 
LBANN supports some improved annotations for NVPROF using + NVTX. NVTX is provided as part of the CUDA toolkit. + ++ VTune. LBANN supports some improved annotations for VTune. + + + -------------------- LBANN CMake options -------------------- diff --git a/docs/building_lbann.rst b/docs/building_lbann.rst index 88cf1cffd0c..277f6d781b6 100644 --- a/docs/building_lbann.rst +++ b/docs/building_lbann.rst @@ -12,82 +12,6 @@ Download LBANN source code can be obtained from the `Github repo `_. --------------------- -Dependencies --------------------- - -The following packages and tools are required to build LBANN. All -packages listed below may be installed using `Spack -`_. See :ref:`below -` for more details on using Spack to build a -complete LBANN environment. - -The following basic tools are **required**. - -+ A C++11-compliant compiler. - -+ OpenMP, version 3.0 or newer. - -+ An MPI-3.0 implementation. - -+ `CEREAL `_ is used to handle - complex serialization tasks. - -+ `CMake `_, version 3.9 or newer. - -The following LLNL-maintained packages are **required**. - -+ `Hydrogen `_ is a fork of the - `Elemental `_ distributed - dense linear-algebra library and it may be installed via - `Spack `_ using the package name - "hydrogen". If CUDA support is enabled in Hydrogen, LBANN will - inherit this support. - -The following third-party packages are **required**. - -+ `CNPY `_ is used to ingest data - in NumPy format. In principle this should be optional, but at time - of writing, LBANN will not build without it. - -+ `OpenCV `_ is used to preprocess - image data. For performance reasons, it is recommend to build OpenCV - with `JPEG-turbo `_ - for JPEG format support. - -+ `ProtoBuf `_ is used to - express models in a portable format. - -The following LLNL-maintained packages are **optional**. - -+ `Aluminum `_ is a - communication library optimized for machine learning and interaction - with GPUs. We cannot recommend its use strongly enough. It can be - built using `Spack `_. - -+ `CONDUIT `_ is used to ingest - structured data produced by scientific simulations. - -The following third-party packages are **optional**. - -+ `CUDA `_. The development - team currently uses CUDA version 9.2. Building with CUDA support - requires that Hydrogen has been built with CUDA support (see below). - -+ `cuDNN `_ is required if - building LBANN with CUDA support. It is freely available as a binary - distribution from NVIDIA. - -+ `HWLOC `_. HWLOC enables - LBANN to make certain optimizations based on the hardware - topology. Its use is strongly recommended. - -+ NVTX. LBANN supports some improved annotations for NVPROF using - NVTX. NVTX is provided as part of the CUDA toolkit. - -+ VTune. LBANN supports some improved annotations for VTune. - - .. 
_building-with-spack: ------------------------------------------------------------ From 1a1731c99ee7c7ea199be1e65bad4b48dd800be9 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 20 May 2019 16:44:19 -0700 Subject: [PATCH 042/634] Update Weekly Develop --- bamboo/allocate_and_run.sh | 26 ++++++++++++++++++- bamboo/integration_tests/common_code.py | 1 + .../catalyst/clang4/expected_performance.csv | 4 +-- .../catalyst/gcc7/expected_performance.csv | 8 +++--- bamboo/integration_tests/full_alexnet.sh | 0 .../test_integration_performance.py | 6 ++--- bamboo/run.sh | 26 ++++++++++++++++++- 7 files changed, 60 insertions(+), 11 deletions(-) mode change 100644 => 100755 bamboo/integration_tests/full_alexnet.sh diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index d0018260e7c..300f75a42d8 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -4,7 +4,31 @@ if [ "${CLUSTER}" = 'catalyst' ]; then salloc -N16 -t 600 ./run.sh fi +WEEKLY=0 +while :; do + case ${1} in + --weekly) + # Run all tests. This is a weekly build. + WEEKLY=1 + ;; + -?*) + # Unknown option + echo "Unknown option (${1})" >&2 + exit 1 + ;; + *) + # Break loop if there are no more options + break + esac + shift +done + if [ "${CLUSTER}" = 'pascal' ]; then export MV2_USE_CUDA=1 - salloc -N16 -t 600 ./run.sh + if [ ${WEEKLY} -ne 0 ]; then + salloc -N16 -t 600 ./run.sh --weekly + else + salloc -N16 -t 600 ./run.sh + fi + fi diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index 0d0a4dda68e..915289adedd 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -9,6 +9,7 @@ def get_command(cluster, dir_name, model_folder, model_name, executable, output_file_name, error_file_name, compiler_name, weekly=False): if model_name in ['alexnet', 'conv_autoencoder_imagenet']: data_reader_percent = 0.01 + # If doing weekly testing, increase data_reader_percent if weekly: data_reader_percent = 0.10 command = tools.get_command( diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv index 6d2d59bafb7..d7db441ade5 100644 --- a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 117.00, 2.80, 9.00, 1.20, 2.00, 0.00 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +alexnet_nightly, 117.00, 2.80, 9.00, 1.20, 2.00, 100.00 +alexnet_weekly, 490.00, 1.00, 3.00, 0.60, 0.50, 2.00 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 lenet_mnist, 100.00, 0.12, 0.40, 0.10, 0.09, 98.40 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv index 654db1a99af..6b4eee1703b 100644 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 65.00, 1.50, 8.30, 0.37, 1.70, 0.00 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 
137.00, 0.18, 0.40, 0.15, 0.04, 98.92 +alexnet_nightly, 65.00, 1.50, 8.30, 0.37, 1.70, 100.00 +alexnet_weekly, 360.00, 0.90, 4.00, 0.40, 0.70, 3.00 +cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +lenet_mnist, 137.00, 0.18, 0.40, 0.15, 0.04, 98.92 diff --git a/bamboo/integration_tests/full_alexnet.sh b/bamboo/integration_tests/full_alexnet.sh old mode 100644 new mode 100755 diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index fef3a0d267c..6d488a2e316 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -154,10 +154,10 @@ def skeleton_performance_full_alexnet(cluster, dir_name, executables, should_log = True output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - if cluster in ['catalyst', 'pascal', 'surface']: + if cluster in ['catalyst', 'surface']: command = 'salloc --nodes 128 %s/bamboo/integration_tests/%s.sh > %s 2> %s' % (dir_name, model_name, output_file_name, error_file_name) - elif cluster == 'ray': - e = 'skeleton_performance_full_alexnet: Ray is unsupported for skeleton_performance_full_alexnet' + elif cluster in ['pascal', 'ray']: + e = 'skeleton_performance_full_alexnet: Pascal, Ray are unsupported for skeleton_performance_full_alexnet' print('Skip - ' + e) pytest.skip(e) else: diff --git a/bamboo/run.sh b/bamboo/run.sh index 614b15235bd..6859ff129c5 100755 --- a/bamboo/run.sh +++ b/bamboo/run.sh @@ -8,6 +8,25 @@ if [ "${CLUSTER}" = 'pascal' ]; then PYTHON=$bamboo_PYTHON_x86_gpu/python fi +WEEKLY=0 +while :; do + case ${1} in + --weekly) + # Run all tests. This is a weekly build. + WEEKLY=1 + ;; + -?*) + # Unknown option + echo "Unknown option (${1})" >&2 + exit 1 + ;; + *) + # Break loop if there are no more options + break + esac + shift +done + echo "Task: Cleaning" ./clean.sh @@ -19,7 +38,12 @@ cd .. echo "Task: Integration Tests" cd integration_tests -$PYTHON -m pytest -s --junitxml=results.xml +if [ ${WEEKLY} -ne 0 ]; then + $PYTHON -m pytest -s --weekly --junitxml=results.xml +else + $PYTHON -m pytest -s --junitxml=results.xml +fi + cd .. echo "Task: Unit Tests" From 779c7d679edd0307eebd006efa42250a0aa5f481 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Sun, 26 May 2019 07:24:30 -0700 Subject: [PATCH 043/634] modified to work with hydra data --- model_zoo/jag_utils/build_index.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/model_zoo/jag_utils/build_index.cpp b/model_zoo/jag_utils/build_index.cpp index 9af7a2f6d6e..1416c6fbe1e 100644 --- a/model_zoo/jag_utils/build_index.cpp +++ b/model_zoo/jag_utils/build_index.cpp @@ -128,6 +128,16 @@ if (j >= 400) break; conduit::Node n_ok; for (size_t h=0; h Date: Mon, 27 May 2019 06:36:15 -0700 Subject: [PATCH 044/634] modified build_sample_id_mapping to work with hydra --- .../jag_utils/build_sample_id_mapping.cpp | 52 +++++++++++++++++-- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/model_zoo/jag_utils/build_sample_id_mapping.cpp b/model_zoo/jag_utils/build_sample_id_mapping.cpp index 3814ef676d1..24bbdb72964 100644 --- a/model_zoo/jag_utils/build_sample_id_mapping.cpp +++ b/model_zoo/jag_utils/build_sample_id_mapping.cpp @@ -30,15 +30,21 @@ int main(int argc, char **argv) { opts->init(argc, argv); // sanity check the cmd line - if (argc != 2) { + if (argc < 2) { if (master) { - cerr << "\nusage: " << argv[0] << " --base_dir=\n" + cerr << "\nusage: " << argv[0] << " --base_dir= [--hydra]\n" << "assumes: the file '/index.txt' exists\n" - << "output: writes the file /id_mapping.txt\n\n"; + << "output: writes the file /id_mapping.txt\n" + << "hydra: you must include --hydra when building a mapping for\n" + << " hydra conduit nodes, else the output file will be\n" + << " meaningless, and will result in undefined behavior."; + } return(0); } + bool hydra = opts->get_bool("hydra"); + // get list of conduit filenames if (master) cerr << "reading filelist\n"; vector filenames; @@ -50,7 +56,10 @@ int main(int argc, char **argv) { sprintf(b, "%s/index.txt", base_dir.c_str()); std::string fn; std::ifstream in(b); - if (!in) LBANN_ERROR("can't open file for writing"); + if (!in) { + std::string fn2(b); + LBANN_ERROR("can't open file for reading: " + fn2); + } std::string line; getline(in, line); getline(in, line); @@ -72,6 +81,7 @@ int main(int argc, char **argv) { // each proc builds a map: sample_id -> local index, for the // conduit files for which it's responsible size_t q = 0; + conduit::Node n_ok; if (master) cerr << "building map\n"; for (size_t j=rank; j cnames; conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", cnames); for (size_t h=0; h Date: Tue, 28 May 2019 09:49:14 -0700 Subject: [PATCH 045/634] Instead of using, e.g, 2dd3c515-7c3a-11e9-9101-0894ef80059f/runno/run0001, as the sample_id, use 2dd3c515-7c3a-11e9-9101-0894ef80059f, since these are unique (wrt the hydra samples we have so far), and this makes processing the sample lists similar to what we do for JAG --- .../jag_utils/build_sample_id_mapping.cpp | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/model_zoo/jag_utils/build_sample_id_mapping.cpp b/model_zoo/jag_utils/build_sample_id_mapping.cpp index 24bbdb72964..e1bd52cdf54 100644 --- a/model_zoo/jag_utils/build_sample_id_mapping.cpp +++ b/model_zoo/jag_utils/build_sample_id_mapping.cpp @@ -43,7 +43,10 @@ int main(int argc, char **argv) { return(0); } - bool hydra = opts->get_bool("hydra"); +std::unordered_set names; +int total = 0; + + //bool hydra = opts->get_bool("hydra"); // get list of conduit filenames if (master) cerr << "reading filelist\n"; @@ -92,6 +95,13 @@ int main(int argc, char **argv) { std::vector cnames; conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", 
cnames); for (size_t h=0; h Date: Tue, 28 May 2019 10:45:31 -0700 Subject: [PATCH 046/634] Update docs/build_with_cmake.rst Co-Authored-By: Tim Moon --- docs/build_with_cmake.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/build_with_cmake.rst b/docs/build_with_cmake.rst index 9e092014f3d..c6722fa956a 100644 --- a/docs/build_with_cmake.rst +++ b/docs/build_with_cmake.rst @@ -43,7 +43,7 @@ The following basic tools are **required**. + `CEREAL `_ is used to handle complex serialization tasks. -+ `CMake `_, version 3.9 or newer. ++ `CMake `_, version 3.12 or newer. The following LLNL-maintained packages are **required**. From d5b7479d630d7e352dbddedee63d1a1ef12d1132 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 28 May 2019 13:49:19 -0700 Subject: [PATCH 047/634] Update Weekly Develop --- bamboo/allocate_and_run.sh | 19 +++++++++++-------- bamboo/run.sh | 9 +++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index 300f75a42d8..58ba329bff1 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -1,14 +1,14 @@ CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') -if [ "${CLUSTER}" = 'catalyst' ]; then - salloc -N16 -t 600 ./run.sh -fi +echo "allocate_and_run.sh CLUSTER=" +echo $CLUSTER WEEKLY=0 while :; do case ${1} in --weekly) # Run all tests. This is a weekly build. + echo "Setting WEEKLY in allocate_and_run.sh" WEEKLY=1 ;; -?*) @@ -23,12 +23,15 @@ while :; do shift done +echo "allocate_and_run.sh WEEKLY=" +echo $WEEKLY + if [ "${CLUSTER}" = 'pascal' ]; then export MV2_USE_CUDA=1 - if [ ${WEEKLY} -ne 0 ]; then - salloc -N16 -t 600 ./run.sh --weekly - else - salloc -N16 -t 600 ./run.sh - fi +fi +if [ ${WEEKLY} -ne 0 ]; then + salloc -N16 -t 600 ./run.sh --weekly +else + salloc -N16 -t 600 ./run.sh fi diff --git a/bamboo/run.sh b/bamboo/run.sh index 6859ff129c5..234e950137d 100755 --- a/bamboo/run.sh +++ b/bamboo/run.sh @@ -1,5 +1,10 @@ #!/bin/bash -l +CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') + +echo "run.sh CLUSTER=" +echo $CLUSTER + if [ "${CLUSTER}" = 'catalyst' ]; then PYTHON=python fi @@ -13,6 +18,7 @@ while :; do case ${1} in --weekly) # Run all tests. This is a weekly build. + echo "Setting WEEKLY in run.sh" WEEKLY=1 ;; -?*) @@ -27,6 +33,9 @@ while :; do shift done +echo "run.sh WEEKLY=" +echo $WEEKLY + echo "Task: Cleaning" ./clean.sh From b397e2c77026cbb83b003289fa4ada57275f7bc8 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 24 May 2019 08:46:10 -0700 Subject: [PATCH 048/634] Move testing docs to Read-the-Docs --- bamboo/README.md | 95 +------------- docs/continuous_integration.rst | 224 ++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 3 files changed, 227 insertions(+), 93 deletions(-) create mode 100644 docs/continuous_integration.rst diff --git a/bamboo/README.md b/bamboo/README.md index c317c496379..763df1443a0 100644 --- a/bamboo/README.md +++ b/bamboo/README.md @@ -1,93 +1,2 @@ -# LBANN CI - -Bamboo is the continuous integration (CI) framework we use. A Bamboo plan consists of stages (which run sequentially), which consist of jobs (which run in parallel), which consist of tasks (which run sequentially). - -The LBANN build project has many plans. Two plans run off of [`LLNL/lbann/develop`](https://github.com/LLNL/lbann/tree/develop "https://github.com/LLNL/lbann/tree/develop") - Nightly Develop and Weekly Develop. Nightly Develop runs every night (except Saturday) at midnight. 
Weekly Develop runs every Saturday at midnight. The other plans in the build project are for each individual LBANN developer's fork of LBANN. - -All plans run off the latest *pushed* commits to the repository. That means if you have local commits that you have not pushed to your fork, these commits will *not* be tested by Bamboo. If you have pushed commits to your fork but have not merged your branch into the main repository's `develop`, your commits will be tested on your individual plan, but not on Nightly Develop or Weekly Develop. - -## Plan Configuration -Each plan is identical (except Weekly Develop, which will be explained below). The plans consist of a single stage `Tests`. The stage consists of three jobs - `ppc64le_gpu`, `x86_cpu`, and `x86_gpu`. Each of these three jobs can run in parallel. They consist of an identical list of tasks: -1. Checkout Default Repository (checkout the repository) -2. Remove Generated Files (each build creates a large number of files. We may look at these files between builds, so we cannot delete them at the end of a build. So, instead we delete them before doing any real work in the next build. This also ensures the generated files came from the latest build and not a previous build). -3. Compiler Tests (run tests in `bamboo/compiler_tests`) -4. Integration Tests (run tests in `bamboo/integration_tests`) -5. Unit Tests (run tests in `bamboo/unit_tests`) -6. JUnit Parser (this allows Bamboo to render test results in a nice UI) - -The three testing tasks differ somewhat between jobs. However, they all execute some variant of `python -m pytest -s --junitxml=results.xml`, which will run all the pytests in the job's associated directory. - -Weekly Develop adds the `--weekly` option (`python -m pytest -s --weekly --junitxml=results.xml`). Many (mostly longer-running) tests are set to not run unless this option is on. Weekly Develop runs a superset of the tests that Nightly Develop runs. - -## Directory Structure - -`bamboo/compiler_tests`, `bamboo/integration_tests`, `bamboo/unit_tests` each have a `conftest.py` that pytest requires. They also contain one or more python files. Each of these files have a number of tests to run. - -## Writing Your Own Tests - -A side effect of our Bamboo setup is that tests must be written using pytest. Test files must begin with `test_` to be recognized by pytest. Individual test methods must also begin with `test_`. Test methods should use the `assert` keyword. A test will only fail if the assertion turns out to be false. Not putting an assertion will automatically cause the test to pass. - -How then to test non-Python code? You can just wrap your test with Python. A test can be as simple as asserting the output of a shell command is 0. The output of a command can be found using Python's `os.system()`. - -## Running Tests On Your Individual Plan - -Unlike Nightly Develop, the individual plans are triggered to run by polling your fork for commits. They do not run nightly. If you push new commits to your fork, a new build should start automatically. You can also manually start a build by navigating to your individual plan and clicking Run > Run Plan. Once again, keep in mind that the tests will run off what has been pushed to your GitHub fork of LBANN and not your local copy of the LBANN repository. - -## Navigating Bamboo - -From the [LBANN Project Summary](https://lc.llnl.gov/bamboo/browse/LBANN "https://lc.llnl.gov/bamboo/browse/LBANN"), click on a build project. 
From there, click on a build (builds are listed under "Recent History" and can also be accessed from the pass/fail marks in the top right, to the left of the "Run" button). This will bring you to a certain build's page. The most relevant tabs are "Tests" and "Logs". It is recommended to look at failures first in the "Tests" tab, as the build logs can be difficult to parse through. The build's "Tests" tab shows "New test failures", "Existing test failures", "Fixed tests", and "Skipped Tests". - -From the build's page, you can also click on individual jobs, which have the same tabs. The "Tests" tabs of the individual jobs have two sub-tabs, "Failed tests" and "Successful tests". They do not display skipped tests. The Bamboo agent that ran the job can be found by looking at the "Agent" field under the "Job Summary" tab. Alternatively, you can determine the agent from one of the first lines in the build logs: `Build working directory is /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir/`. - -Some build logs can be very large (e.g. over 100,000 lines). Beyond about 5,000 lines it is a good idea to download a log instead of viewing it in the browser. Beyond about 10,000 lines, some text editors may experience slowness. At this point it is good to split up the files with `split -l 10000 `, which creates files of the form `x*` and of length 10,000. You can then run a command such as `grep -in "Errors for:" x*` to find which files have reported errors. After you are done, you can remove the files with `rm x*`. Note that the original log file is not modified by any of these steps. - -As an alternative to splitting the file, errors can be searched for with `grep -in -A "Errors for:" `. - -## Bamboo Agent Properties - -Bamboo agent properties are used to specify requirements for each job. - -| Agents (jobs) | `agent_owner` | `architecture` | `cluster` | `gpu_architecture` | `sys_type` | -| --- | --- | --- | --- | --- | --- | -| Catalyst Agents (x86_cpu) | `lbannusr` | `x86_64` | `catalyst` | `none` | `toss_3_x86_64_ib` | -| Pascal Agents (x86_gpu_pascal) | `lbannusr` | `x86_64` | `pascal` | `pascal` | `chaos_6_x86_64_ib` | -| Quartz Agents (x86_cpu) | `lbannusr` | `x86_64` | `quartz` | `none` | `toss_3_x86_64_ib` | -| Ray Agents (ppc64le_gpu) | `lbannusr` | `ppc64_le` | `ray` | `pascal` | `blueos_3_ppc64le_ib` | -| Surface Agents (x86_gpu) | `lbannusr` | `x86_64` | `surface` | `kepler` | `chaos_5_x86_64_ib` | - -Currently, `agent_owner`, `architecture`, and `gpu_architecture` are used to determine agents to run a job. - -# Running Tests From The Command Line - -Navigate to `bamboo/compiler_tests`, `bamboo/integration_tests`, or `bamboo/unit_tests`. - -To run all the tests in a subdirectory: `python -m pytest -s --weekly`. Note that running all tests can take a substantial amount of time. - -To run the tests that Nightly Develop or the individual plans run in a subdirectory: `python -m pytest -s`. - -To run a specific test file: `python -m pytest -s .py`. - -To run a specific test: `python -m pytest -s .py -k ''`. - -Most integration and unit tests allow for running a test with a different executable. The convention is to have a similarly structured test replacing `_` with `_exe`. These tests are set to be skipped in Bamboo, but can be run locally. There should be a line above the test that gives the command to run the test locally, likely in the following form: `python -m pytest -s .py -k '' --exe=`. - -At this time, there is no way to run all the `_exe` tests in a subdirectory and only those. 
- -# Helpful Files - -First, run `sudo lbannusr`. - -To look at output and error from previous builds: `cd /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir//bamboo//` - -To look at archived results from previous builds: `cd /usr/workspace/wsb/lbannusr/archives/` - -To look at Bamboo agent properties: `cat /usr/global/tools/bamboo/agents/lbannusr//bin/bamboo-capabilities.properties` - -You can copy these files over to your own machine as follows: -- `sudo lbannusr` -- `give ` -- `exit` - to go back to your own LC account, not `lbannusr`'s. -- `take lbannusr` - now the file exists on your LC account, but not yet on your own machine. - -From your own machine, not a ssh terminal: -- `scp @.llnl.gov: .` +Refer to `lbann/docs/continuous_integration.rst` +or "LBANN CI" on the [LBANN docs](http://software.llnl.gov/lbann/). diff --git a/docs/continuous_integration.rst b/docs/continuous_integration.rst new file mode 100644 index 00000000000..363b9aa8f9c --- /dev/null +++ b/docs/continuous_integration.rst @@ -0,0 +1,224 @@ +.. role:: bash(code) + :language: bash + +.. role:: python(code) + :language: python + +LBANN CI +==================== + +Bamboo is the continuous integration (CI) framework we use. +A Bamboo plan consists of stages (which run sequentially), +which consist of jobs (which run in parallel), +which consist of tasks (which run sequentially). + +The LBANN build project has many plans. +Two plans run off of `LLNL/lbann/develop `_ +- Nightly Develop and Weekly Develop. +Nightly Develop runs every night (except Saturday) at midnight. +Weekly Develop runs every Saturday at midnight. +The other plans in the build project are for each individual LBANN developer's +fork of LBANN. + +All plans run off the latest *pushed* commits to the repository. +That means if you have local commits that you have not pushed to your fork, +these commits will *not* be tested by Bamboo. +If you have pushed commits to your fork but have not merged your branch into +the main repository's "develop", +your commits will be tested on your individual plan, +but not on Nightly Develop or Weekly Develop. + +Plan Configuration +---------------------------------------- +Each plan is identical (except Weekly Develop, which will be explained below). +The plans consist of a single stage "Tests". +The stage consists of two jobs - "x86_cpu" (Catalyst), and "x86_gpu" (Pascal). +Each of these jobs can run in parallel. +They consist of an identical list of tasks: + +1. Checkout Default Repository (checkout the repository) + +2. Run :bash:`./allocate_and_run.sh`; + Weekly Develop adds the :bash:`--weekly` option. + This script allocates nodes and then runs "run.sh" which does the following: + + a. Remove Generated Files (each build creates a large number of files. + We may look at these files between builds, + so we cannot delete them at the end of a build. + So, instead we delete them before doing any real work in the next build. + This also ensures the generated files came from the latest build and not + a previous build). + + b. Compiler Tests (run tests in "bamboo/compiler_tests") + + c. Integration Tests (run tests in "bamboo/integration_tests") + + d. Unit Tests (run tests in "bamboo/unit_tests") + +3. JUnit Parser (this allows Bamboo to render test results in a nice UI) + + +The tests in Task 2 run +:bash:`$PYTHON -m pytest -s [--weekly] --junitxml=results.xml`, +which will run all the pytests in the job's associated directory. +Note that :bash:`$PYTHON` refers to the Python build to use. 
+Also note that only Weekly Develop adds the :bash:`--weekly` option.
+Many (mostly longer-running) tests are set to not run unless this option is on.
+Weekly Develop runs a superset of the tests that Nightly Develop runs.
+
+Directory Structure
+----------------------------------------
+
+"bamboo/compiler_tests", "bamboo/integration_tests", "bamboo/unit_tests" each
+have a "conftest.py" that pytest requires.
+They also contain one or more Python files.
+Each of these files has a number of tests to run.
+
+Writing Your Own Tests
+----------------------------------------
+
+A side effect of our Bamboo setup is that tests must be written using pytest.
+Test files must begin with :bash:`test_` to be recognized by pytest.
+Individual test methods must also begin with :python:`test_`.
+Test methods should use the :python:`assert` keyword.
+A test will only fail if the assertion turns out to be false.
+Not putting an assertion will automatically cause the test to pass.
+
+How then to test non-Python code?
+You can just wrap your test with Python.
+A test can be as simple as asserting the output code of a shell command is 0.
+The output code of a command can be found using Python's :python:`os.system()`.
+
+Running Tests On Your Individual Plan
+----------------------------------------
+
+Unlike Nightly Develop, the individual plans are triggered to run by polling
+your fork for commits.
+They do not run nightly.
+If you push new commits to your fork, a new build should start automatically.
+You can also manually start a build by navigating to your individual plan and
+clicking Run > Run Plan.
+Once again, keep in mind that the tests will run off what has been pushed to
+your GitHub fork of LBANN and not your local copy of the LBANN repository.
+
+Navigating Bamboo
+----------------------------------------
+
+From the `LBANN Project Summary `_,
+click on a plan.
+From there, click on a build (builds are listed under "Recent History" and can
+also be accessed from the pass/fail marks in the top right,
+to the left of the "Run" button).
+This will bring you to a certain build's page.
+The most relevant tabs are "Tests" and "Logs".
+It is recommended to look at failures first in the "Tests" tab,
+as the build logs can be difficult to parse through.
+The build's "Tests" tab shows "New test failures", "Existing test failures",
+"Fixed tests", and "Skipped Tests".
+
+From the build's page, you can also click on individual jobs,
+which have the same tabs.
+The "Tests" tabs of the individual jobs have two sub-tabs,
+"Failed tests" and "Successful tests".
+They do not display skipped tests.
+The Bamboo agent that ran the job can be found by looking at the "Agent" field
+under the "Job Summary" tab.
+Alternatively, you can determine the agent from one of the first lines in the
+build logs:
+"Build working directory is /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir/".
+
+Some build logs can be very large (e.g. over 100,000 lines).
+Beyond about 5,000 lines it is a good idea to download a log instead of
+viewing it in the browser.
+Beyond about 10,000 lines, some text editors may experience slowness.
+At this point it is good to split up the files with
+:bash:`split -l 10000 <log file>`, which creates files of the form `x*`,
+each 10,000 lines long.
+You can then run a command such as :bash:`grep -in "Errors for:" x*` to find
+which files have reported errors.
+After you are done, you can remove the files with :bash:`rm x*`.
+Note that the original log file is not modified by any of these steps.
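+
+For example, a complete triage pass might look like the following sketch
+(here :bash:`build.log` is just a placeholder name for the downloaded log):
+
+.. code-block:: bash
+
+   # split the log into 10,000-line pieces named xaa, xab, ...
+   split -l 10000 build.log
+
+   # find which pieces contain reported errors
+   grep -in "Errors for:" x*
+
+   # remove the pieces when done; build.log itself is untouched
+   rm x*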
+
+As an alternative to splitting the file,
+errors can be searched for with
+:bash:`grep -in -A <num lines> "Errors for:" <log file>`.
+
+Bamboo Agent Properties
+----------------------------------------
+
+Bamboo agent properties are used to specify requirements for each job.
+
++--------------------------------+-------------+--------------+----------+------------------+---------------------+
+| Agents (jobs)                  | agent_owner | architecture | cluster  | gpu_architecture | sys_type            |
++================================+=============+==============+==========+==================+=====================+
+| Catalyst Agents (x86_cpu)      | lbannusr    | x86_64       | catalyst | none             | toss_3_x86_64_ib    |
++--------------------------------+-------------+--------------+----------+------------------+---------------------+
+| Pascal Agents (x86_gpu_pascal) | lbannusr    | x86_64       | pascal   | pascal           | chaos_6_x86_64_ib   |
++--------------------------------+-------------+--------------+----------+------------------+---------------------+
+| Quartz Agents (x86_cpu)        | lbannusr    | x86_64       | quartz   | none             | toss_3_x86_64_ib    |
++--------------------------------+-------------+--------------+----------+------------------+---------------------+
+| Ray Agents (ppc64le_gpu)       | lbannusr    | ppc64_le     | ray      | pascal           | blueos_3_ppc64le_ib |
++--------------------------------+-------------+--------------+----------+------------------+---------------------+
+| Surface Agents (x86_gpu)       | lbannusr    | x86_64       | surface  | kepler           | chaos_5_x86_64_ib   |
++--------------------------------+-------------+--------------+----------+------------------+---------------------+
+
+Currently, "agent_owner", "architecture", and "gpu_architecture" are used to
+determine agents to run a job.
+
+Running Tests From The Command Line
+----------------------------------------
+
+Navigate to "bamboo/compiler_tests", "bamboo/integration_tests",
+or "bamboo/unit_tests".
+
+To run all the tests in a subdirectory: :bash:`python -m pytest -s --weekly`.
+Note that running all tests can take a substantial amount of time.
+
+To run the tests that Nightly Develop or the individual plans run in a
+subdirectory: :bash:`python -m pytest -s`.
+
+To run a specific test file: :bash:`python -m pytest -s <test file>.py`.
+
+To run a specific test:
+:bash:`python -m pytest -s <test file>.py -k '<test name>'`.
+
+Most integration and unit tests allow for running a test with a different
+executable.
+The convention is to have a similarly structured test replacing
+:python:`_` with :python:`_exe`.
+These tests are set to be skipped in Bamboo, but can be run locally.
+There should be a line above the test that gives the command to run the test
+locally, likely in the following form:
+:bash:`python -m pytest -s <test file>.py -k '<test name>' --exe=<executable>`.
+
+At this time, there is no way to run all the :python:`_exe` tests in a
+subdirectory and only those.
+
+Helpful Files
+----------------------------------------
+
+First, run :bash:`sudo lbannusr`.
+
+To look at output and error from previous builds:
+:bash:`cd /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir//bamboo//`
+
+To look at archived results from previous builds:
+:bash:`cd /usr/workspace/wsb/lbannusr/archives/`
+
+To look at Bamboo agent properties:
+:bash:`cat /usr/global/tools/bamboo/agents/lbannusr//bin/bamboo-capabilities.properties`
+
+You can copy these files over to your own machine as follows:
+
+- :bash:`sudo lbannusr`
+
+- :bash:`give <your username> <file>`
+
+- :bash:`exit` - to go back to your own LC account, not lbannusr's.
+
+- :bash:`take lbannusr` - now the file exists on your LC account,
+  but not yet on your own machine.
+
+From your own machine, not an ssh terminal:
+
+- :bash:`scp <your username>@<machine>.llnl.gov:<path to file> .`
diff --git a/docs/index.rst b/docs/index.rst
index b07bfae9bb9..fb68ce421c6 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -42,6 +42,7 @@ methods.
 
    lbann/lbann
    style_guide
+   continuous_integration
 
 ==================

From 07b66a13849f9133ef8dda59fe7b97fa412177ec Mon Sep 17 00:00:00 2001
From: Ryan Forsyth
Date: Wed, 29 May 2019 16:14:19 -0700
Subject: [PATCH 049/634] Meta documentation: docs about building docs

---
 docs/documentation_building.rst | 56 +++++++++++++++++++++++++++++++++
 docs/index.rst                  |  1 +
 2 files changed, 57 insertions(+)
 create mode 100644 docs/documentation_building.rst

diff --git a/docs/documentation_building.rst b/docs/documentation_building.rst
new file mode 100644
index 00000000000..98fe792e89c
--- /dev/null
+++ b/docs/documentation_building.rst
@@ -0,0 +1,56 @@
+.. role:: bash(code)
+   :language: bash
+
+LBANN Documentation Building
+============================
+
+.. warning:: Some of the directions in this section are Mac-specific.
+
+Adding Documentation Outside Code
+----------------------------------
+
+1. Create a file such as "new_docs.rst" in "lbann/docs".
+
+2. Add "new_docs" (no ".rst") to the appropriate documentation block in
+   "lbann/docs/index.rst".
+
+3. Look at the other ".rst" files in "lbann/docs" to see how to get
+   certain formatting.
+
+4. When you want to see how your docs look, you have a couple of options:
+
+   a. Push your docs to your fork/branch on GitHub and look at how
+      the text renders. This is a very simplified look compared to
+      Read-the-Docs.
+
+   b. From "lbann/docs" run :bash:`make html` and then
+      :bash:`open -a <browser> _build/html/index.html`.
+      This is exactly how the docs will look.
+
+5. Merge your code into "lbann/develop" and then have someone with
+   correct permissions on Read-the-Docs update the
+   `official docs `_.
+
+Making The Build Work
+----------------------------------
+
+In order to make :bash:`make html` work, you may need to do a few steps:
+
+1. Run :bash:`pip3 install sphinx breathe sphinx-rtd-theme`.
+
+2. Download Doxygen by going to the
+   `Doxygen downloads page `_,
+   downloading "Doxygen-1.8.15.dmg", and
+   dragging the app to the "Applications" folder.
+
+3. Determine the directory Doxygen is in by running :bash:`which doxygen`.
+   If nothing is returned, see if `doxygen` is in
+   "/Applications/Doxygen.app/Contents/Resources" or
+   "/Applications/Doxygen.app/Contents/MacOS".
+
+4. Add Doxygen to your path with
+   :bash:`PATH="<doxygen directory>:${PATH}"`.
+   You may want to add this to your "~/.bash_profile" so your :bash:`PATH` is
+   always correct. Run :bash:`source ~/.bash_profile` to apply the change.
+
+5. Try running :bash:`make html` again.
diff --git a/docs/index.rst b/docs/index.rst
index fb68ce421c6..fd27462c6c0 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -43,6 +43,7 @@ methods.
lbann/lbann style_guide continuous_integration + documentation_building ================== From f135e8fdab2c115e3283e1cfb5fd6fc733e182b2 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Thu, 30 May 2019 12:38:55 -0700 Subject: [PATCH 050/634] save topk models --- .../callbacks/callback_save_topk_models.hpp | 58 +++++++++++++ include/lbann/lbann.hpp | 1 + src/callbacks/CMakeLists.txt | 1 + src/callbacks/callback_save_topk_models.cpp | 87 +++++++++++++++++++ src/proto/factories/callback_factory.cpp | 8 ++ src/proto/lbann.proto | 8 ++ 6 files changed, 163 insertions(+) create mode 100644 include/lbann/callbacks/callback_save_topk_models.hpp create mode 100644 src/callbacks/callback_save_topk_models.cpp diff --git a/include/lbann/callbacks/callback_save_topk_models.hpp b/include/lbann/callbacks/callback_save_topk_models.hpp new file mode 100644 index 00000000000..40e71e442a8 --- /dev/null +++ b/include/lbann/callbacks/callback_save_topk_models.hpp @@ -0,0 +1,58 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// lbann_callback_save_topk_models .hpp .cpp - Callback to save top k models +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED + +#include "lbann/callbacks/callback_save_model.hpp" + +namespace lbann { + +/** Periodically save_topk_models computational results. 
+ */ +class lbann_callback_save_topk_models : public lbann_callback_save_model { + public: + lbann_callback_save_topk_models(std::string dir, int k, std::string metric_name, bool ascending_ordering=false) : + lbann_callback_save_model(dir,true), m_k(k),m_metric_name(metric_name),m_ascending_ordering(ascending_ordering) {} + lbann_callback_save_topk_models(const lbann_callback_save_topk_models&) = default; + lbann_callback_save_topk_models& operator=(const lbann_callback_save_topk_models&) = default; + lbann_callback_save_topk_models* copy() const override { return new lbann_callback_save_topk_models(*this); } + void on_test_end(model *m) override; + std::string name() const override { return "save_topk_models"; } + + private: + void compute_stats_save_models(model *m); + int m_k ; //number of models to save + std::string m_metric_name; //evaluation metric + bool m_ascending_ordering; //ordering for the top k + +}; + +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index d537f634c62..3a5bd27ef03 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -152,6 +152,7 @@ #include "lbann/callbacks/callback_ltfb.hpp" #include "lbann/callbacks/callback_save_images.hpp" #include "lbann/callbacks/callback_save_model.hpp" +#include "lbann/callbacks/callback_save_topk_models.hpp" #include "lbann/callbacks/profiler.hpp" #include "lbann/callbacks/callback_hang.hpp" #include "lbann/callbacks/callback_variable_minibatch.hpp" diff --git a/src/callbacks/CMakeLists.txt b/src/callbacks/CMakeLists.txt index b38b1c11107..b9baa45839d 100644 --- a/src/callbacks/CMakeLists.txt +++ b/src/callbacks/CMakeLists.txt @@ -34,6 +34,7 @@ set_full_path(THIS_DIR_SOURCES callback_replace_weights.cpp callback_gpu_memory_usage.cpp callback_perturb_dropout.cpp + callback_save_topk_models.cpp ) # Propagate the files up the tree diff --git a/src/callbacks/callback_save_topk_models.cpp b/src/callbacks/callback_save_topk_models.cpp new file mode 100644 index 00000000000..fe258c6e4a3 --- /dev/null +++ b/src/callbacks/callback_save_topk_models.cpp @@ -0,0 +1,87 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+// +// lbann_callback_save_topk_models .hpp .cpp - Callback hooks to save_topk_models information +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "lbann/callbacks/callback_save_topk_models.hpp" + +namespace lbann { + +void lbann_callback_save_topk_models::on_test_end(model *m) { + if(m->get_comm()->am_trainer_master()) { + compute_stats_save_models(m); + } +} + +void lbann_callback_save_topk_models::compute_stats_save_models(model *m) { + lbann_comm *comm = m->get_comm(); + const int num_trainers = comm->get_num_trainers(); + std::string mode_string = "test"; + bool found_metric = false; + EvalType score = 0; + for (const auto& met : m->get_metrics()) { + if (met->name() == m_metric_name) { + found_metric = true; + score = met->get_mean_value(m->get_execution_mode()); + break; + } + } + if (!found_metric) { + std::stringstream err; + err << "could not find metric \"" << m_metric_name << "\"" + << "in model \"" << m->get_name() << "\""; + LBANN_ERROR(err.str()); + } + + std::vector score_list(comm->get_num_trainers()); + //void all_gather(T &src, std::vector &data, const El::mpi::Comm& c) { + comm->all_gather(score, score_list,comm->get_world_comm()); + std::vector score_v = score_list; + //top-k in an ascending order + if(m_ascending_ordering) std::sort(score_v.begin(), score_v.end(),std::less()); + //top-k in an descending order + else std::sort(score_v.begin(), score_v.end(),std::greater()); + score_v.resize(m_k); + + std::cout << "Trainer score_v size m_k " << comm->get_trainer_rank() << score_v.size() << m_k << std::endl; + + if (comm->am_world_master()) { + std::cout << "Top " << m_k << " " << m_metric_name << " average " + << std::accumulate(score_v.begin(), score_v.end(), EvalType(0))/m_k; + } + for(int i =0; i < num_trainers; ++i) { + if(std::find(score_v.begin(), score_v.end(), + score_list[i]) != score_v.end()) { + if( i == comm->get_trainer_rank()) { + save_model_weights(m); + } + } + } + +} + +} // namespace lbann diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 6fa579f82cb..cd7e647ef85 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -239,6 +239,14 @@ lbann_callback* construct_callback(lbann_comm* comm, } } + if (proto_cb.has_save_topk_models()) { + const auto& params = proto_cb.save_topk_models(); + return new lbann_callback_save_topk_models(params.dir(), + params.k(), + params.metric(), + params.ascending_ordering()); + } + ////////////////////////////////////////////////////////////// // Weight exchange/replace ////////////////////////////////////////////////////////////// diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index d78f74ecef0..859261344e3 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -413,6 +413,7 @@ message Callback { CallbackCheckMetric check_metric = 37; CallbackPerturbAdam perturb_adam = 38; CallbackPerturbDropout perturb_dropout = 39; + CallbackSaveTopKModels save_topk_models = 40; } message CallbackLTFB { @@ -659,6 +660,13 @@ message CallbackPerturbDropout { float keep_dropout_factor = 1; //Keep dropout prob perturbation (in log space) string layers = 2; // dropout layers to perturb keep prob, all dropout layers by default } + +message CallbackSaveTopKModels { + string dir = 1; //directory to save model + int32 k = 2; //number of (top) models to save + string metric = 3; //metrics to use in evaluating models + bool ascending_ordering = 4; //whether to sort 
metrics per model in ascending order, descending order is default +} //======================================================================== // Weights //======================================================================== From f60030f8aeff30fc6f581b9ab1e26bc1bc91cd0e Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Fri, 31 May 2019 07:30:21 -0700 Subject: [PATCH 051/634] treat top level child METAXXX as an excluded sample --- model_zoo/jag_utils/build_index.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/model_zoo/jag_utils/build_index.cpp b/model_zoo/jag_utils/build_index.cpp index 1416c6fbe1e..e1dfd268c7f 100644 --- a/model_zoo/jag_utils/build_index.cpp +++ b/model_zoo/jag_utils/build_index.cpp @@ -135,6 +135,9 @@ if (j >= 400) break; bool good = conduit::relay::io::hdf5_has_path(hdf5_file_hnd, key_1); if (!good) { std::cerr << "missing path: " << key_1 << " (this is probably OK for hydra)\n"; + s5 << cnames[h] << " "; + ++num_samples_bad; + ++local_num_samples_bad; continue; } From 58ef8b8c1dd177c7b4a1fe008a93492c3a5b066b Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 3 Jun 2019 13:56:52 -0700 Subject: [PATCH 052/634] Fix errors in Weekly Develop Build 31 --- bamboo/allocate_and_run.sh | 8 ++++++ bamboo/integration_tests/conftest.py | 9 ++++++- .../test_integration_performance.py | 25 ++++++++++++------- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index 58ba329bff1..0f338913e4b 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -32,6 +32,14 @@ fi if [ ${WEEKLY} -ne 0 ]; then salloc -N16 -t 600 ./run.sh --weekly + if [ "${CLUSTER}" = 'catalyst' ]; then + cd integration_tests + python -m pytest -s test_integration_performance_full_alexnet_clang4 --weekly --run + python -m pytest -s test_integration_performance_full_alexnet_gcc4 --weekly --run + python -m pytest -s test_integration_performance_full_alexnet_gcc7 --weekly --run + python -m pytest -s test_integration_performance_full_alexnet_intel18 --weekly --run + cd .. + fi else salloc -N16 -t 600 ./run.sh fi diff --git a/bamboo/integration_tests/conftest.py b/bamboo/integration_tests/conftest.py index da2ffc127be..97d34bf9055 100644 --- a/bamboo/integration_tests/conftest.py +++ b/bamboo/integration_tests/conftest.py @@ -19,8 +19,10 @@ def pytest_addoption(parser): help='--exes={compiler_name: path}') parser.addoption('--log', action='store', default=0, help='--log=1 to keep trimmed accuracy files. Default (--log=0) removes files') + parser.addoption('--run', action='store_true', default=False, + help='--run specifies that a test normally ignored should be run. Default False') parser.addoption('--weekly', action='store_true', default=False, - help='--weekly specifies that the test should ONLY be run weekly, not nightly') + help='--weekly specifies that the test should ONLY be run weekly, not nightly. 
Default False') # For local testing only parser.addoption('--exe', action='store', help='--exe=') @@ -45,6 +47,11 @@ def exes(request): return request.config.getoption('--exes') +@pytest.fixture +def run(request): + return request.config.getoption('--run') + + @pytest.fixture def weekly(request): return request.config.getoption('--weekly') diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index 6d488a2e316..82413d93046 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -138,7 +138,11 @@ def skeleton_performance_alexnet(cluster, dir_name, executables, compiler_name, def skeleton_performance_full_alexnet(cluster, dir_name, executables, - compiler_name, weekly): + compiler_name, weekly, run): + if not run: + e = 'skeleton_performance_full_alexnet: Ignored' + print('Skip - ' + e) + pytest.skip(e) if not weekly: e = 'skeleton_performance_full_alexnet: Non-local testing' print('Skip - ' + e) @@ -179,8 +183,9 @@ def test_integration_performance_alexnet_clang4(cluster, dirname, exes, weekly): def test_integration_performance_full_alexnet_clang4(cluster, dirname, exes, - weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'clang4', weekly) + weekly, run): + skeleton_performance_full_alexnet(cluster, dirname, exes, 'clang4', weekly, + run) def test_integration_performance_lenet_mnist_gcc4(cluster, dirname, exes): @@ -191,8 +196,9 @@ def test_integration_performance_alexnet_gcc4(cluster, dirname, exes, weekly): skeleton_performance_alexnet(cluster, dirname, exes, 'gcc4', weekly) -def test_integration_performance_full_alexnet_gcc4(cluster, dirname, exes, weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc4', weekly) +def test_integration_performance_full_alexnet_gcc4(cluster, dirname, exes, + weekly, run): + skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc4', weekly, run) def test_integration_performance_lenet_mnist_gcc7(cluster, dirname, exes): @@ -204,8 +210,8 @@ def test_integration_performance_alexnet_gcc7(cluster, dirname, exes, weekly): def test_integration_performance_full_alexnet_gcc7(cluster, dirname, exes, - weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc7', weekly) + weekly, run): + skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc7', weekly, run) def test_integration_performance_lenet_mnist_intel18(cluster, dirname, exes): @@ -218,8 +224,9 @@ def test_integration_performance_alexnet_intel18(cluster, dirname, exes, def test_integration_performance_full_alexnet_intel18(cluster, dirname, exes, - weekly): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'intel18', weekly) + weekly, run): + skeleton_performance_full_alexnet(cluster, dirname, exes, 'intel18', weekly, + run) # Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_lenet_mnist_exe' --exe= From a3a1f3b8868beced56a175a34902bd1f4c9fb956 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Mon, 3 Jun 2019 15:47:57 -0700 Subject: [PATCH 053/634] Clean up and compatibility with base class --- .../callbacks/callback_save_topk_models.hpp | 14 +++++--- src/callbacks/callback_save_topk_models.cpp | 33 ++++++++++++------- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/include/lbann/callbacks/callback_save_topk_models.hpp b/include/lbann/callbacks/callback_save_topk_models.hpp index 40e71e442a8..d5f57e1ebdb 100644 --- 
a/include/lbann/callbacks/callback_save_topk_models.hpp +++ b/include/lbann/callbacks/callback_save_topk_models.hpp @@ -33,7 +33,11 @@ namespace lbann { -/** Periodically save_topk_models computational results. +/** Save_topk_models for (e.g., inference and other analysis). + * @param dir directory to save model + * @param k number of models to save, should be less than number of trainers + * @param metric_namei, evaluation metric + * @ordering for the topk, descending order is default */ class lbann_callback_save_topk_models : public lbann_callback_save_model { public: @@ -46,10 +50,10 @@ class lbann_callback_save_topk_models : public lbann_callback_save_model { std::string name() const override { return "save_topk_models"; } private: - void compute_stats_save_models(model *m); - int m_k ; //number of models to save - std::string m_metric_name; //evaluation metric - bool m_ascending_ordering; //ordering for the top k + bool compute_stats(model *m); + int m_k ; + std::string m_metric_name; + bool m_ascending_ordering; }; diff --git a/src/callbacks/callback_save_topk_models.cpp b/src/callbacks/callback_save_topk_models.cpp index fe258c6e4a3..38dfb0b1270 100644 --- a/src/callbacks/callback_save_topk_models.cpp +++ b/src/callbacks/callback_save_topk_models.cpp @@ -30,14 +30,16 @@ #include "lbann/callbacks/callback_save_topk_models.hpp" namespace lbann { - void lbann_callback_save_topk_models::on_test_end(model *m) { + bool am_in_topk = false; if(m->get_comm()->am_trainer_master()) { - compute_stats_save_models(m); + am_in_topk = compute_stats(m); } + m->get_comm()->trainer_broadcast(0, am_in_topk); + if(am_in_topk) save_model(m); } -void lbann_callback_save_topk_models::compute_stats_save_models(model *m) { +bool lbann_callback_save_topk_models::compute_stats(model *m) { lbann_comm *comm = m->get_comm(); const int num_trainers = comm->get_num_trainers(); std::string mode_string = "test"; @@ -50,16 +52,26 @@ void lbann_callback_save_topk_models::compute_stats_save_models(model *m) { break; } } + //sanity check if (!found_metric) { std::stringstream err; - err << "could not find metric \"" << m_metric_name << "\"" + err << __FILE__ << " " << __LINE__ << " :: " + << "could not find metric \"" << m_metric_name << "\"" << "in model \"" << m->get_name() << "\""; LBANN_ERROR(err.str()); } + if (m_k > num_trainers) { + std::stringstream err; + err << __FILE__ << " " << __LINE__ << " :: " + << "k ( " << m_k << ") " + << " can not be greater than number of trainers (" + << num_trainers << ") " ; + LBANN_ERROR(err.str()); + } + std::vector score_list(comm->get_num_trainers()); - //void all_gather(T &src, std::vector &data, const El::mpi::Comm& c) { - comm->all_gather(score, score_list,comm->get_world_comm()); + comm->all_gather(score, score_list,comm->get_intertrainer_comm()); std::vector score_v = score_list; //top-k in an ascending order if(m_ascending_ordering) std::sort(score_v.begin(), score_v.end(),std::less()); @@ -67,21 +79,20 @@ void lbann_callback_save_topk_models::compute_stats_save_models(model *m) { else std::sort(score_v.begin(), score_v.end(),std::greater()); score_v.resize(m_k); - std::cout << "Trainer score_v size m_k " << comm->get_trainer_rank() << score_v.size() << m_k << std::endl; - if (comm->am_world_master()) { std::cout << "Top " << m_k << " " << m_metric_name << " average " - << std::accumulate(score_v.begin(), score_v.end(), EvalType(0))/m_k; + << std::accumulate(score_v.begin(), score_v.end(), EvalType(0))/m_k << std::endl; } for(int i =0; i < num_trainers; ++i) { 
if(std::find(score_v.begin(), score_v.end(), score_list[i]) != score_v.end()) { if( i == comm->get_trainer_rank()) { - save_model_weights(m); + std::cout << "Trainer [ " << comm->get_trainer_rank() << "] in top list with score " << score_list[i] << std::endl;; + return true; } } } - + return false; } } // namespace lbann From c10f4fb27f4ecf634f8d6bc5f7a93fcfa0b38428 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Tue, 4 Jun 2019 09:16:56 -0700 Subject: [PATCH 054/634] bug fix, for when there are more processes than data files --- src/data_readers/data_reader_numpy_npz_conduit.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp b/src/data_readers/data_reader_numpy_npz_conduit.cpp index 79dfd8196ec..004653825f4 100644 --- a/src/data_readers/data_reader_numpy_npz_conduit.cpp +++ b/src/data_readers/data_reader_numpy_npz_conduit.cpp @@ -316,7 +316,11 @@ bool numpy_npz_conduit_reader::fetch_response(Mat& Y, int data_id, int mb_idx) { void numpy_npz_conduit_reader::fill_in_metadata() { int rank = m_comm->get_rank_in_trainer(); // to avoid contention, each rank opens a separate file - std::ifstream in(m_filenames[rank]); + size_t my_file = rank; + if (my_file >= m_filenames.size()) { + my_file = 0; + } + std::ifstream in(m_filenames[my_file]); if (!in) { LBANN_ERROR("failed to open " + m_filenames[rank] + " for reading"); } From 6237ba86d535696b5c51a4c98a197f1b42da4c29 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Wed, 5 Jun 2019 08:32:23 -0700 Subject: [PATCH 055/634] change instances of 'rank' to 'my_file' - follow on to previous bug fix. --- src/data_readers/data_reader_numpy_npz_conduit.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp b/src/data_readers/data_reader_numpy_npz_conduit.cpp index 004653825f4..5c13ea20d89 100644 --- a/src/data_readers/data_reader_numpy_npz_conduit.cpp +++ b/src/data_readers/data_reader_numpy_npz_conduit.cpp @@ -322,7 +322,7 @@ void numpy_npz_conduit_reader::fill_in_metadata() { } std::ifstream in(m_filenames[my_file]); if (!in) { - LBANN_ERROR("failed to open " + m_filenames[rank] + " for reading"); + LBANN_ERROR("failed to open " + m_filenames[my_file] + " for reading"); } in.close(); @@ -333,7 +333,7 @@ void numpy_npz_conduit_reader::fill_in_metadata() { int data_id = 0; //meaningless conduit::Node node; - numpy_conduit_converter::load_conduit_node(m_filenames[rank], data_id, node); + numpy_conduit_converter::load_conduit_node(m_filenames[my_file], data_id, node); //fill in m_data_dims auto shape = node[LBANN_DATA_ID_STR(data_id) + "/data/shape"].as_uint64_array(); From f011edfc21c14180863acf5db89283cbc885d8e3 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Fri, 7 Jun 2019 08:54:23 -0700 Subject: [PATCH 056/634] initial commit --- model_zoo/jag_utils/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/model_zoo/jag_utils/CMakeLists.txt b/model_zoo/jag_utils/CMakeLists.txt index 9030bde2243..ed8694bf390 100644 --- a/model_zoo/jag_utils/CMakeLists.txt +++ b/model_zoo/jag_utils/CMakeLists.txt @@ -54,3 +54,8 @@ target_link_libraries(generate_corrupt_samples-bin lbann ) set_target_properties(generate_corrupt_samples-bin PROPERTIES OUTPUT_NAME generate_corrupt_samples) + add_executable( compute_hydra_normalization-bin compute_hydra_normalization.cpp ) + target_link_libraries(compute_hydra_normalization-bin lbann ) + set_target_properties(compute_hydra_normalization-bin PROPERTIES OUTPUT_NAME compute_hydra_normalization) + + From dbfa3865621cf00972112467dc9e275c3da3880b Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Fri, 7 Jun 2019 08:54:39 -0700 Subject: [PATCH 057/634] initial commit --- .../jag_utils/compute_hydra_normalization.cpp | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 model_zoo/jag_utils/compute_hydra_normalization.cpp diff --git a/model_zoo/jag_utils/compute_hydra_normalization.cpp b/model_zoo/jag_utils/compute_hydra_normalization.cpp new file mode 100644 index 00000000000..1a39ccfdfc4 --- /dev/null +++ b/model_zoo/jag_utils/compute_hydra_normalization.cpp @@ -0,0 +1,178 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann_config.hpp" + +#include "conduit/conduit.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_io_hdf5.hpp" +#include +#include +#include +#include +#include +#include "lbann/lbann.hpp" +#include "lbann/utils/jag_utils.hpp" +#include +#include + +using namespace lbann; +using namespace std; + +vector get_input_names(); +vector get_scalar_names(); + +//========================================================================== +int main(int argc, char *argv[]) { + int random_seed = lbann_default_random_seed; + world_comm_ptr comm = initialize(argc, argv, random_seed); + bool master = comm->am_world_master(); + const int rank = comm->get_rank_in_world(); + + //try { + options *opts = options::get(); + opts->init(argc, argv); + + if (!(opts->has_string("filelist"))) { + if (master) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: usage: " + argv[0] + " --filelist="); + } + } + + //======================================================================= + + hid_t hdf5_file_hnd; + std::string key; + conduit::Node n_ok; + conduit::Node tmp; + + int num_samples = 0; + vector input_names = get_input_names(); + size_t sz = input_names.size(); + std::vector inputs_v_max(sz, DBL_MIN); + std::vector inputs_v_min(sz, DBL_MAX); + std::vector inputs_sum(sz, 0.0); + + ifstream in(opts->get_string("filelist").c_str()); + if (!in) { + LBANN_ERROR("failed to open " + opts->get_string("filelist") + " for reading"); + } + + size_t hh = 0; + string filename; + while (!in.eof()) { + getline(in, filename); + if (filename.size() < 2) { + continue; + } + hh += 1; + if (hh % 10 == 0) std::cout << rank << " :: processed " << hh << " filenames\n"; + + try { + hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( filename.c_str() ); + } catch (...) { + LBANN_ERROR("failed to open " + filename + " for reading"); + } + + std::vector cnames; + try { + conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", cnames); + } catch (...) { + LBANN_ERROR("exception hdf5_group_list_child_names; " + filename); + } + + for (size_t i=0; i inputs_v_max[h]) inputs_v_max[h] = v; + inputs_sum[h] += v; + } + } catch (...) { + LBANN_ERROR("error reading " + key + " from file " + filename); + } + } + ++num_samples; + } + } + + for (size_t j=0; j get_input_names() { + vector f; + f.push_back("p_preheat"); + f.push_back("sc_peak"); + f.push_back("t_3rd"); + f.push_back("t_end"); + return f; +} + +vector get_scalar_names() { + vector f; + f.push_back("avg_rhor"); + f.push_back("peak_eprod"); + f.push_back("peak_tion_bw_DT"); + f.push_back("bt_tion_bw_DT"); + f.push_back("avg_tion_bw_DT"); + f.push_back("adiabat"); + f.push_back("bangt"); + f.push_back("burnwidth"); + f.push_back("bt_rhor"); + f.push_back("bt_eprodr"); + f.push_back("peak_eprodr"); + return f; +} From a4318f60134039d65bbdb13c79c92e7e869ae0e2 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Fri, 7 Jun 2019 09:26:09 -0700 Subject: [PATCH 058/634] input and scalar normalizations are computed. 
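
The normalization emitted to normalize.txt is the standard min-max
affine map. For each input and scalar field, the running min, max,
and sum are accumulated over every sample in the filelist; the
parameters are then

    scale = 1 / (max - min)
    bias  = -min / (max - min)

so that x_norm = scale * x + bias sends a raw value in [min, max]
onto [0, 1]. The per-field averages are written alongside as comments.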
still to do: images --- .../jag_utils/compute_hydra_normalization.cpp | 64 +++++++++++++------ 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/model_zoo/jag_utils/compute_hydra_normalization.cpp b/model_zoo/jag_utils/compute_hydra_normalization.cpp index 1a39ccfdfc4..6caadf30c97 100644 --- a/model_zoo/jag_utils/compute_hydra_normalization.cpp +++ b/model_zoo/jag_utils/compute_hydra_normalization.cpp @@ -53,18 +53,20 @@ int main(int argc, char *argv[]) { bool master = comm->am_world_master(); const int rank = comm->get_rank_in_world(); - //try { options *opts = options::get(); opts->init(argc, argv); + ofstream out("normalize.txt"); + if (!out) { + LBANN_ERROR("failed to open: normalize.txt for writing"); + } + if (!(opts->has_string("filelist"))) { if (master) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: usage: " + argv[0] + " --filelist="); } } - //======================================================================= - hid_t hdf5_file_hnd; std::string key; conduit::Node n_ok; @@ -77,6 +79,12 @@ int main(int argc, char *argv[]) { std::vector inputs_v_min(sz, DBL_MAX); std::vector inputs_sum(sz, 0.0); + vector scalar_names = get_scalar_names(); + sz = scalar_names.size(); + std::vector scalars_v_max(sz, DBL_MIN); + std::vector scalars_v_min(sz, DBL_MAX); + std::vector scalars_sum(sz, 0.0); + ifstream in(opts->get_string("filelist").c_str()); if (!in) { LBANN_ERROR("failed to open " + opts->get_string("filelist") + " for reading"); @@ -125,6 +133,15 @@ int main(int argc, char *argv[]) { if (v > inputs_v_max[h]) inputs_v_max[h] = v; inputs_sum[h] += v; } + + for (size_t h=0; h scalars_v_max[h]) scalars_v_max[h] = v; + scalars_sum[h] += v; + } } catch (...) { LBANN_ERROR("error reading " + key + " from file " + filename); } @@ -133,22 +150,31 @@ int main(int argc, char *argv[]) { } } - for (size_t j=0; j Date: Fri, 7 Jun 2019 16:49:15 -0700 Subject: [PATCH 059/634] some clean up and logic correctness mostly based on feedback from Tim PR review --- .../callbacks/callback_save_topk_models.hpp | 5 ++- src/callbacks/callback_save_topk_models.cpp | 37 ++++++++----------- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/include/lbann/callbacks/callback_save_topk_models.hpp b/include/lbann/callbacks/callback_save_topk_models.hpp index d5f57e1ebdb..cb6f734894a 100644 --- a/include/lbann/callbacks/callback_save_topk_models.hpp +++ b/include/lbann/callbacks/callback_save_topk_models.hpp @@ -36,7 +36,7 @@ namespace lbann { /** Save_topk_models for (e.g., inference and other analysis). 
* @param dir directory to save model * @param k number of models to save, should be less than number of trainers - * @param metric_namei, evaluation metric + * @param metric_name, evaluation metric * @ordering for the topk, descending order is default */ class lbann_callback_save_topk_models : public lbann_callback_save_model { @@ -50,7 +50,8 @@ class lbann_callback_save_topk_models : public lbann_callback_save_model { std::string name() const override { return "save_topk_models"; } private: - bool compute_stats(model *m); + /*determine if a trainer's model is in top k, computation done by master process*/ + bool am_in_topk(model *m); int m_k ; std::string m_metric_name; bool m_ascending_ordering; diff --git a/src/callbacks/callback_save_topk_models.cpp b/src/callbacks/callback_save_topk_models.cpp index 38dfb0b1270..6cbff7df8c4 100644 --- a/src/callbacks/callback_save_topk_models.cpp +++ b/src/callbacks/callback_save_topk_models.cpp @@ -31,15 +31,15 @@ namespace lbann { void lbann_callback_save_topk_models::on_test_end(model *m) { - bool am_in_topk = false; + bool in_topk = false; if(m->get_comm()->am_trainer_master()) { - am_in_topk = compute_stats(m); + in_topk = am_in_topk(m); } - m->get_comm()->trainer_broadcast(0, am_in_topk); - if(am_in_topk) save_model(m); + m->get_comm()->trainer_broadcast(0, in_topk); + if(in_topk) save_model(m); } -bool lbann_callback_save_topk_models::compute_stats(model *m) { +bool lbann_callback_save_topk_models::am_in_topk(model *m) { lbann_comm *comm = m->get_comm(); const int num_trainers = comm->get_num_trainers(); std::string mode_string = "test"; @@ -55,16 +55,14 @@ bool lbann_callback_save_topk_models::compute_stats(model *m) { //sanity check if (!found_metric) { std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "could not find metric \"" << m_metric_name << "\"" + err << "could not find metric \"" << m_metric_name << "\"" << "in model \"" << m->get_name() << "\""; LBANN_ERROR(err.str()); } if (m_k > num_trainers) { std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "k ( " << m_k << ") " + err << "k ( " << m_k << ") " << " can not be greater than number of trainers (" << num_trainers << ") " ; LBANN_ERROR(err.str()); @@ -72,25 +70,20 @@ bool lbann_callback_save_topk_models::compute_stats(model *m) { std::vector score_list(comm->get_num_trainers()); comm->all_gather(score, score_list,comm->get_intertrainer_comm()); - std::vector score_v = score_list; + std::vector top_scores = score_list; //top-k in an ascending order - if(m_ascending_ordering) std::sort(score_v.begin(), score_v.end(),std::less()); + if(m_ascending_ordering) std::sort(top_scores.begin(), top_scores.end(),std::less()); //top-k in an descending order - else std::sort(score_v.begin(), score_v.end(),std::greater()); - score_v.resize(m_k); + else std::sort(top_scores.begin(), top_scores.end(),std::greater()); + top_scores.resize(m_k); if (comm->am_world_master()) { std::cout << "Top " << m_k << " " << m_metric_name << " average " - << std::accumulate(score_v.begin(), score_v.end(), EvalType(0))/m_k << std::endl; + << std::accumulate(top_scores.begin(), top_scores.end(), EvalType(0))/m_k << std::endl; } - for(int i =0; i < num_trainers; ++i) { - if(std::find(score_v.begin(), score_v.end(), - score_list[i]) != score_v.end()) { - if( i == comm->get_trainer_rank()) { - std::cout << "Trainer [ " << comm->get_trainer_rank() << "] in top list with score " << score_list[i] << std::endl;; - return true; - } - } + if(std::find(top_scores.begin(), 
top_scores.end(), + score_list[comm->get_trainer_rank()]) != top_scores.end()) { + return true; } return false; } From 6baa6134e3fa32f4ec871bd26b2da4be09b94c4e Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Fri, 7 Jun 2019 17:00:40 -0700 Subject: [PATCH 060/634] some clean up and logic correctness mostly based on feedback from Tim PR review --- include/lbann/callbacks/callback_save_topk_models.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/lbann/callbacks/callback_save_topk_models.hpp b/include/lbann/callbacks/callback_save_topk_models.hpp index cb6f734894a..d33020a7acd 100644 --- a/include/lbann/callbacks/callback_save_topk_models.hpp +++ b/include/lbann/callbacks/callback_save_topk_models.hpp @@ -50,7 +50,7 @@ class lbann_callback_save_topk_models : public lbann_callback_save_model { std::string name() const override { return "save_topk_models"; } private: - /*determine if a trainer's model is in top k, computation done by master process*/ + /*determine if a trainer's model is in top k, computation done by trainer master processes*/ bool am_in_topk(model *m); int m_k ; std::string m_metric_name; From ef18c6ab76f9a458840296466f819d0ce33675fe Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Fri, 7 Jun 2019 17:03:43 -0700 Subject: [PATCH 061/634] clarity on how many models are saved --- include/lbann/callbacks/callback_save_topk_models.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/lbann/callbacks/callback_save_topk_models.hpp b/include/lbann/callbacks/callback_save_topk_models.hpp index d33020a7acd..4d6bad2f240 100644 --- a/include/lbann/callbacks/callback_save_topk_models.hpp +++ b/include/lbann/callbacks/callback_save_topk_models.hpp @@ -38,6 +38,7 @@ namespace lbann { * @param k number of models to save, should be less than number of trainers * @param metric_name, evaluation metric * @ordering for the topk, descending order is default + * Note: may end up saving more than k models if multiple models (trainers) have the same metric score */ class lbann_callback_save_topk_models : public lbann_callback_save_model { public: From 73d42386d12f1669919727400cd2367ad8be953f Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Fri, 7 Jun 2019 17:32:17 -0700 Subject: [PATCH 062/634] added code to compute min/max normalization for HYDRA images. 
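
Image statistics follow the same pattern as the inputs and scalars:
running min and max values are accumulated over every sample in the
filelist for each image field returned by get_image_names(), and the
resulting min-max scale/bias parameters are written to the
jag_image_normalization_params section of normalize.txt.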
--- .../jag_utils/compute_hydra_normalization.cpp | 68 +++++++++++++++++-- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/model_zoo/jag_utils/compute_hydra_normalization.cpp b/model_zoo/jag_utils/compute_hydra_normalization.cpp index 6caadf30c97..316ca8d6204 100644 --- a/model_zoo/jag_utils/compute_hydra_normalization.cpp +++ b/model_zoo/jag_utils/compute_hydra_normalization.cpp @@ -45,8 +45,11 @@ using namespace std; vector get_input_names(); vector get_scalar_names(); +vector get_image_names(); //========================================================================== +#define MAGIC_NUMBER 9 + int main(int argc, char *argv[]) { int random_seed = lbann_default_random_seed; world_comm_ptr comm = initialize(argc, argv, random_seed); @@ -85,20 +88,29 @@ int main(int argc, char *argv[]) { std::vector scalars_v_min(sz, DBL_MAX); std::vector scalars_sum(sz, 0.0); + vector image_names = get_image_names(); + sz = image_names.size(); + vector> images_v_max(sz); + vector> images_v_min(sz); + for (size_t h=0; hget_string("filelist").c_str()); if (!in) { LBANN_ERROR("failed to open " + opts->get_string("filelist") + " for reading"); } - size_t hh = 0; + size_t hhh = 0; string filename; while (!in.eof()) { getline(in, filename); if (filename.size() < 2) { continue; } - hh += 1; - if (hh % 10 == 0) std::cout << rank << " :: processed " << hh << " filenames\n"; + hhh += 1; + if (hhh % 10 == 0) std::cout << rank << " :: processed " << hhh << " filenames\n"; try { hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( filename.c_str() ); @@ -125,6 +137,7 @@ int main(int argc, char *argv[]) { int success = n_ok.to_int64(); if (success == 1) { try { + for (size_t h=0; h scalars_v_max[h]) scalars_v_max[h] = v; scalars_sum[h] += v; } + + for (size_t h=0; h images_v_max[h][g]) { + images_v_max[h][g] = emi[idx]; + } + ++idx; + } + } + } + } catch (...) { LBANN_ERROR("error reading " + key + " from file " + filename); } @@ -167,13 +203,28 @@ int main(int argc, char *argv[]) { double scale = 1.0 / (scalars_v_max[h] - scalars_v_min[h]); double bias = -1*scalars_v_min[h] / (scalars_v_max[h] - scalars_v_min[h]); if (h < scalar_names.size()-1) { - out << " { scale: " << scale << " bias: " << bias << " }, #" << scalar_names[h] << " avg= " << scalars_sum[h] / num_samples << "}\n"; + out << " { scale: " << scale << " bias: " << bias << " }, #" << scalar_names[h] << " avg= " << scalars_sum[h] / num_samples << "\n"; } else { - out << " { scale: " << scale << " bias: " << bias << " } #" << scalar_names[h] << " avg= " << scalars_sum[h] / num_samples << "}\n"; + out << " { scale: " << scale << " bias: " << bias << " } #" << scalar_names[h] << " avg= " << scalars_sum[h] / num_samples << "\n"; } } out << " ]\n"; + out << " jag_image_normalization_params: [\n"; + for (size_t h=0; h get_scalar_names() { f.push_back("peak_eprodr"); return f; } + +vector get_image_names() { + vector f; + f.push_back("(90,0)/bang/image/data"); + f.push_back("(0,0)/bang/image/data"); + return f; +} From 9879d6edec03237fbaab95804a21fd2a793f99bd Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 10 Jun 2019 10:49:43 -0700 Subject: [PATCH 063/634] Basic documentation for running LBANN (#1062) * Progress toward basic documentation for running LBANN. Section describing basic structure of LBANN experiments. * Documenting Python frontend. * Documenting Protobuf frontend. * Fixing formatting errors in documentation. * Removing old Python frontend documentation. * Fixing Sphinx build warnings. 
* Updating documentation with suggestions from @ndryden. * Updating documentation with suggestions from @forsyth2. * Updating documentation with suggestions from @samadejacobs. --- docs/running_lbann.rst | 506 ++++++++++++++++++++++++++++++++++------- python/README.md | 201 ---------------- 2 files changed, 427 insertions(+), 280 deletions(-) delete mode 100644 python/README.md diff --git a/docs/running_lbann.rst b/docs/running_lbann.rst index d98e5fa62fe..cb6c7575327 100644 --- a/docs/running_lbann.rst +++ b/docs/running_lbann.rst @@ -1,96 +1,444 @@ .. role:: bash(code) :language: bash +.. role:: python(code) + :language: python -==================== +============================================================ Running LBANN -==================== +============================================================ -The basic template for running LBANN is +------------------------------------------------ +Anatomy of an LBANN experiment +------------------------------------------------ + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Parallelism +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +LBANN is run under `MPI +`_, i.e. with +multiple processes that communicate with message passing. This set of +processes is subdivided into one or more "trainers." Conceptually, a +trainer owns parallel objects, like models and data readers, and +generally operates independently of other trainers. + +Comments: + ++ LBANN targets HPC systems with homogeneous compute nodes and GPU + accelerators, which motivates some simplifying assumptions: + + - All trainers have the same number of processes. + + - If GPU acceleration is enabled, each MPI process corresponds to + one GPU. + ++ Processors are block assigned to trainers based on MPI rank. + + - In order to minimize the cost of intra-trainer communication, make + sure to map processes to the hardware and network + topologies. Typically, this just means choosing a sensible number + of processes per trainer, e.g. a multiple of the number of GPUs + per compute node. + ++ Generally, increasing the number of processes per trainer will + accelerate computation but require more intra-trainer + communication. There is typically a sweet spot where run time is + minimized, but it is complicated and sensitive to the nature of the + computation, the mini-batch size, the data partitioning scheme, + hardware and network properties, the communication algorithms, and + myriad other factors. + + - Rule-of-thumb: Configure experiments so that the bulk of run time + is taken by compute-bound operations (e.g. convolution or matrix + multiplication) and so that each process has enough work to + achieve a large fraction of peak performance (e.g. by making the + mini-batch size sufficiently large). + ++ Most HPC systems are managed with job schedulers like `Slurm + `_. Typically, users can + not immediately access compute nodes but must request them from + login nodes. The login nodes can be accessed directly (e.g. via + :bash:`ssh`), but users are discouraged from doing heavy computation + on them. + + - For debugging and quick testing, it's convenient to request an + interactive session (:bash:`salloc` or :bash:`sxterm` with Slurm). + + - If you need to run multiple experiments or if experiments are not + time-sensitive, it's best to submit a batch job (:bash:`sbatch` + with Slurm). 
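+
+    As a rough sketch (the script name, node count, time limit, and
+    launch line below are placeholders, not an official recipe), a
+    Slurm batch submission might look like:
+
+    .. code-block:: bash
+
+       #!/bin/bash
+       # my_job.sh: request 2 nodes for 60 minutes, then launch LBANN
+       #SBATCH --nodes=2
+       #SBATCH --time=60
+       srun <path to lbann executable> <lbann arguments>
+
+    Submit it from a login node with :bash:`sbatch my_job.sh`.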
+
+  - When running an experiment, make sure you know what scheduler
+    account to charge (used by the scheduler for billing and
+    determining priority) and what scheduler partition to run on
+    (compute nodes on a system are typically subdivided into multiple
+    groups, e.g. for batch jobs and for debugging).
+
+    + With :bash:`salloc`, specify the partition using the
+      :bash:`--partition` command-line argument and specify the
+      account using :bash:`--account`.
+
+  - Familiarize yourself with the rules for the systems you use
+    (e.g. the expected work for each partition, time limits, job
+    submission limits) and be a good neighbor.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Model components
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. warning:: `A major refactor of core model infrastructure
+             `_ is
+             pending. This documentation will be updated once it is
+             merged and the interface stabilized.
+
++ Layer: A tensor operation, arranged within a directed acyclic graph.
+
+  - During evaluation ("forward prop"), a layer receives input tensors
+    from its parents and sends an output tensor to each child.
+
+  - During automatic differentiation ("backprop"), a layer receives
+    "input error signals" (objective function gradients w.r.t. output
+    tensors) from its children and sends "output error signals"
+    (objective function gradients w.r.t. input tensors) to its
+    parents. If the layer has any associated weight tensors, it will
+    also compute objective function gradients w.r.t. the weight
+    tensors.
+
+  - Most layers require a specific number of parents and children, but
+    LBANN will insert layers into the graph if there is a mismatch and
+    the intention is obvious. For example, if a layer expects one
+    child but has multiple, then a split layer (with multiple output
+    tensors all identical to the input tensor) is inserted. Similarly,
+    if a layer has fewer children than expected, dummy layers will be
+    inserted. However, this does not work if there is any
+    ambiguity. In such cases (common with input and slice layers), it
+    is recommended to manually insert identity layers so that the
+    parent/child relationships are absolutely unambiguous.
+
++ Weights: A tensor consisting of trainable parameters, typically
+  associated with one or more layers. A weight tensor owns an
+  initializer to initially populate its values and an optimizer to
+  find values that minimize the objective function.
+
+  - A weight tensor without a specified initializer will use a zero
+    initializer.
+
+  - A weight tensor without a specified optimizer will use the model's
+    default optimizer.
+
+  - If a layer requires weight tensors and none are specified, it will
+    create the needed weight tensors. The layer will pick sensible
+    initializers and optimizers for the weight tensors. For example, a
+    convolution layer will initialize its kernel tensor with He normal
+    initialization and with the model's default optimizer.
+
+  - The dimensions of a weight tensor are determined by its
+    associated layers. The user can not set them directly.
+
++ Objective function: Mathematical expression that the optimizers will
+  attempt to minimize. It is made up of multiple terms that are added
+  together (possibly with scaling factors).
+
+  - An objective function term can get its value from a scalar-valued
+    layer, i.e. a layer with an output tensor with one entry.
+
++ Metric: Mathematical expression that will be reported to the
+  user. This typically does not affect training, but is helpful for
+  evaluating the progress of training.
A canonical example for + classification problems is classification accuracy. + ++ Callback: Function that is performed at various points during an + experiment. Callbacks are helpful for reporting, debugging, and + performing advanced training techniques. + + - This is the natural home for experimental training + techniques. + + - A common use-case is to export values with the "dump outputs" + callback so that the user can perform data post-processing or + visualization. + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Data readers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: The core infrastructure for data readers is slated for + significant refactoring, so expect major changes in the + future. + +Data readers are responsible for managing a data set and providing +data samples to models. A data set is comprised of independent data +samples, each of which is made up of multiple tensors. For example, a +data sample for a labeled image classification problem consists of an +image tensor and a one-hot label vector. + +.. note:: The data readers are currently hard-coded to assume this + simple classification paradigm. Hacks are needed if your + data does not match it exactly, e.g. if a data sample is + comprised of more than two tensors. The most basic approach + is to flatten all tensors and concatenate them into one + large vector. The model is then responsible for slicing this + vector into the appropriate chunks and resizing the chunks + into the appropriate dimensions. Done correctly, this should + not impose any additional overhead. + +Specifically, data readers and models interact via input layers. Each +model must have exactly one input layer and its output tensors are +populated by a data reader every mini-batch step. This is typically +performed by a background thread pool, so data ingestion will +efficiently overlap with other computation, especially if the data +reader's work is IO-bound or if the computation is largely on GPUs. + +.. note:: An input layer has an output tensor for each data sample + tensor. Since each data sample has two tensors (one for the + data and one for the label), it follows that every input + layer should have two child layers. To make parent/child + relationships unambiguous, we recommend manually creating + identity layers as children of the input layer. + +Note that layers within a model treat the data for a mini-batch as a +single tensor where the leading dimension is the mini-batch +size. Thus, corresponding tensors in all data samples must have the +same dimensions. The data dimensions must be known from the beginning +of the experiment and can not change. However, real data is rarely so +consistent and some preprocessing is typically required. + +.. warning:: `A major refactor of the preprocessing pipeline + `_ is + pending. This documentation will be updated once it is + merged and the interface stabilized. + +------------------------------------------------ +Python frontend +------------------------------------------------ + +LBANN provides a Python frontend with syntax reminiscent of `PyTorch +`_. See the `model zoo implementation of LeNet +`_ +for a simple example. + +Comments: + ++ Under-the-hood, the Python frontend is actually a convenience + wrapper around the Protobuf frontend. The core infrastructure allows + users to configure an experiment and "compiles" it to a Prototext + text file. + ++ The Python interface can only configure and launch experiments. 
+
+------------------------------------------------
+Python frontend
+------------------------------------------------
+
+LBANN provides a Python frontend with syntax reminiscent of `PyTorch
+`_. See the `model zoo implementation of LeNet
+`_
+for a simple example.
+
+Comments:
+
++ Under the hood, the Python frontend is actually a convenience
+  wrapper around the Protobuf frontend. The core infrastructure allows
+  users to configure an experiment and "compile" it into a Protobuf
+  text file.
+
++ The Python interface can only configure and launch experiments. It
+  is not active during an experiment and it does not allow for any
+  dynamic control flow.
+
++ Only Python 3 is supported.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Setup
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :python:`lbann` Python package is installed as part of the LBANN
+build process. However, it is necessary to update the
+:bash:`PYTHONPATH` environment variable to make sure Python detects
+it. There are several ways to do this:
+
++ If LBANN has been built with the Spack user build process, loading
+  LBANN will automatically update :bash:`PYTHONPATH`:
+
 .. code-block:: bash
 
-     \
-    lbann \
-    --model=model.prototext \
-    --optimizer=opt.prototext \
-    --reader=data_reader.prototext
-
-When using GPGPU accelerators, users should be aware that LBANN is
-optimized for the case in which one assigns one GPU per MPI
-*rank*. This should be borne in mind when choosing the parameters for
-the MPI launcher.
-
-A list of options for LBANN may be found by running :bash:`lbann
---help`.
-
-.. note:: At time of writing, it is known that some of these are
-          out-of-date. An
-          `issue `_ has been
-          opened to track this.
-
-.. _using-the-model-zoo:
-
---------------------
-Using the model zoo
---------------------
-
-LBANN ships with prototext descriptions of a variety of models,
-optimizers and data readers. These may be found in the :code:`model_zoo/`
-directory of the source repository or the :code:`share/model_zoo/` directory
-of the install directory.
-
-.. warning:: Some of these prototexts point to specific data locations
-             on LLNL LC clusters. Users may have to modify such paths
-             to point to locations on their own systems. This can be
-             done by modifying the prototext directly or overriding
-             the options on the command line with, e.g., the
-             :code:`--data_filedir_train` and
-             :code:`--data_filedir_test` options.
-
-The following is an example invocation of LBANN on a machine using
-Slurm's :bash:`srun` as an MPI launcher. In the example command,
-a machine with 2 GPGPUs per node are available, 4 nodes will be used,
-:bash:`${LBANN_EXE}` is the path to the :code:`lbann` executable, and
-:bash:`${LBANN_MODEL_ZOO_DIR}` is the path to the :code:`model_zoo/` directory in
-either the source tree or the install tree. Note that the options
-passed to :bash:`srun` are not likely to be portable to other MPI
-launchers. The example will train Alexnet with SGD optimization on the
-Imagenet dataset for 5 epochs.
+     module load lbann
+
+  .. warning:: The above will *not* work if LBANN has been built with
+               :bash:`scripts/build_lbann_lc.sh` or with the Spack
+               developer build process.
+
++ LBANN includes a modulefile that updates :bash:`PYTHONPATH`:
+
 .. code-block:: bash
 
-
-    srun -N4 --ntasks-per-node=2 \
-        ${LBANN_EXE} \
-        --model=${LBANN_MODEL_ZOO_DIR}/models/alexnet/alexnet.prototext \
-        --optimizer=${LBANN_MODEL_ZOO_DIR}/optimizers/opt_sgd.prototext \
-        --reader=${LBANN_MODEL_ZOO_DIR}/data_readers/data_reader_imagenet.prototext \
-        --num_epochs=5
-
----------------------------------------------
-Using the Python interface for prototext
----------------------------------------------
-
-There is a python interface for generating model prototext
-files. Example Python scripts may be found in the
-:code:`scripts/proto/lbann/models` directory of the source
-repository. Running the Python script will generate a prototext that
-can be passed to the :code:`--model` option for LBANN.
+     module use <install dir>/etc/modulefiles
+     module load lbann-<version>
+
++ Directly manipulate :bash:`PYTHONPATH`:
+
 .. code-block:: bash
 
-
-    python3 alexnet.py alexnet.prototext
-     \
-    lbann --model=alexnet.prototext
-where :code:`` are as documented
-:ref:`above `, with optimizer and data reader
-prototexts coming from the appropriate :code:`model_zoo/` directories.
+   export PYTHONPATH=<install dir>/lib/python/site-packages:${PYTHONPATH}
+
+Note that LBANN depends on the Protobuf Python package, which can be
+installed with:
+
+.. code-block:: bash
+
+   pip install protobuf
+
+If the user does not own the site-packages directory, then it may be
+necessary to pass the :bash:`--user` flag to pip.
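+
+Whichever method is used, a quick sanity check (not an official
+installation test) is to import the package directly and print where
+it was found:
+
+.. code-block:: bash
+
+   python3 -c "import lbann; print(lbann.__file__)"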
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Basic usage
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A typical workflow involves the following steps:
+
+1. Configuring LBANN model components (like the graph of
+   :python:`Layer` s) and creating a :python:`Model`.
+
+   + Classes for model components are automatically generated from the
+     LBANN Protobuf specification at `src/proto/lbann.proto
+     `_.
+     This file is currently the best source of documentation. Message
+     fields in the Protobuf specification are optional keyword
+     arguments for the corresponding Python class constructor.
+
+2. Configuring the default :python:`Optimizer` to be used by the
+   :python:`Weights` es.
+
+3. Loading in a Protobuf text file describing the data reader.
+
+   + The Python frontend currently does not have good support for
+     specifying data readers. If any data reader properties need to be
+     set programmatically, the user must do it directly via the
+     Protobuf Python API.
+
+4. Launching LBANN by calling :python:`run`.
+
+   + :python:`lbann.run` will detect whether the user is currently on
+     a login node or a compute node. If on a login node, a batch job
+     will be submitted to the job scheduler. If on a compute node,
+     LBANN will be run directly on the allocated nodes.
+
+   + A timestamped work directory will be created each time LBANN is
+     run. The default location of these work directories can be set
+     with the environment variable :bash:`LBANN_EXPERIMENT_DIR`.
+
+   + Supported job managers are Slurm and LSF.
+
+   + LLNL users may prefer to use :python:`lbann.contrib.lc.launcher.run`.
+     This is a wrapper around :python:`lbann.run`, with defaults and
+     optimizations specifically for LC systems.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+A simple example
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+   import lbann
-------------------------------
-Running the inference engine
-------------------------------
+
+   # ----------------------------------
+   # Construct layer graph
+   # ----------------------------------
-This section is under construction, requiring input from other team
-members. Until it is complete, please ask questions on the
-`issue tracker `_.
+
+   # Input data
+   input = lbann.Input()
+   image = lbann.Identity(input)
+   label = lbann.Identity(input)
+
+   # Softmax classifier
+   y = lbann.FullyConnected(image, num_neurons = 10, has_bias = True)
+   pred = lbann.Softmax(y)
+
+   # Loss function and accuracy
+   loss = lbann.CrossEntropy([pred, label])
+   acc = lbann.CategoricalAccuracy([pred, label])
+
+   # ----------------------------------
+   # Setup experiment
+   # ----------------------------------
+
+   # Setup model
+   mini_batch_size = 64
+   num_epochs = 5
+   model = lbann.Model(mini_batch_size,
+                       num_epochs,
+                       layers=lbann.traverse_layer_graph(input),
+                       objective_function=loss,
+                       metrics=[lbann.Metric(acc, name='accuracy', unit='%')],
+                       callbacks=[lbann.CallbackPrint(), lbann.CallbackTimer()])
+
+   # Setup optimizer
+   opt = lbann.SGD(learn_rate=0.01, momentum=0.9)
+
+   # Load data reader from prototext
+   import google.protobuf.text_format as txtf
+   data_reader_proto = lbann.lbann_pb2.LbannPB()
+   with open('path/to/lbann/model_zoo/data_readers/data_reader.prototext', 'r') as f:
+       txtf.Merge(f.read(), data_reader_proto)
+   data_reader_proto = data_reader_proto.data_reader
+
+   # ----------------------------------
+   # Run experiment
+   # ----------------------------------
+
+   lbann.run(model, data_reader_proto, opt)
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Useful submodules
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+^^^^^^^^^^^^^^^^^^^^^^^^
+:python:`lbann.modules`
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+A :python:`Module` is a pattern of layers that can be applied multiple
+times in a neural network. Once created, a :python:`Module` is
+*callable*, taking a layer as input and returning a layer as
+output. Modules create and manage :python:`Weights` es internally,
+so they are convenient for weight sharing between different
+layers. They are also useful for complicated patterns like RNN cells.
+
+*A possible note of confusion*: "Modules" in LBANN are similar to
+"layers" in PyTorch, TensorFlow, and Keras. LBANN uses "layer" to
+refer to tensor operations, in a similar manner as Caffe.
+
+^^^^^^^^^^^^^^^^^^^^^^^^
+:python:`lbann.models`
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Several common and influential neural network models are implemented
+as :python:`Module` s. They can be used as building blocks within more
+complicated models.
+
+^^^^^^^^^^^^^^^^^^^^^^^^
+:python:`lbann.proto`
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :python:`save_prototext` function will export a Protobuf text
+file, which can be fed into the Protobuf frontend.
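+
+For example, the model from the simple example above could be
+exported, rather than run, roughly as follows. This is a minimal
+sketch: only the :python:`model` keyword is shown here, and keyword
+arguments for the optimizer and data reader are assumed to be handled
+analogously.
+
+.. code-block:: python
+
+   import lbann.proto
+
+   # Export the configured experiment to a Protobuf text file.
+   lbann.proto.save_prototext('experiment.prototext', model=model)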
+
+^^^^^^^^^^^^^^^^^^^^^^^^
+:python:`lbann.onnx`
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+This contains functionality to convert between LBANN and ONNX
+models. See `python/docs/onnx/README.md
+`_
+for full documentation.
+
+------------------------------------------------
+Protobuf frontend (advanced)
+------------------------------------------------
+
+The main LBANN driver uses Protobuf text files (sometimes called
+prototext files) to specify experiments. The Python frontend operates
+by "compiling" an experiment configuration into a Protobuf text file
+and passing it to the LBANN driver. Aside from quick debugging,
+there are very few situations where directly manipulating Protobuf
+text files is superior to using the Python frontend. In fact, it is
+possible to use Protobuf's Python API to programmatically manipulate
+Protobuf messages, if such fine control is necessary.
+
+In order to fully specify an experiment, the user must provide
+Protobuf text files for the model, default optimizer, and data
+reader. These can be provided as three separate files or one unified
+file. The basic template for running LBANN is
+
+.. code-block:: bash
+
+   <mpi-launcher> <mpi-options> \
+       lbann --prototext=experiment.prototext
+
+The LBANN Protobuf format is defined in `src/proto/lbann.proto
+`_. It
+is important to remember that the default value of a Protobuf field is
+logically zero (e.g. false for Boolean fields and empty for string
+fields).
diff --git a/python/README.md b/python/README.md
deleted file mode 100644
index fec92161dc9..00000000000
--- a/python/README.md
+++ /dev/null
@@ -1,201 +0,0 @@
-# LBANN Python Interface
-
-This provides a convenient Python wrapper for configuring and running
-LBANN experiments. The syntax is meant to be deliberately reminiscent
-of [PyTorch](https://pytorch.org/).
-
-This is still a work in progress, so please [open an
-issue](https://github.com/LLNL/lbann/issues/new) if you find any
-problems or have feature suggestions.
-
-* For more details about the LBANN/ONNX converter,
-see [here](docs/onnx/README.md).
-* For more details about the *accuracy/loss* visualization script
-(also known as `lbplot`), see [here](docs/plot/README.md).
-
-## Setup
-
-The `lbann` Python package is installed as part of the LBANN build
-process. Usage instructions depend on which build method was used.
-
-_Spack_: `module load lbann`.
-
-_CMake_: The Python package is typically installed inside the install
-directory at `/share/python`. To make sure Python
-can detect it, update the `PYTHONPATH` environment variable:
-```sh
-export PYTHONPATH=/share/python:${PYTHONPATH}
-```
-Alternatively, the package can be installed into a Python
-site-packages directory so that Python can detect it immediately. This
-usually requires an active virtual environment or root access. To
-build with this approach, pass `-DLBANN_PYTHON_IN_INSTALL_DIR=OFF` as
-an argument into CMake during the build process.
-
-_Warnings_:
-* The build system is still under active development.
-* Python 2 is not supported.
-* The CMake build process does not handle package dependencies. See
-  `$LBANN_HOME/cmake/configure_files/setup.py.in` for the full list of
-  dependencies.
-* Installing the ONNX Python package may require some work. See [the
-  documentation](https://github.com/onnx/onnx#source).
-  * If you do not already have the ONNX Python package installed, you
-    will need to ensure the `protoc` compiler is in your path when you
-    run this. Either load the appropriate Spack module or add
-    `/bin` to `$PATH` before running.
-
-## Modules
-
-### `lbann`
-
-The `Model` class describes a neural network model and contains the
-following components:
-
-* A `Layer` is a tensor operation, arranged within a directed acyclic
-  graph. A layer will recieve input tensors from its parents and will
-  send output tensor to its children. Once the layer graph has been
-  constructed, it may be helpful to call `traverse_layer_graph`, which
-  is a generator function that traverses the layer graph in a
-  topological order.
-* A `Weights` is a set of trainable parameters, typically associated
-  with one or more layers. The initial values are populated with an
-  `Initializer` and it is optimized with an `Optimizer`.
-* The `ObjectiveFunction` is a mathematical expression that the
-  optimization algorithm will attempt to minimize. It is made up of
-  multiple `ObjectiveFunctionTerm`s, which are added up (possibly with
-  scaling factors) to obtain the full objective function. There are
-  currently two objective function terms:
-  - `LayerTerm` gets its value from a `Layer`.
The layer must output - a scalar (tensor with one entry). - - `L2WeightRegularization` gets its value by computing the L2 norm - of the model weights. -* A `Metric` reports values to the user, which is helpful for - evaluating the progress of training. They get the their values from - layers, which must output scalars (tensors with one entry). -* A `Callback` performs some function at various points during - training. They are helpful for performing advanced training - techniques. - -Many of these components, e.g. layers, are automatically generated by -parsing messages defined in `src/proto/lbann.proto`. This file is -currently the best source for documentation. Note that LBANN currently -only supports static models, i.e. models with static execution graphs. - -### `lbann.proto` - -The `save_prototext` function can be used to export an LBANN -experiment to a prototext file. A typical experiment is comprised of a -model, data reader, and optimizer. - -### `lbann.modules` - -This is a collection of neural network modules, which are patterns of -layers that take an input layer to produce an output layer. Once -created, a `Module` is _callable_. Calling it with an input layer will -add the module's pattern to the layer graph and will return the output -layer. - -_A possible note of confusion_: "modules" in LBANN are similar to -"layers" in PyTorch, TensorFlow, and Keras. LBANN uses "layer" in a -similar manner as Caffe. - -### `lbann.models` - -This consists of common and influential neural network models. They -are implemented as `Module`s and can be used as components within more -complicated models. - -### `lbann.launcher` - -The `run` function interfaces with job schedulers on HPC clusters. It -will either submit a batch job (if on a login node) or run with an -existing node allocation (if on a compute node). - -_LLNL users_: The `run` function in the `lbann.contrib.lc.launcher` -module provides similar functionality, with defaults and optimizations -for LC systems. - -### `lbann.onnx` - -This contains functionality to convert between LBANN and ONNX models. - -## Examples - -A simple (and not very good) convolutional neural network for MNIST -data: - -```py -import lbann -import lbann.proto - -# ---------------------------------------------------------- -# Construct layer graph -# ---------------------------------------------------------- -# Note: The first argument to every layer specifies its parents, -# i.e. the sources for its input tensors. - -# Input data -# Note: Order matters for the children of the input layer! -input = lbann.Input() # Interacts with data reader -images = lbann.Identity(input) # NCHW image tensor -labels = lbann.Identity(input) # One-hot vector - -# Simple convolutional network -conv = lbann.Convolution( - images, - num_dims=2, # 2D convolution for NCHW tensors - num_output_channels=64, # I.e. 
number of filters - conv_dims_i=5, # Convolution window size (64x3x5x5 kernel) - conv_pads_i=2, # Padding of 2 in every dimension - conv_strides_i=2, # Stride of 2 in every dimension - has_bias=True) # Channel-wise bias -bn = lbann.BatchNormalization(conv) -relu = lbann.Relu(bn) -pool = lbann.Pooling( - relu, - num_dims=2, # 2D pooling (for NCHW tensors) - pool_dims_i=3, # 3x3 pooling window - pool_pads_i=1, # Padding of 1 in every dimension - pool_strides_i=2, # Stride of 2 in every dimension - pool_mode='max') # Max pooling -fc = lbann.FullyConnected( - pool, - num_neurons=10, # Output size - has_bias=False) # Entry-wise bias -softmax = lbann.Softmax(fc) - -# Compute values for objective function and metrics -loss = lbann.CrossEntropy([softmax, labels]) -acc = lbann.CategoricalAccuracy([softmax, labels]) - -# ---------------------------------------------------------- -# Construct model -# ---------------------------------------------------------- - -mini_batch_size = 256 -num_epochs = 10 -obj = lbann.ObjectiveFunction([loss]) -metrics = [lbann.Metric(acc, name='accuracy', unit='%')] -callbacks = [ - lbann.CallbackPrint(), # Print basic information - lbann.CallbackTimer() # Print timing information -] -model = lbann.Model( - mini_batch_size, num_epochs, - layers=lbann.traverse_layer_graph(input), # Layers connected to input - objective_function=obj, - metrics=metrics, - callbacks=callbacks) - -# ---------------------------------------------------------- -# Save the model to a prototext file. -# ---------------------------------------------------------- - -lbann.proto.save_prototext('test.prototext', model=model) - -``` - -See the implementation of LeNet in -`$LBANN_HOME/model_zoo/vision/lenet.py` for a more comprehensive -example. From dee6df3f09569f03a18adf8c097dbcef96eb4a16 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Mon, 10 Jun 2019 12:09:59 -0700 Subject: [PATCH 064/634] Start clean up (deletion) of deprecated jag models --- .../jag/ae_cycle_gan/3models/ae.prototext | 365 ------- .../jag/ae_cycle_gan/3models/ae_cyc.prototext | 454 --------- .../ae_cycle_gan/3models/ae_cyc2.prototext | 724 -------------- .../ae_cycle_gan/3models/cycle_gan.prototext | 899 ------------------ .../jag/ae_cycle_gan/cycgan_m1.prototext | 766 --------------- .../jag/ae_cycle_gan/cycgan_m2.prototext | 760 --------------- .../jag/ae_cycle_gan/cycgan_m3.prototext | 801 ---------------- .../data_reader_jag_conduit_lassen.prototext | 112 --- .../data_reader_jag_conduit_lustre.prototext | 112 --- .../jag/ae_cycle_gan/jag10k_data.prototext | 25 - .../ae_cycle_gan/jag_100M_metadata.prototext | 115 --- .../models/jag/ae_cycle_gan/vae1.prototext | 459 --------- .../models/jag/ae_cycle_gan/vae_cyc.prototext | 555 ----------- .../models/jag/cycle_gan/cycgan_m1.prototext | 547 ----------- .../cycle_gan/cycgan_m1_template.prototext | 66 -- .../models/jag/cycle_gan/cycgan_m2.prototext | 535 ----------- .../cycle_gan/cycgan_m2_template.prototext | 65 -- .../models/jag/cycle_gan/cycgan_m3.prototext | 597 ------------ .../cycle_gan/cycgan_m3_template.prototext | 65 -- .../jag/cycle_gan/generate_cycgan_m1.py | 253 ----- .../jag/cycle_gan/generate_cycgan_m2.py | 255 ----- .../jag/cycle_gan/generate_cycgan_m3.py | 257 ----- .../models/jag/cycle_gan/jag_data.prototext | 25 - 23 files changed, 8812 deletions(-) delete mode 100644 model_zoo/models/jag/ae_cycle_gan/3models/ae.prototext delete mode 100644 model_zoo/models/jag/ae_cycle_gan/3models/ae_cyc.prototext delete mode 100644 
model_zoo/models/jag/ae_cycle_gan/3models/ae_cyc2.prototext delete mode 100644 model_zoo/models/jag/ae_cycle_gan/3models/cycle_gan.prototext delete mode 100644 model_zoo/models/jag/ae_cycle_gan/cycgan_m1.prototext delete mode 100644 model_zoo/models/jag/ae_cycle_gan/cycgan_m2.prototext delete mode 100644 model_zoo/models/jag/ae_cycle_gan/cycgan_m3.prototext delete mode 100644 model_zoo/models/jag/ae_cycle_gan/data_reader_jag_conduit_lassen.prototext delete mode 100644 model_zoo/models/jag/ae_cycle_gan/data_reader_jag_conduit_lustre.prototext delete mode 100644 model_zoo/models/jag/ae_cycle_gan/jag10k_data.prototext delete mode 100644 model_zoo/models/jag/ae_cycle_gan/jag_100M_metadata.prototext delete mode 100644 model_zoo/models/jag/ae_cycle_gan/vae1.prototext delete mode 100644 model_zoo/models/jag/ae_cycle_gan/vae_cyc.prototext delete mode 100644 model_zoo/models/jag/cycle_gan/cycgan_m1.prototext delete mode 100644 model_zoo/models/jag/cycle_gan/cycgan_m1_template.prototext delete mode 100644 model_zoo/models/jag/cycle_gan/cycgan_m2.prototext delete mode 100644 model_zoo/models/jag/cycle_gan/cycgan_m2_template.prototext delete mode 100644 model_zoo/models/jag/cycle_gan/cycgan_m3.prototext delete mode 100644 model_zoo/models/jag/cycle_gan/cycgan_m3_template.prototext delete mode 100644 model_zoo/models/jag/cycle_gan/generate_cycgan_m1.py delete mode 100644 model_zoo/models/jag/cycle_gan/generate_cycgan_m2.py delete mode 100644 model_zoo/models/jag/cycle_gan/generate_cycgan_m3.py delete mode 100644 model_zoo/models/jag/cycle_gan/jag_data.prototext diff --git a/model_zoo/models/jag/ae_cycle_gan/3models/ae.prototext b/model_zoo/models/jag/ae_cycle_gan/3models/ae.prototext deleted file mode 100644 index 5f0b88111a6..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/3models/ae.prototext +++ /dev/null @@ -1,365 +0,0 @@ -#Example taken from: https://lc.llnl.gov/bitbucket/users/jjayaram/repos/deep-latent-spaces/browse/codes/dev/VAE-FCN/vae_fcn.py and -#https://lc.llnl.gov/bitbucket/users/jjayaram/repos/deep-latent-spaces/browse/codes/dev/VAE-FCN/run_vae.py -#Timestamp 02/26/2018 8:45AM -model { - name: "ae_model" - shareable_training_data_reader:false - serialize_io: true - data_layout: "data_parallel" - mini_batch_size: 256 - block_size: 256 - num_epochs:4 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "binary_cross_entropy" } - #layer_term { layer: "kl_divergence" } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { - layer_metric { - name: "mean squared error" - layer: "mean_squared_error" - } - } - - ################################################### - # Callbacks - ################################################### - callback { - print { - interval: 1 - } - } - callback { timer {} } - #callback { - # dump_activations{ - #basename: "/usr/workspace/wsa/jacobs32/github.saj.lbann/dump_acts_ae/" - # basename: "/usr/workspace/wsa/jacobs32/centralized-lbann/EuroViz/" - # interval: 1 - # layer_names: "image_data_dummy sigmoid" - # layer_names: "reconstruction" - # } - #} - #callback { - # save_images { - # image_prefix: "vae_fcn_images_" - # image_format: "jpg" - # } - #} - - ################################################### - # start of layers - 
################################################### - - ###################### - # Data - ###################### - #layer { - # name: "data" - # children: "encode1 reconstruction" - # data_layout: "data_parallel" - # input { - # target_mode: "reconstruction" - # } - #} - - layer { - input { - target_mode: "N/A" - } - name: "data" - data_layout: "data_parallel" - parents: " " - } - layer { - name: "slice_data" - data_layout: "data_parallel" - parents: "data" - children: "image_data_dummy param_data_id" - slice { - #slice_points: "0 16384 16389" - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - name: "image_data_dummy" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "data_parallel" - parents: "slice_data" - } - ###################### - # Encoder - ###################### - - # encode1 - layer { - #parents: "data" - parents: "image_data_dummy" - name: "encode1" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "encode1" - name: "encode1_elu" - data_layout: "data_parallel" - device_allocation: "cpu" - elu {} - } - layer { - parents: "encode1_elu" - name: "encode1_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode2 - layer { - parents: "encode1_dropout" - name: "encode2" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "encode2" - name: "encode2_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "encode2_tanh" - name: "encode2_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode3 - layer { - parents: "encode2_dropout" - name: "encode3" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "encode3" - name: "encode3_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "encode3_tanh" - name: "encode3_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - ###################### - # Latent space - ###################### - - layer { - parents: "encode3_dropout" - name: "z_mean" - data_layout: "data_parallel" - fully_connected { - num_neurons:20 - weight_initialization: "glorot_normal" - has_bias: true - } - } - layer { - #parents: "z_mean sample_exp_noise" - parents: "z_mean" - name: "sample" - data_layout: "data_parallel" - #sum {} - identity {} - } - - ###################### - # Decoder - ###################### - - # decode3 - layer { - parents: "sample" - name: "decode3" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "decode3" - name: "decode3_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "decode3_tanh" - name: "decode3_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode2 - layer { - parents: "decode3_dropout" - name: "decode2" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "decode2" - name: "decode2_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "decode2_tanh" - name: "decode2_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } 
- } - - # decode1 - layer { - parents: "decode2_dropout" - name: "decode1" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "decode1" - name: "decode1_elu" - data_layout: "data_parallel" - device_allocation: "cpu" - elu { - } - } - layer { - parents: "decode1_elu" - name: "decode1_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode0 - layer { - parents: "decode1_dropout" - name: "decode0" - data_layout: "data_parallel" - #num_neurons_from_data_reader: true - fully_connected { - weight_initialization: "glorot_normal" - #num_neurons: 16384 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] - has_bias: true - } - } - layer { - parents: "decode0" - name: "sigmoid" - data_layout: "data_parallel" - sigmoid {} - } - ###################### - # Reconstruction - ###################### - - layer { - parents: "sigmoid" - name: "reconstruction" - data_layout: "model_parallel" - split {} - } - layer { - parents: "reconstruction image_data_dummy" - name: "binary_cross_entropy" - data_layout: "model_parallel" - #binary_cross_entropy {} - mean_squared_error {} - } - layer { - parents: "reconstruction image_data_dummy" - name: "mean_squared_error" - data_layout: "model_parallel" - mean_squared_error {} - } - ####For metric, loss per individual sample - layer { - name: "ae_err" - data_layout: "model_parallel" - parents: "param_data_id mean_squared_error" - concatenation { - } - } - callback { - dump_outputs { - # directory:"/p/lscratchh/brainusr/jacobs32/EuroViz/ae_loss/" - # #directory:"/p/gpfs1/jacobs32/EuroViz3/ae_loss/" - directory:"ae_loss/" - layers: "ae_err" - execution_modes: "test" - } - } - ################################################### - # end of layers - ################################################### -} diff --git a/model_zoo/models/jag/ae_cycle_gan/3models/ae_cyc.prototext b/model_zoo/models/jag/ae_cycle_gan/3models/ae_cyc.prototext deleted file mode 100644 index c7931f84084..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/3models/ae_cyc.prototext +++ /dev/null @@ -1,454 +0,0 @@ -#Combines encoder portion ae with cycgan models for inference (forward prediction) in output space. 
-model { - name: "ae_cycgan_model" - shareable_training_data_reader:false - serialize_io: true - data_layout: "data_parallel" - mini_batch_size: 256 - block_size: 256 - num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "binary_cross_entropy" } - #layer_term { layer: "kl_divergence" } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { - layer_metric { - name: "mean squared error" - layer: "mean_squared_error" - } - } - ################################################### - # Callbacks - ################################################### - callback { - print { - interval: 1 - } - } - callback { timer {} } - # callback { - # save_images { - # image_prefix: "vae_fcn_images_" - # image_format: "jpg" - # } - # } - - ################################################### - # start of layers - ################################################### - - ###################### - # Data - ###################### - #Layer from cycle GAN - layer { - input { - target_mode: "N/A" - } - name: "data" - data_layout: "data_parallel" - parents: " " - } - layer { - name: "slice_data" - data_layout: "data_parallel" - parents: "data" - children: "image_data_id param_data_id" - slice { - #slice_points: "0 16384 16389" - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - name: "image_data_id" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - fully_connected { - num_neurons: 64 - #num_neurons: 256 - has_bias: true - } - name: "gen1fc1" - data_layout: "data_parallel" - weights: "gen1fc1linearity gen1fc1bias" - parents: "param_data_id" - } - layer { - relu { - } - name: "gen1relu1_1" - data_layout: "data_parallel" - parents: "gen1fc1" - } - layer { - fully_connected { - #num_neurons: 2048 - num_neurons: 512 - has_bias: true - } - name: "gen1fc2" - data_layout: "data_parallel" - weights: "gen1fc2linearity gen1fc2bias" - parents: "gen1relu1_1" - } - layer { - relu { - } - name: "gen1relu2_1" - data_layout: "data_parallel" - parents: "gen1fc2" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1_1" - data_layout: "data_parallel" - parents: "gen1relu2_1" - } - layer { - fully_connected { - #num_neurons: 8192 - num_neurons: 2048 - has_bias: true - } - name: "gen1fc3" - data_layout: "data_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1_1" - } - layer { - relu { - } - name: "gen1relu3_1" - data_layout: "data_parallel" - parents: "gen1fc3" - } - layer { - fully_connected { - #num_neurons: 16384 - #latent_dim - num_neurons: 20 - has_bias: true - } - name: "gen1fc4" - data_layout: "data_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3_1" - } - - weights { - name: "gen1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc4linearity" - he_normal_initializer { - } - } - - ###################### - # Encoder - ###################### - #Encoder not really used here - # encode1 - layer { - parents: "image_data_id" - name: "encode1" - 
data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "encode1" - name: "encode1_elu" - data_layout: "data_parallel" - device_allocation: "cpu" - elu {} - } - layer { - parents: "encode1_elu" - name: "encode1_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode2 - layer { - parents: "encode1_dropout" - name: "encode2" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "encode2" - name: "encode2_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "encode2_tanh" - name: "encode2_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode3 - layer { - parents: "encode2_dropout" - name: "encode3" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "encode3" - name: "encode3_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "encode3_tanh" - name: "encode3_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - ###################### - # Latent space - ###################### - - layer { - parents: "encode3_dropout" - name: "z_mean" - data_layout: "data_parallel" - fully_connected { - num_neurons:20 - weight_initialization: "glorot_normal" - has_bias: true - } - } - #layer { - # parents: "sample_exp sample_noise" - # name: "sample_exp_noise" - # data_layout: "data_parallel" - # hadamard {} - #} - layer { - # parents: "z_mean sample_exp_noise" - parents: "z_mean" - name: "image_data_dummy" - data_layout: "data_parallel" - # sum {} - identity {} - } - ####output of encoder not used, dangling - ###################### - # Decoder - ###################### - - # decode3 - layer { - #parents: "sample" - parents: "gen1fc4" - name: "decode3" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "decode3" - name: "decode3_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "decode3_tanh" - name: "decode3_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode2 - layer { - parents: "decode3_dropout" - name: "decode2" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "decode2" - name: "decode2_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "decode2_tanh" - name: "decode2_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode1 - layer { - parents: "decode2_dropout" - name: "decode1" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "decode1" - name: "decode1_elu" - data_layout: "data_parallel" - device_allocation: "cpu" - elu { - } - } - layer { - parents: "decode1_elu" - name: "decode1_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode0 - layer { - parents: "decode1_dropout" - name: "decode0" - data_layout: "data_parallel" - #num_neurons_from_data_reader: true - fully_connected { - weight_initialization: "glorot_normal" - #num_neurons: 16384 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 
1 ] - has_bias: true - } - } - layer { - parents: "decode0" - name: "sigmoid" - data_layout: "data_parallel" - sigmoid {} - } - - ###################### - # Reconstruction - ###################### - - layer { - parents: "sigmoid" - name: "reconstruction" - data_layout: "model_parallel" - split {} - } - layer { - parents: "reconstruction image_data_id" - name: "binary_cross_entropy" - data_layout: "model_parallel" - #binary_cross_entropy {} - mean_squared_error {} - } - layer { - parents: "reconstruction image_data_id" - name: "mean_squared_error" - data_layout: "model_parallel" - mean_squared_error {} - } - - ####For metric, loss per individual sample - layer { - name: "fw_out_loss" - data_layout: "model_parallel" - parents: "param_data_id mean_squared_error" - concatenation { - } - } - callback { - dump_outputs { - #directory:"/p/lscratchh/jacobs32/EuroViz/fw_out_loss/" - directory:"fw_out_loss/" - layers: "fw_out_loss" - execution_modes: "test" - } - } - callback { - save_model { - dir: "model" - disable_save_after_training: true - } - } - ################################################### - # end of layers - ################################################### -} diff --git a/model_zoo/models/jag/ae_cycle_gan/3models/ae_cyc2.prototext b/model_zoo/models/jag/ae_cycle_gan/3models/ae_cyc2.prototext deleted file mode 100644 index e2a6eb6085d..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/3models/ae_cyc2.prototext +++ /dev/null @@ -1,724 +0,0 @@ -#Augumented version of ae_cyc.prototext so we can have ae_loss, fw_latent_loss and fw_out_loss all in the same file instead of 3 files, a request from MLSI ML team. This augmentation involves replicating blocks for fw_model from cycle gan and encode from autoencoder. -#Streamlines inference to use of only 1 model checkpoint (saved by using this prototext in training). Weights are copied from autoencoder and cyclegan and saved after training. 
-model { - name: "ae_cycgan_model" - shareable_training_data_reader:false - serialize_io: true - data_layout: "data_parallel" - mini_batch_size: 256 - block_size: 256 - num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "binary_cross_entropy" } - #layer_term { layer: "kl_divergence" } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { - layer_metric { - name: "mean squared error" - #layer: "mean_squared_error" - layer: "fw_out_loss" - } - } - ################################################### - # Callbacks - ################################################### - callback { - print { - interval: 1 - } - } - callback { timer {} } - - ################################################### - # start of layers - ################################################### - - ###################### - # Data - ###################### - #Layer from cycle GAN - layer { - input { - target_mode: "N/A" - } - name: "data" - data_layout: "data_parallel" - parents: " " - } - layer { - name: "slice_data" - data_layout: "data_parallel" - parents: "data" - children: "image_data_id param_data_id" - slice { - #slice_points: "0 16384 16389" - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - name: "image_data_id" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - fully_connected { - num_neurons: 64 - #num_neurons: 256 - has_bias: true - } - name: "gen1fc1" - data_layout: "data_parallel" - weights: "gen1fc1linearity gen1fc1bias" - parents: "param_data_id" - } - layer { - relu { - } - name: "gen1relu1_1" - data_layout: "data_parallel" - parents: "gen1fc1" - } - layer { - fully_connected { - #num_neurons: 2048 - num_neurons: 512 - has_bias: true - } - name: "gen1fc2" - data_layout: "data_parallel" - weights: "gen1fc2linearity gen1fc2bias" - parents: "gen1relu1_1" - } - layer { - relu { - } - name: "gen1relu2_1" - data_layout: "data_parallel" - parents: "gen1fc2" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1_1" - data_layout: "data_parallel" - parents: "gen1relu2_1" - } - layer { - fully_connected { - #num_neurons: 8192 - num_neurons: 2048 - has_bias: true - } - name: "gen1fc3" - data_layout: "data_parallel" - weights: "gen1fc3linearity gen1fc3bias" - parents: "gen1dropout1_1" - } - layer { - relu { - } - name: "gen1relu3_1" - data_layout: "data_parallel" - parents: "gen1fc3" - } - layer { - fully_connected { - #num_neurons: 16384 - #latent_dim - num_neurons: 20 - has_bias: true - } - name: "gen1fc4" - data_layout: "data_parallel" - weights: "gen1fc4linearity gen1fc4bias" - parents: "gen1relu3_1" - } - - weights { - name: "gen1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc1bias" - } - weights { - name: "gen1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc2bias" - } - weights { - name: "gen1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc3bias" - } - weights { - name: "gen1fc4linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc4bias" - } - - ###################### - # Encoder - ###################### - #Encoder not 
really used here - # encode1 - layer { - parents: "image_data_id" - name: "encode1" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "encode1" - name: "encode1_elu" - data_layout: "data_parallel" - device_allocation: "cpu" - elu {} - } - layer { - parents: "encode1_elu" - name: "encode1_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode2 - layer { - parents: "encode1_dropout" - name: "encode2" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "encode2" - name: "encode2_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "encode2_tanh" - name: "encode2_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode3 - layer { - parents: "encode2_dropout" - name: "encode3" - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "encode3" - name: "encode3_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "encode3_tanh" - name: "encode3_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - ###################### - # Latent space - ###################### - - layer { - parents: "encode3_dropout" - name: "z_mean" - data_layout: "data_parallel" - fully_connected { - num_neurons:20 - weight_initialization: "glorot_normal" - has_bias: true - } - } - #layer { - # parents: "sample_exp sample_noise" - # name: "sample_exp_noise" - # data_layout: "data_parallel" - # hadamard {} - #} - layer { - # parents: "z_mean sample_exp_noise" - parents: "z_mean" - name: "image_data_dummy" - data_layout: "data_parallel" - # sum {} - identity {} - } - ####output of encoder goes to decoder and cycGAN duplicates - ###################### - # Decoder for foward output loss - ###################### - - # decode3 - layer { - #parents: "sample" - parents: "gen1fc4" - name: "decode3" - data_layout: "data_parallel" - weights: "decode3linearity decode3bias" - fully_connected { - num_neurons: 256 - #weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "decode3" - name: "decode3_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "decode3_tanh" - name: "decode3_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode2 - layer { - parents: "decode3_dropout" - name: "decode2" - data_layout: "data_parallel" - weights: "decode2linearity decode2bias" - fully_connected { - num_neurons: 256 - #weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "decode2" - name: "decode2_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "decode2_tanh" - name: "decode2_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode1 - layer { - parents: "decode2_dropout" - name: "decode1" - data_layout: "data_parallel" - weights: "decode1linearity decode1bias" - fully_connected { - num_neurons: 256 - #weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "decode1" - name: "decode1_elu" - data_layout: "data_parallel" - device_allocation: "cpu" - elu { - } - } - layer { - parents: "decode1_elu" - name: "decode1_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode0 - layer { - parents: "decode1_dropout" 
- name: "decode0" - data_layout: "data_parallel" - #num_neurons_from_data_reader: true - weights: "decode0linearity decode0bias" - fully_connected { - #weight_initialization: "glorot_normal" - #num_neurons: 16384 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] - has_bias: true - } - } - layer { - parents: "decode0" - name: "sigmoid" - data_layout: "data_parallel" - sigmoid {} - } - - ###################### - # Reconstruction - ###################### - - layer { - parents: "sigmoid" - name: "reconstruction" - data_layout: "model_parallel" - split {} - } - layer { - parents: "reconstruction image_data_id" - name: "binary_cross_entropy" - data_layout: "model_parallel" - #binary_cross_entropy {} - mean_squared_error {} - } - layer { - parents: "reconstruction image_data_id" - name: "fw_out_loss" - data_layout: "model_parallel" - mean_squared_error {} - } - - ####Decoder weights - weights { - name: "decode0linearity" - he_normal_initializer { - } - } - weights { - name: "decode0bias" - } - - weights { - name: "decode1linearity" - he_normal_initializer { - } - } - weights { - name: "decode1bias" - } - weights { - name: "decode2linearity" - he_normal_initializer { - } - } - weights { - name: "decode2bias" - } - weights { - name: "decode3linearity" - he_normal_initializer { - } - } - weights { - name: "decode3bias" - } - -#Decoder duplicated for ae_loss - # decode3 - layer { - #parents: "sample" - parents: "image_data_dummy" - name: "ae_decode3" - data_layout: "data_parallel" - weights: "decode3linearity decode3bias" - fully_connected { - num_neurons: 256 - #weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "ae_decode3" - name: "ae_decode3_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "ae_decode3_tanh" - name: "ae_decode3_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode2 - layer { - parents: "ae_decode3_dropout" - name: "ae_decode2" - data_layout: "data_parallel" - weights: "decode2linearity decode2bias" - fully_connected { - num_neurons: 256 - #weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "ae_decode2" - name: "ae_decode2_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "ae_decode2_tanh" - name: "ae_decode2_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode1 - layer { - parents: "ae_decode2_dropout" - name: "ae_decode1" - data_layout: "data_parallel" - weights: "decode1linearity decode1bias" - fully_connected { - num_neurons: 256 - #weight_initialization: "he_normal" - has_bias: true - } - } - layer { - parents: "ae_decode1" - name: "ae_decode1_elu" - data_layout: "data_parallel" - device_allocation: "cpu" - elu { - } - } - layer { - parents: "ae_decode1_elu" - name: "ae_decode1_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode0 - layer { - parents: "ae_decode1_dropout" - name: "ae_decode0" - data_layout: "data_parallel" - #num_neurons_from_data_reader: true - weights: "decode0linearity decode0bias" - fully_connected { - #weight_initialization: "glorot_normal" - #num_neurons: 16384 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] - has_bias: true - } - } - layer { - parents: "ae_decode0" - name: "ae_sigmoid" - data_layout: "data_parallel" - sigmoid {} - } - - ###################### - # Reconstruction - ###################### - - layer { - parents: "ae_sigmoid" - name: 
"ae_reconstruction" - data_layout: "data_parallel" - split {} - } - layer { - parents: "ae_reconstruction image_data_id" - name: "ae_loss" - data_layout: "data_parallel" - mean_squared_error {} - } - - ###Cycle GAN duplicated for latent loss dump - #Takes output of encoder as input - layer { - fully_connected { - num_neurons: 64 - #num_neurons: 256 - has_bias: true - } - name: "latent_gen1fc1" - data_layout: "data_parallel" - weights: "gen1fc1linearity gen1fc1bias" - parents: "param_data_id" - } - layer { - relu { - } - name: "latent_gen1relu1_1" - data_layout: "data_parallel" - parents: "latent_gen1fc1" - } - layer { - fully_connected { - #num_neurons: 2048 - num_neurons: 512 - has_bias: true - } - name: "latent_gen1fc2" - data_layout: "data_parallel" - weights: "gen1fc2linearity gen1fc2bias" - parents: "latent_gen1relu1_1" - } - layer { - relu { - } - name: "latent_gen1relu2_1" - data_layout: "data_parallel" - parents: "latent_gen1fc2" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "latent_gen1dropout1_1" - data_layout: "data_parallel" - parents: "latent_gen1relu2_1" - } - layer { - fully_connected { - #num_neurons: 8192 - num_neurons: 2048 - has_bias: true - } - name: "latent_gen1fc3" - data_layout: "data_parallel" - weights: "gen1fc3linearity gen1fc3bias" - parents: "latent_gen1dropout1_1" - } - layer { - relu { - } - name: "latent_gen1relu3_1" - data_layout: "data_parallel" - parents: "latent_gen1fc3" - } - layer { - fully_connected { - #num_neurons: 16384 - #latent_dim - num_neurons: 20 - has_bias: true - } - name: "latent_gen1fc4" - data_layout: "data_parallel" - weights: "gen1fc4linearity gen1fc4bias" - parents: "latent_gen1relu3_1" - } - - layer { - name: "gsample_minus_latentsample" - data_layout: "data_parallel" - parents: "latent_gen1fc4 image_data_dummy" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - name: "fw_latent_loss" - data_layout: "data_parallel" - l2_norm2 { - } - parents: "gsample_minus_latentsample" - } - - ####For metric, loss per individual sample - layer { - name: "ae_latent_out_losses" - data_layout: "model_parallel" - parents: "param_data_id ae_loss fw_latent_loss fw_out_loss" - #parents: "data z_mean ae_loss fw_latent_loss fw_out_loss" - concatenation { - } - } - callback { - dump_outputs { - #directory:"/p/lscratchh/jacobs32/EuroViz/fw_out_loss/" - directory:"ae_latent_out_losses/" - #directory:"save_img_acts/" - #ae_reconstruction === autoencoder reconstrcution - #reconstruction ==== cycgan+autoencoder reconstruction - #layers: "ae_reconstruction image_data_id reconstruction" - layers: "ae_latent_out_losses" - execution_modes: "test" - format: "npz" - } - } - callback { - save_model { - dir: "model" - disable_save_after_training: true - } - } - ################################################### - # end of layers - ################################################### -} diff --git a/model_zoo/models/jag/ae_cycle_gan/3models/cycle_gan.prototext b/model_zoo/models/jag/ae_cycle_gan/3models/cycle_gan.prototext deleted file mode 100644 index 7d38e4ca6bb..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/3models/cycle_gan.prototext +++ /dev/null @@ -1,899 +0,0 @@ -model { - name: "cycgan_model" - shareable_training_data_reader:false - serialize_io: true - procs_per_trainer:0 - objective_function { - l2_weight_regularization { - scale_factor: 0.0001 - } - layer_term { - scale_factor: 1.0 - layer: "disc1_real_bce" - } - layer_term { - scale_factor: 1.0 - layer: "disc1_fake_bce" - } - layer_term { - scale_factor: 0.05 - layer: "g_adv1_bce" - } 
- layer_term { - scale_factor: 0.025 - layer: "l_l2_y" - } - layer_term { - scale_factor: 1.0 - layer: "disc1_inv_real_bce" - } - layer_term { - scale_factor: 1.0 - layer: "disc1_inv_fake_bce" - } - layer_term { - scale_factor: 0.05 - layer: "g_inv_adv1_bce" - } - layer_term { - scale_factor: 0.025 - layer: "l_l2_x" - } - } - num_epochs: 4 - super_steps: 10 - metric { - layer_metric { - layer: "l_l2_y" - } - } - data_layout: "data_parallel" - layer { - input { - data_set_per_model: true - target_mode: "N/A" - } - name: "data" - data_layout: "data_parallel" - parents: " " - } - layer { - name: "zero" - data_layout: "data_parallel" - constant { - value: 0.0 - num_neurons: "1" - } - } - layer { - name: "one" - data_layout: "data_parallel" - constant { - value: 1.0 - num_neurons: "1" - } - } - layer { - name: "slice_data" - data_layout: "data_parallel" - parents: "data" - #children: "image_data_dummy param_data_id" - children: "image_data_id param_data_id" - slice { - #slice_points: "0 2500 2511" - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - #name: "image_data_dummy" - name: "image_data_id" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "data_parallel" - parents: "slice_data" - } -########Data space end here - ###################### - # Encoder from VAE - ###################### - - # encode1 - layer { - #parents: "data" - parents: "image_data_id" - name: "encode1" - freeze: true - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode1" - name: "encode1_elu" - data_layout: "data_parallel" - device_allocation: "cpu" - elu {} - } - layer { - parents: "encode1_elu" - name: "encode1_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode2 - layer { - parents: "encode1_dropout" - name: "encode2" - freeze: true - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode2" - name: "encode2_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "encode2_tanh" - name: "encode2_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode3 - layer { - parents: "encode2_dropout" - name: "encode3" - freeze: true - data_layout: "data_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode3" - name: "encode3_tanh" - data_layout: "data_parallel" - tanh {} - } - layer { - parents: "encode3_tanh" - name: "encode3_dropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.95 - } - } - - ###################### - # Latent space - ###################### - - layer { - parents: "encode3_dropout" - name: "z_mean" - freeze: true - data_layout: "data_parallel" - fully_connected { - num_neurons:20 - has_bias: true - } - } - layer { - #parents: "z_mean sample_exp_noise" - parents: "z_mean" - #name: "sample" - ###This is actually sample in latent space, call image_data_dummy for legacy - name: "image_data_dummy" - data_layout: "data_parallel" - #sum {} - identity {} - } - #####VAE Encoder ends here, sample feeds/replaces image data dummy - layer { - fully_connected { - num_neurons: 64 - has_bias: true - } - name: "gen1fc1" - data_layout: "data_parallel" - weights: "gen1fc1linearity gen1fc1bias" - parents: "param_data_id" - } - layer { - relu { - } - name: "gen1relu1" - data_layout: "data_parallel" - parents: "gen1fc1" - } - layer { - 
fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen1fc2" - data_layout: "data_parallel" - weights: "gen1fc2linearity gen1fc2bias" - parents: "gen1relu1" - } - layer { - relu { - } - name: "gen1relu2" - data_layout: "data_parallel" - parents: "gen1fc2" - } - #layer { - # dropout { - # keep_prob: 0.8 - # } - # name: "gen1dropout1" - # data_layout: "data_parallel" - # parents: "gen1relu2" - #} - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen1fc3" - data_layout: "data_parallel" - weights: "gen1fc3linearity gen1fc3bias" - #parents: "gen1dropout1" - parents: "gen1relu2" - } - layer { - relu { - } - name: "gen1relu3" - data_layout: "data_parallel" - parents: "gen1fc3" - } - layer { - fully_connected { - #num_neurons: 2500 - #get_slice_points_from_reader: "independent" - #get_num_neurons_of_slice_from_reader: [ 1 ] - #replace image_dim with latent_dim - num_neurons: 20 - has_bias: true - } - name: "gen1fc4" - data_layout: "data_parallel" - weights: "gen1fc4linearity gen1fc4bias" - parents: "gen1relu3" - } - #concat latenty sample (image_data_dummy) and param - layer { - name: "concat_latent_sample_n_param" - data_layout: "data_parallel" - parents: "image_data_dummy param_data_id" - concatenation { - } - } - layer { - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "d1fc1_real" - data_layout: "data_parallel" - weights: "d1fc1linearity d1fc1bias" - #parents: "data" - parents: "concat_latent_sample_n_param" - } - layer { - relu { - } - name: "d1relu1_real" - data_layout: "data_parallel" - parents: "d1fc1_real" - } - layer { - fully_connected { - num_neurons: 64 - has_bias: true - } - name: "d1fc2_real" - data_layout: "data_parallel" - weights: "d1fc2linearity d1fc2bias" - parents: "d1relu1_real" - } - layer { - relu { - } - name: "d1relu2_real" - data_layout: "data_parallel" - parents: "d1fc2_real" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "d1fc3_real" - data_layout: "data_parallel" - weights: "d1fc3linearity d1fc3bias" - parents: "d1relu2_real" - } - layer { - name: "concat_gsample_n_param" - data_layout: "data_parallel" - parents: "gen1fc4 param_data_id" - children: "d1_stop_gradient d2_dummy" - concatenation { - } - } - layer { - name: "d1_stop_gradient" - data_layout: "data_parallel" - parents: "concat_gsample_n_param" - stop_gradient { - } - } - layer { - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "d1fc1_fake" - data_layout: "data_parallel" - weights: "d1fc1linearity d1fc1bias" - parents: "d1_stop_gradient" - } - layer { - relu { - } - name: "d1relu1_fake" - data_layout: "data_parallel" - parents: "d1fc1_fake" - } - layer { - fully_connected { - num_neurons: 64 - has_bias: true - } - name: "d1fc2_fake" - data_layout: "data_parallel" - weights: "d1fc2linearity d1fc2bias" - parents: "d1relu1_fake" - } - layer { - relu { - } - name: "d1relu2_fake" - data_layout: "data_parallel" - parents: "d1fc2_fake" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "d1fc3_fake" - data_layout: "data_parallel" - weights: "d1fc3linearity d1fc3bias" - parents: "d1relu2_fake" - } - layer { - sigmoid_binary_cross_entropy { - } - name: "disc1_real_bce" - data_layout: "data_parallel" - parents: "d1fc3_real one" - } - layer { - sigmoid_binary_cross_entropy { - } - name: "disc1_fake_bce" - data_layout: "data_parallel" - parents: "d1fc3_fake zero" - } - layer { - identity { - } - name: "d2_dummy" - data_layout: "data_parallel" - parents: "concat_gsample_n_param" - 
} - layer { - freeze: true - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "d2fc1" - data_layout: "data_parallel" - parents: "d2_dummy" - } - layer { - relu { - } - name: "d2relu1" - data_layout: "data_parallel" - parents: "d2fc1" - } - layer { - freeze: true - fully_connected { - num_neurons: 64 - has_bias: true - } - name: "d2fc2" - data_layout: "data_parallel" - parents: "d2relu1" - } - layer { - relu { - } - name: "d2relu2" - data_layout: "data_parallel" - parents: "d2fc2" - } - layer { - freeze: true - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "d2fc3" - data_layout: "data_parallel" - parents: "d2relu2" - } - layer { - sigmoid_binary_cross_entropy { - } - name: "g_adv1_bce" - data_layout: "data_parallel" - parents: "d2fc3 one" - } - layer { - name: "gsample_minus_y" - data_layout: "data_parallel" - parents: "gen1fc4 image_data_dummy" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - name: "l_l2_y" - data_layout: "data_parallel" - l2_norm2 { - } - parents: "gsample_minus_y" - } - layer { - fully_connected { - num_neurons: 64 - has_bias: true - } - name: "gen2fc1" - data_layout: "data_parallel" - #weights: "gen2fc1linearity" - parents: "image_data_dummy" - } - layer { - relu { - } - name: "gen2relu1" - data_layout: "data_parallel" - parents: "gen2fc1" - } - layer { - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen2fc2" - data_layout: "data_parallel" - #weights: "gen2fc2linearity" - parents: "gen2relu1" - } - layer { - relu { - } - name: "gen2relu2" - data_layout: "data_parallel" - parents: "gen2fc2" - } - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen2fc3" - data_layout: "data_parallel" - #weights: "gen2fc3linearity" - parents: "gen2relu2" - } - layer { - relu { - } - name: "gen2relu3" - data_layout: "data_parallel" - parents: "gen2fc3" - } - layer { - fully_connected { - #num_neurons: 11 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4" - data_layout: "data_parallel" - #weights: "gen2fc4linearity" - parents: "gen2relu3" - } - layer { - name: "concat_param_n_img" - data_layout: "data_parallel" - parents: "param_data_id image_data_dummy" - concatenation { - } - } - layer { - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "d1_invfc1_real" - data_layout: "data_parallel" - weights: "d1_invfc1linearity d1_invfc1bias" - parents: "concat_param_n_img" - } - layer { - relu { - } - name: "d1_invrelu1_real" - data_layout: "data_parallel" - parents: "d1_invfc1_real" - } - layer { - fully_connected { - num_neurons: 64 - has_bias: true - } - name: "d1_invfc2_real" - data_layout: "data_parallel" - weights: "d1_invfc2linearity d1_invfc2bias" - parents: "d1_invrelu1_real" - } - layer { - relu { - } - name: "d1_invrelu2_real" - data_layout: "data_parallel" - parents: "d1_invfc2_real" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "d1_invfc3_real" - data_layout: "data_parallel" - weights: "d1_invfc3linearity d1_invfc3bias" - parents: "d1_invrelu2_real" - } - layer { - name: "concat_gsample2_n_img" - data_layout: "data_parallel" - parents: "gen2fc4 image_data_dummy" - children: "d1_inv_stop_gradient d2_inv_dummy" - concatenation { - } - } - layer { - name: "d1_inv_stop_gradient" - data_layout: "data_parallel" - parents: "concat_gsample2_n_img" - stop_gradient { - } - } - layer { - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "d1_invfc1_fake" - 
data_layout: "data_parallel" - weights: "d1_invfc1linearity d1_invfc1bias" - parents: "d1_inv_stop_gradient" - } - layer { - relu { - } - name: "d1_invrelu1_fake" - data_layout: "data_parallel" - parents: "d1_invfc1_fake" - } - layer { - fully_connected { - num_neurons: 64 - has_bias: true - } - name: "d1_invfc2_fake" - data_layout: "data_parallel" - weights: "d1_invfc2linearity d1_invfc2bias" - parents: "d1_invrelu1_fake" - } - layer { - relu { - } - name: "d1_invrelu2_fake" - data_layout: "data_parallel" - parents: "d1_invfc2_fake" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "d1_invfc3_fake" - data_layout: "data_parallel" - weights: "d1_invfc3linearity d1_invfc3bias" - parents: "d1_invrelu2_fake" - } - layer { - sigmoid_binary_cross_entropy { - } - name: "disc1_inv_real_bce" - data_layout: "data_parallel" - parents: "d1_invfc3_real one" - } - layer { - sigmoid_binary_cross_entropy { - } - name: "disc1_inv_fake_bce" - data_layout: "data_parallel" - parents: "d1_invfc3_fake zero" - } - layer { - identity { - } - name: "d2_inv_dummy" - data_layout: "data_parallel" - parents: "concat_gsample2_n_img" - } - layer { - freeze: true - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "d2_invfc1" - data_layout: "data_parallel" - parents: "d2_inv_dummy" - } - layer { - relu { - } - name: "d2_invrelu1" - data_layout: "data_parallel" - parents: "d2_invfc1" - } - layer { - freeze: true - fully_connected { - num_neurons: 64 - has_bias: true - } - name: "d2_invfc2" - data_layout: "data_parallel" - parents: "d2_invrelu1" - } - layer { - relu { - } - name: "d2_invrelu2" - data_layout: "data_parallel" - parents: "d2_invfc2" - } - layer { - freeze: true - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "d2_invfc3" - data_layout: "data_parallel" - parents: "d2_invrelu2" - } - layer { - sigmoid_binary_cross_entropy { - } - name: "g_inv_adv1_bce" - data_layout: "data_parallel" - parents: "d2_invfc3 one" - } - layer { - name: "gsample2_minus_x" - data_layout: "data_parallel" - parents: "gen2fc4 param_data_id" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - name: "l_l2_x" - data_layout: "data_parallel" - l2_norm2 { - } - parents: "gsample2_minus_x" - } - weights { - name: "gen1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc1bias" - } - weights { - name: "gen1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc2bias" - } - weights { - name: "gen1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc3bias" - } - weights { - name: "gen1fc4linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc4bias" - } - weights { - name: "d1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "d1fc1bias" - } - weights { - name: "d1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "d1fc2bias" - } - weights { - name: "d1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "d1fc3bias" - } - weights { - name: "gen2fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc4linearity" - he_normal_initializer { - } - } - weights { - name: "d1_invfc1linearity" - he_normal_initializer { - } - } - weights { - name: "d1_invfc1bias" - } - weights { - name: "d1_invfc2linearity" - he_normal_initializer { - } - } - weights { - name: "d1_invfc2bias" - } - weights { - 
name: "d1_invfc3linearity" - he_normal_initializer { - } - } - weights { - name: "d1_invfc3bias" - } - mini_batch_size: 256 - callback { - print { - interval: 10 - } - } - callback { - timer { - } - } - callback { gpu_memory_usage {} } - #callback { debug {} } - #callback { - # summary { - # dir: "." - # mat_interval: 25 - # } - #} - callback { - replace_weights { - source_layers: "d1fc1_real d1fc2_real d1fc3_real d1_invfc1_real d1_invfc2_real d1_invfc3_real" - destination_layers: "d2fc1 d2fc2 d2fc3 d2_invfc1 d2_invfc2 d2_invfc3" - batch_interval: 1 - } - } - #callback { - # ltfb { - # batch_interval: 100 - # metric: "l_l2_y_eval" - # low_score_wins: true - # weights: "gen1fc1linearity gen1fc1bias gen1fc2linearity gen1fc2bias gen1fc3linearity gen1fc3bias gen1fc4linearity gen1fc4bias gen2fc1_linearity_weights gen2fc1_bias_weights gen2fc2_linearity_weights gen2fc2_bias_weights gen2fc3_linearity_weights gen2fc3_bias_weights gen2fc4_linearity_weights gen2fc4_bias_weights" - - # } - # } - block_size: 256 - ####For metric, loss per individual sample - layer { - name: "fw_latent_loss" - data_layout: "model_parallel" - parents: "param_data_id l_l2_y" - concatenation { - } - } - - callback { - dump_outputs { - directory:"fw_latent_loss/" - #directory:"/p/gpfs1/jacobs32/EuroViz3/fw_latent_loss/" - #layer_names: "image_data_dummy gen1fc4 gsample_minus_y l_l2_y" - layers: "fw_latent_loss" - execution_modes: "test" - } - } - callback { save_model { dir: "model" } } -} diff --git a/model_zoo/models/jag/ae_cycle_gan/cycgan_m1.prototext b/model_zoo/models/jag/ae_cycle_gan/cycgan_m1.prototext deleted file mode 100644 index 41a071fab87..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/cycgan_m1.prototext +++ /dev/null @@ -1,766 +0,0 @@ -model { - name: "dis_model" - shareable_training_data_reader: true - serialize_io: true - objective_function { - l2_weight_regularization { - scale_factor: 0.0001 - } - layer_term { - scale_factor: 1.0 - layer: "disc1_real_eval" - } - layer_term { - scale_factor: 1.0 - layer: "disc1_fake_eval" - } - layer_term { - scale_factor: 1.0 - layer: "disc2_real_eval" - } - layer_term { - scale_factor: 1.0 - layer: "disc2_fake_eval" - } - } - num_epochs: 1 - data_layout: "model_parallel" -#### Data space - layer { - input { - target_mode: "N/A" - } - name: "data" - data_layout: "data_parallel" - parents: " " - } - layer { - name: "zero" - data_layout: "model_parallel" - constant { - value: 0.0 - num_neurons: "1" - } - } - layer { - name: "one" - data_layout: "model_parallel" - constant { - value: 1.0 - num_neurons: "1" - } - } - layer { - name: "slice_data" - data_layout: "model_parallel" - parents: "data" - children: "image_data_id param_data_id" - slice { - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - name: "image_data_id" - data_layout: "model_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "model_parallel" - parents: "slice_data" - } -########Data space end here - ###################### - # Encoder from VAE - ###################### - - # encode1 - layer { - #parents: "data" - parents: "image_data_id" - name: "encode1" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode1" - name: "encode1_elu" - data_layout: "model_parallel" - device_allocation: "cpu" - elu {} - } - layer { - parents: "encode1_elu" - name: "encode1_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # 
encode2 - layer { - parents: "encode1_dropout" - name: "encode2" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode2" - name: "encode2_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "encode2_tanh" - name: "encode2_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode3 - layer { - parents: "encode2_dropout" - name: "encode3" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode3" - name: "encode3_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "encode3_tanh" - name: "encode3_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - ###################### - # Latent space - ###################### - - layer { - parents: "encode3_dropout" - name: "z_mean" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons:20 - has_bias: true - } - } - layer { - parents: "encode3_dropout" - name: "z_log_sigma" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons:20 - has_bias: true - } - } - - ###################### - # KL divergence - ###################### - - layer { - name: "kl_one" - data_layout: "model_parallel" - constant { - value: 1.0 - num_neurons: "20" - } - } - layer { - parents: "z_mean" - name: "kl_z_mean2" - data_layout: "model_parallel" - square {} - } - layer { - parents: "z_log_sigma" - name: "kl_exp" - data_layout: "model_parallel" - exp {} - } - layer { - parents: "kl_one z_log_sigma kl_z_mean2 kl_exp" - name: "kl_full" - data_layout: "model_parallel" - weighted_sum { - scaling_factors: "-0.5 -0.5 0.5 0.5" - } - } - layer { - parents: "kl_full" - name: "kl_sum" - data_layout: "data_parallel" - reduction { - mode: "sum" - } - } - layer { - parents: "kl_sum" - name: "kl_divergence" - data_layout: "data_parallel" - evaluation {} - } - - ###################### - # Sample from latent space - ###################### - - layer { - parents: "z_log_sigma" - name: "sample_half" - data_layout: "model_parallel" - weighted_sum { - scaling_factors: "0.5" - } - } - layer { - parents: "sample_half" - name: "sample_exp" - data_layout: "model_parallel" - exp {} - } - layer { - name: "sample_noise" - data_layout: "model_parallel" - gaussian { - mean: 0.0 - stdev: 1.0 - neuron_dims: "20" - } - } - layer { - parents: "sample_exp sample_noise" - name: "sample_exp_noise" - data_layout: "model_parallel" - hadamard {} - } - layer { - parents: "z_mean sample_exp_noise" - #name: "sample" - ###This is actually sample, call image_data_dummy for legacy - name: "image_data_dummy" - data_layout: "model_parallel" - sum {} - } - #####VAE Encoder ends here, sample feeds/replaces image data dummy - - - layer { - freeze: true - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen1fc1" - data_layout: "model_parallel" - weights: "gen1fc1linearity" - parents: "param_data_id" - } - layer { - relu { - } - name: "gen1relu1" - data_layout: "model_parallel" - parents: "gen1fc1" - } - layer { - freeze: true - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen1fc2" - data_layout: "model_parallel" - weights: "gen1fc2linearity" - parents: "gen1relu1" - } - layer { - relu { - } - name: "gen1relu2" - data_layout: "model_parallel" - parents: "gen1fc2" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1" - data_layout: "model_parallel" - parents: 
"gen1relu2" - } - layer { - freeze: true - fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen1fc3" - data_layout: "model_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1" - } - layer { - relu { - } - name: "gen1relu3" - data_layout: "model_parallel" - parents: "gen1fc3" - } - layer { - freeze: true - fully_connected { - #num_neurons: 16384 - #replace image_dim with latent_dim - num_neurons: 20 - has_bias: true - } - name: "gen1fc4" - data_layout: "model_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3" - } - layer { - freeze: true - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen2fc1" - data_layout: "model_parallel" - weights: "gen2fc1linearity" - parents: "image_data_dummy" - } - layer { - relu { - } - name: "gen2relu1" - data_layout: "model_parallel" - parents: "gen2fc1" - } - layer { - freeze: true - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen2fc2" - data_layout: "model_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1" - } - layer { - relu { - } - name: "gen2relu2" - data_layout: "model_parallel" - parents: "gen2fc2" - } - layer { - freeze: true - fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen2fc3" - data_layout: "model_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2" - } - layer { - relu { - } - name: "gen2relu3" - data_layout: "model_parallel" - parents: "gen2fc3" - } - layer { - freeze: true - fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4" - data_layout: "model_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3" - } - #concat latenty sample (image_data_dummy) and param - layer { - name: "concat_latent_sample_n_param" - data_layout: "model_parallel" - parents: "image_data_dummy param_data_id" - concatenation { - } - } - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "disc1fc1_real" - data_layout: "model_parallel" - weights: "disc1fc1linearity" - #parents: "data" - parents: "concat_latent_sample_n_param" - } - layer { - relu { - } - name: "disc1relu1_real" - data_layout: "model_parallel" - parents: "disc1fc1_real" - } - layer { - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "disc1fc2_real" - data_layout: "model_parallel" - weights: "disc1fc2linearity" - parents: "disc1relu1_real" - } - layer { - relu { - } - name: "disc1relu2_real" - data_layout: "model_parallel" - parents: "disc1fc2_real" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "disc1fc3_real" - data_layout: "model_parallel" - weights: "disc1fc3linearity" - parents: "disc1relu2_real" - } - layer { - name: "concat_gsample_n_param" - data_layout: "model_parallel" - parents: "gen1fc4 param_data_id" - concatenation { - } - } - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "disc1fc1_fake" - data_layout: "model_parallel" - weights: "disc1fc1linearity" - parents: "concat_gsample_n_param" - } - layer { - relu { - } - name: "disc1relu1_fake" - data_layout: "model_parallel" - parents: "disc1fc1_fake" - } - layer { - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "disc1fc2_fake" - data_layout: "model_parallel" - weights: "disc1fc2linearity" - parents: "disc1relu1_fake" - } - layer { - relu { - } - name: "disc1relu2_fake" - data_layout: "model_parallel" - parents: "disc1fc2_fake" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true 
- } - name: "disc1fc3_fake" - data_layout: "model_parallel" - weights: "disc1fc3linearity" - parents: "disc1relu2_fake" - } - layer { - name: "concat_param_n_img" - data_layout: "model_parallel" - parents: "param_data_id image_data_dummy" - concatenation { - } - } - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "disc2fc1_real" - data_layout: "model_parallel" - weights: "disc2fc1linearity" - parents: "concat_param_n_img" - } - layer { - relu { - } - name: "disc2relu1_real" - data_layout: "model_parallel" - parents: "disc2fc1_real" - } - layer { - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "disc2fc2_real" - data_layout: "model_parallel" - weights: "disc2fc2linearity" - parents: "disc2relu1_real" - } - layer { - relu { - } - name: "disc2relu2_real" - data_layout: "model_parallel" - parents: "disc2fc2_real" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "disc2fc3_real" - data_layout: "model_parallel" - weights: "disc2fc3linearity" - parents: "disc2relu2_real" - } - layer { - name: "concat_gsample2_n_img" - data_layout: "model_parallel" - parents: "gen2fc4 image_data_dummy" - concatenation { - } - } - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "disc2fc1_fake" - data_layout: "model_parallel" - weights: "disc2fc1linearity" - parents: "concat_gsample2_n_img" - } - layer { - relu { - } - name: "disc2relu1_fake" - data_layout: "model_parallel" - parents: "disc2fc1_fake" - } - layer { - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "disc2fc2_fake" - data_layout: "model_parallel" - weights: "disc2fc2linearity" - parents: "disc2relu1_fake" - } - layer { - relu { - } - name: "disc2relu2_fake" - data_layout: "model_parallel" - parents: "disc2fc2_fake" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "disc2fc3_fake" - data_layout: "model_parallel" - weights: "disc2fc3linearity" - parents: "disc2relu2_fake" - } - layer { - name: "disc1_real_bce" - data_layout: "model_parallel" - parents: "disc1fc3_real one" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "disc1_real_eval" - data_layout: "model_parallel" - parents: "disc1_real_bce" - evaluation { - } - } - layer { - name: "disc1_fake_bce" - data_layout: "model_parallel" - parents: "disc1fc3_fake zero" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "disc1_fake_eval" - data_layout: "model_parallel" - parents: "disc1_fake_bce" - evaluation { - } - } - layer { - name: "disc2_real_bce" - data_layout: "model_parallel" - parents: "disc2fc3_real one" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "disc2_real_eval" - data_layout: "model_parallel" - parents: "disc2_real_bce" - evaluation { - } - } - layer { - name: "disc2_fake_bce" - data_layout: "model_parallel" - parents: "disc2fc3_fake zero" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "disc2_fake_eval" - data_layout: "model_parallel" - parents: "disc2_fake_bce" - evaluation { - } - } - weights { - name: "gen1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc4linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc3linearity" - he_normal_initializer { - } - } - weights { 
- name: "gen2fc4linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc1linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc2linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc3linearity" - he_normal_initializer { - } - } - mini_batch_size: 256 - callback { - print { - interval: 1 - } - } - callback { timer {} } - callback { - save_model { - dir: "model" - disable_save_after_training: true - } - } - block_size: 256 - super_steps: 20000 - num_batches: 1 -} diff --git a/model_zoo/models/jag/ae_cycle_gan/cycgan_m2.prototext b/model_zoo/models/jag/ae_cycle_gan/cycgan_m2.prototext deleted file mode 100644 index 9a6715a0fc9..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/cycgan_m2.prototext +++ /dev/null @@ -1,760 +0,0 @@ -model { - name: "fw_model" - shareable_training_data_reader: true - serialize_io: true - objective_function { - l2_weight_regularization { - scale_factor: 0.0001 - } - layer_term { - scale_factor: 0.05 - layer: "g_adv1_eval" - } - layer_term { - scale_factor: 0.025 - layer: "l_l2_y" - } - } - num_epochs: 1 - data_layout: "model_parallel" - layer { - input { - target_mode: "N/A" - } - name: "data" - data_layout: "data_parallel" - parents: " " - } - layer { - name: "slice_data" - data_layout: "model_parallel" - parents: "data" - children: "image_data_id param_data_id" - slice { - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - name: "image_data_id" - data_layout: "model_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "model_parallel" - parents: "slice_data" - } -########Data space end here - ###################### - # Encoder from VAE - ###################### - - # encode1 - layer { - #parents: "data" - parents: "image_data_id" - name: "encode1" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode1" - name: "encode1_elu" - data_layout: "model_parallel" - elu {} - } - layer { - parents: "encode1_elu" - name: "encode1_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode2 - layer { - parents: "encode1_dropout" - name: "encode2" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode2" - name: "encode2_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "encode2_tanh" - name: "encode2_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode3 - layer { - parents: "encode2_dropout" - name: "encode3" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode3" - name: "encode3_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "encode3_tanh" - name: "encode3_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - ###################### - # Latent space - ###################### - - layer { - parents: "encode3_dropout" - name: "z_mean" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons:20 - has_bias: true - } - } - layer { - parents: "encode3_dropout" - name: "z_log_sigma" - freeze: true - 
data_layout: "model_parallel" - fully_connected { - num_neurons:20 - has_bias: true - } - } - - ###################### - # KL divergence - ###################### - - layer { - name: "kl_one" - data_layout: "model_parallel" - constant { - value: 1.0 - num_neurons: "20" - } - } - layer { - parents: "z_mean" - name: "kl_z_mean2" - data_layout: "model_parallel" - square {} - } - layer { - parents: "z_log_sigma" - name: "kl_exp" - data_layout: "model_parallel" - exp {} - } - layer { - parents: "kl_one z_log_sigma kl_z_mean2 kl_exp" - name: "kl_full" - data_layout: "model_parallel" - weighted_sum { - scaling_factors: "-0.5 -0.5 0.5 0.5" - } - } - layer { - parents: "kl_full" - name: "kl_sum" - data_layout: "data_parallel" - reduction { - mode: "sum" - } - } - layer { - parents: "kl_sum" - name: "kl_divergence" - data_layout: "data_parallel" - evaluation {} - } - - ###################### - # Sample from latent space - ###################### - - layer { - parents: "z_log_sigma" - name: "sample_half" - data_layout: "model_parallel" - weighted_sum { - scaling_factors: "0.5" - } - } - layer { - parents: "sample_half" - name: "sample_exp" - data_layout: "model_parallel" - exp {} - } - layer { - name: "sample_noise" - data_layout: "model_parallel" - gaussian { - mean: 0.0 - stdev: 1.0 - neuron_dims: "20" - } - } - layer { - parents: "sample_exp sample_noise" - name: "sample_exp_noise" - data_layout: "model_parallel" - hadamard {} - } - layer { - parents: "z_mean sample_exp_noise" - #name: "sample" - ###This is actually sample, call image_data_dummy for legacy - name: "image_data_dummy" - data_layout: "model_parallel" - sum {} - } - #####VAE Encoder ends here, sample feeds/replaces image data dummy - - layer { - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen1fc1_1" - data_layout: "model_parallel" - weights: "gen1fc1linearity" - parents: "param_data_id" - } - layer { - relu { - } - name: "gen1relu1_1" - data_layout: "model_parallel" - parents: "gen1fc1_1" - } - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen1fc2_1" - data_layout: "model_parallel" - weights: "gen1fc2linearity" - parents: "gen1relu1_1" - } - layer { - relu { - } - name: "gen1relu2_1" - data_layout: "model_parallel" - parents: "gen1fc2_1" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1_1" - data_layout: "model_parallel" - parents: "gen1relu2_1" - } - layer { - fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen1fc3_1" - data_layout: "model_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1_1" - } - layer { - relu { - } - name: "gen1relu3_1" - data_layout: "model_parallel" - parents: "gen1fc3_1" - } - layer { - fully_connected { - #num_neurons: 16384 - #replace image_dim with latent_dim - num_neurons: 20 - has_bias: true - } - name: "gen1fc4_1" - data_layout: "model_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3_1" - } - layer { - name: "concat_gsample_n_param" - data_layout: "model_parallel" - parents: "gen1fc4_1 param_data_id" - concatenation { - } - } - layer { - freeze: true - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "disc1fc1_real" - data_layout: "model_parallel" - weights: "disc1fc1linearity" - parents: "concat_gsample_n_param" - } - layer { - relu { - } - name: "disc1relu1_real" - data_layout: "model_parallel" - parents: "disc1fc1_real" - } - layer { - freeze: true - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "disc1fc2_real" - data_layout: "model_parallel" - 
weights: "disc1fc2linearity" - parents: "disc1relu1_real" - } - layer { - relu { - } - name: "disc1relu2_real" - data_layout: "model_parallel" - parents: "disc1fc2_real" - } - layer { - freeze: true - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "disc1fc3_real" - data_layout: "model_parallel" - weights: "disc1fc3linearity" - parents: "disc1relu2_real" - } - layer { - name: "one" - data_layout: "model_parallel" - constant { - value: 1.0 - num_neurons: "1" - } - } - layer { - name: "g_adv1_bce" - data_layout: "model_parallel" - parents: "disc1fc3_real one" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "g_adv1_eval" - data_layout: "model_parallel" - parents: "g_adv1_bce" - evaluation { - } - } - layer { - freeze: true - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen2fc1_y" - data_layout: "model_parallel" - weights: "gen2fc1linearity" - parents: "image_data_dummy" - } - layer { - relu { - } - name: "gen2relu1_y" - data_layout: "model_parallel" - parents: "gen2fc1_y" - } - layer { - freeze: true - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen2fc2_y" - data_layout: "model_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1_y" - } - layer { - relu { - } - name: "gen2relu2_y" - data_layout: "model_parallel" - parents: "gen2fc2_y" - } - layer { - freeze: true - fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen2fc3_y" - data_layout: "model_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2_y" - } - layer { - relu { - } - name: "gen2relu3_y" - data_layout: "model_parallel" - parents: "gen2fc3_y" - } - layer { - freeze: true - fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4_y" - data_layout: "model_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3_y" - } - layer { - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen1fc1_2" - data_layout: "model_parallel" - weights: "gen1fc1linearity" - parents: "gen2fc4_y" - } - layer { - relu { - } - name: "gen1relu1_2" - data_layout: "model_parallel" - parents: "gen1fc1_2" - } - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen1fc2_2" - data_layout: "model_parallel" - weights: "gen1fc2linearity" - parents: "gen1relu1_2" - } - layer { - relu { - } - name: "gen1relu2_2" - data_layout: "model_parallel" - parents: "gen1fc2_2" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1_2" - data_layout: "model_parallel" - parents: "gen1relu2_2" - } - layer { - fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen1fc3_2" - data_layout: "model_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1_2" - } - layer { - relu { - } - name: "gen1relu3_2" - data_layout: "model_parallel" - parents: "gen1fc3_2" - } - layer { - fully_connected { - #num_neurons: 16384 - #replace image_dim with latent_dim - num_neurons: 20 - has_bias: true - } - name: "gen1fc4_2" - data_layout: "model_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3_2" - } - layer { - name: "cycy_minus_y" - data_layout: "model_parallel" - parents: "gen1fc4_2 image_data_dummy" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - abs { - } - name: "L_cyc_y" - data_layout: "model_parallel" - device_allocation: "cpu" - parents: "cycy_minus_y" - } - layer { - name: "L_cyc_y_eval" - data_layout: "model_parallel" - parents: "L_cyc_y" - evaluation { - } - } - layer { - freeze: true - 
fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen2fc1_gsample" - data_layout: "model_parallel" - weights: "gen2fc1linearity" - parents: "gen1fc4_1" - } - layer { - relu { - } - name: "gen2relu1_gsample" - data_layout: "model_parallel" - parents: "gen2fc1_gsample" - } - layer { - freeze: true - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen2fc2_gsample" - data_layout: "model_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1_gsample" - } - layer { - relu { - } - name: "gen2relu2_gsample" - data_layout: "model_parallel" - parents: "gen2fc2_gsample" - } - layer { - freeze: true - fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen2fc3_gsample" - data_layout: "model_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2_gsample" - } - layer { - relu { - } - name: "gen2relu3_gsample" - data_layout: "model_parallel" - parents: "gen2fc3_gsample" - } - layer { - freeze: true - fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4_gsample" - data_layout: "model_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3_gsample" - } - layer { - name: "cycx_minus_x" - data_layout: "model_parallel" - parents: "gen2fc4_gsample param_data_id" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - abs { - } - name: "L_cyc_x" - device_allocation: "cpu" - data_layout: "model_parallel" - parents: "cycx_minus_x" - } - layer { - name: "L_cyc_x_eval" - data_layout: "model_parallel" - parents: "L_cyc_x" - evaluation { - } - } - layer { - name: "gsample_minus_y" - data_layout: "model_parallel" - parents: "gen1fc4_1 image_data_dummy" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - l2_norm2 { - } - name: "l_l2_y" - device_allocation: "cpu" - data_layout: "model_parallel" - parents: "gsample_minus_y" - } - ####For metric, loss per individual sample - layer { - name: "fw_latent_loss" - data_layout: "model_parallel" - parents: "param_data_id l_l2_y" - concatenation { - } - } - weights { - name: "gen1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc4linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc4linearity" - he_normal_initializer { - } - } - mini_batch_size: 256 - callback { - print { - interval: 1 - } - } - callback { timer {} } - callback { - dump_outputs { - #directory: "/p/lscratchh/jacobs32/EuroViz/fw_latent_loss/" - #general directory: "/p/gpfs1/jacobs32/EuroViz2/fw_latent_loss/" - directory: "fw_latent_loss/" - layers: "fw_latent_loss" - execution_modes: "test" - format: "npy" - } - } - #callback { - # dump_outputs { - # directory: "/dir/to/dump_y_activations/" - # directory: "/usr/workspace/wsa/jacobs32/github.saj.lbann/jags10K_multi/cycgan_m2/" - # batch_interval: 100 - # layers: "image_data_dummy gen1fc4_1 l_l2_y" - # execution_modes: "test" - # } - #} - callback { - save_model { - 
dir: "model" - disable_save_after_training: true - } - } - block_size: 256 - super_steps: 20000 - num_batches: 1 -} diff --git a/model_zoo/models/jag/ae_cycle_gan/cycgan_m3.prototext b/model_zoo/models/jag/ae_cycle_gan/cycgan_m3.prototext deleted file mode 100644 index 41005af6f15..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/cycgan_m3.prototext +++ /dev/null @@ -1,801 +0,0 @@ -model { - name: "inv_model" - shareable_training_data_reader: true - serialize_io: true - objective_function { - l2_weight_regularization { - scale_factor: 0.0001 - } - layer_term { - scale_factor: 0.05 - layer: "g_adv2_eval" - } - layer_term { - scale_factor: 0.025 - layer: "l_l2_x" - } - } - num_epochs: 1 - data_layout: "model_parallel" - layer { - input { - target_mode: "N/A" - } - name: "data" - data_layout: "data_parallel" - parents: " " - } - layer { - name: "slice_data" - data_layout: "model_parallel" - parents: "data" - children: "image_data_id param_data_id" - slice { - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - name: "image_data_id" - data_layout: "model_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "model_parallel" - parents: "slice_data" - } -########Data space end here - ###################### - # Encoder from VAE - ###################### - - # encode1 - layer { - #parents: "data" - parents: "image_data_id" - name: "encode1" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode1" - name: "encode1_elu" - data_layout: "model_parallel" - elu {} - } - layer { - parents: "encode1_elu" - name: "encode1_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode2 - layer { - parents: "encode1_dropout" - name: "encode2" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode2" - name: "encode2_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "encode2_tanh" - name: "encode2_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode3 - layer { - parents: "encode2_dropout" - name: "encode3" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode3" - name: "encode3_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "encode3_tanh" - name: "encode3_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - ###################### - # Latent space - ###################### - - layer { - parents: "encode3_dropout" - name: "z_mean" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons:20 - has_bias: true - } - } - layer { - parents: "encode3_dropout" - name: "z_log_sigma" - freeze: true - data_layout: "model_parallel" - fully_connected { - num_neurons:20 - has_bias: true - } - } - - ###################### - # KL divergence - ###################### - - layer { - name: "kl_one" - data_layout: "model_parallel" - constant { - value: 1.0 - num_neurons: "20" - } - } - layer { - parents: "z_mean" - name: "kl_z_mean2" - data_layout: "model_parallel" - square {} - } - layer { - parents: "z_log_sigma" - name: "kl_exp" - data_layout: "model_parallel" - exp {} - } - layer { - parents: "kl_one z_log_sigma kl_z_mean2 kl_exp" - name: "kl_full" - data_layout: "model_parallel" - weighted_sum { - scaling_factors: "-0.5 -0.5 0.5 
0.5" - } - } - layer { - parents: "kl_full" - name: "kl_sum" - data_layout: "data_parallel" - reduction { - mode: "sum" - } - } - layer { - parents: "kl_sum" - name: "kl_divergence" - data_layout: "data_parallel" - evaluation {} - } - - ###################### - # Sample from latent space - ###################### - - layer { - parents: "z_log_sigma" - name: "sample_half" - data_layout: "model_parallel" - weighted_sum { - scaling_factors: "0.5" - } - } - layer { - parents: "sample_half" - name: "sample_exp" - data_layout: "model_parallel" - exp {} - } - layer { - name: "sample_noise" - data_layout: "model_parallel" - gaussian { - mean: 0.0 - stdev: 1.0 - neuron_dims: "20" - } - } - layer { - parents: "sample_exp sample_noise" - name: "sample_exp_noise" - data_layout: "model_parallel" - hadamard {} - } - layer { - parents: "z_mean sample_exp_noise" - #name: "sample" - ###This is actually sample, call image_data_dummy for legacy - name: "image_data_dummy" - data_layout: "model_parallel" - sum {} - } - #####VAE Encoder ends here, sample feeds/replaces image data dummy - layer { - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen2fc1_1" - data_layout: "model_parallel" - weights: "gen2fc1linearity" - parents: "image_data_dummy" - } - layer { - relu { - } - name: "gen2relu1_1" - data_layout: "model_parallel" - parents: "gen2fc1_1" - } - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen2fc2_1" - data_layout: "model_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1_1" - } - layer { - relu { - } - name: "gen2relu2_1" - data_layout: "model_parallel" - parents: "gen2fc2_1" - } - layer { - fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen2fc3_1" - data_layout: "model_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2_1" - } - layer { - relu { - } - name: "gen2relu3_1" - data_layout: "model_parallel" - parents: "gen2fc3_1" - } - layer { - fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4_1" - data_layout: "model_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3_1" - } - layer { - name: "concat_gsample2_n_img" - data_layout: "model_parallel" - parents: "gen2fc4_1 image_data_dummy" - concatenation { - } - } - layer { - freeze: true - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "disc2fc1_real" - data_layout: "model_parallel" - weights: "disc2fc1linearity" - parents: "concat_gsample2_n_img" - } - layer { - relu { - } - name: "disc2relu1_real" - data_layout: "model_parallel" - parents: "disc2fc1_real" - } - layer { - freeze: true - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "disc2fc2_real" - data_layout: "model_parallel" - weights: "disc2fc2linearity" - parents: "disc2relu1_real" - } - layer { - relu { - } - name: "disc2relu2_real" - data_layout: "model_parallel" - parents: "disc2fc2_real" - } - layer { - freeze: true - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "disc2fc3_real" - data_layout: "model_parallel" - weights: "disc2fc3linearity" - parents: "disc2relu2_real" - } - layer { - name: "one" - data_layout: "model_parallel" - constant { - value: 1.0 - num_neurons: "1" - } - } - layer { - name: "g_adv2_bce" - data_layout: "model_parallel" - parents: "disc2fc3_real one" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "g_adv2_eval" - data_layout: "model_parallel" - parents: "g_adv2_bce" - evaluation { - } - } - layer { - 
fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen2fc1_y" - data_layout: "model_parallel" - weights: "gen2fc1linearity" - parents: "image_data_dummy" - } - layer { - relu { - } - name: "gen2relu1_y" - data_layout: "model_parallel" - parents: "gen2fc1_y" - } - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen2fc2_y" - data_layout: "model_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1_y" - } - layer { - relu { - } - name: "gen2relu2_y" - data_layout: "model_parallel" - parents: "gen2fc2_y" - } - layer { - fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen2fc3_y" - data_layout: "model_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2_y" - } - layer { - relu { - } - name: "gen2relu3_y" - data_layout: "model_parallel" - parents: "gen2fc3_y" - } - layer { - fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4_y" - data_layout: "model_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3_y" - } - layer { - freeze: true - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen1fc1_2" - data_layout: "model_parallel" - weights: "gen1fc1linearity" - parents: "gen2fc4_y" - } - layer { - relu { - } - name: "gen1relu1_2" - data_layout: "model_parallel" - parents: "gen1fc1_2" - } - layer { - freeze: true - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen1fc2_2" - data_layout: "model_parallel" - weights: "gen1fc2linearity" - parents: "gen1relu1_2" - } - layer { - relu { - } - name: "gen1relu2_2" - data_layout: "model_parallel" - parents: "gen1fc2_2" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1_2" - data_layout: "model_parallel" - parents: "gen1relu2_2" - } - layer { - freeze: true - fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen1fc3_2" - data_layout: "model_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1_2" - } - layer { - relu { - } - name: "gen1relu3_2" - data_layout: "model_parallel" - parents: "gen1fc3_2" - } - layer { - freeze: true - fully_connected { - #num_neurons: 16384 - #replace image_dim with latent_dim - num_neurons: 20 - has_bias: true - } - name: "gen1fc4_2" - data_layout: "model_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3_2" - } - layer { - name: "cycy_minus_y" - data_layout: "model_parallel" - parents: "gen1fc4_2 image_data_dummy" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - abs { - } - name: "L_cyc_y" - data_layout: "model_parallel" - device_allocation: "cpu" - parents: "cycy_minus_y" - } - layer { - name: "L_cyc_y_eval" - data_layout: "model_parallel" - parents: "L_cyc_y" - evaluation { - } - } - layer { - freeze: true - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen1fc1_1" - data_layout: "model_parallel" - weights: "gen1fc1linearity" - parents: "param_data_id" - } - layer { - relu { - } - name: "gen1relu1_1" - data_layout: "model_parallel" - parents: "gen1fc1_1" - } - layer { - freeze: true - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen1fc2_1" - data_layout: "model_parallel" - weights: "gen1fc2linearity" - parents: "gen1relu1_1" - } - layer { - relu { - } - name: "gen1relu2_1" - data_layout: "model_parallel" - parents: "gen1fc2_1" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1_1" - data_layout: "model_parallel" - parents: "gen1relu2_1" - } - layer { - freeze: true - 
fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen1fc3_1" - data_layout: "model_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1_1" - } - layer { - relu { - } - name: "gen1relu3_1" - data_layout: "model_parallel" - parents: "gen1fc3_1" - } - layer { - freeze: true - fully_connected { - #num_neurons: 16384 - #replace image_dim with latent_dim - num_neurons: 20 - has_bias: true - } - name: "gen1fc4_1" - data_layout: "model_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3_1" - } - layer { - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen2fc1_gsample" - data_layout: "model_parallel" - weights: "gen2fc1linearity" - parents: "gen1fc4_1" - } - layer { - relu { - } - name: "gen2relu1_gsample" - data_layout: "model_parallel" - parents: "gen2fc1_gsample" - } - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen2fc2_gsample" - data_layout: "model_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1_gsample" - } - layer { - relu { - } - name: "gen2relu2_gsample" - data_layout: "model_parallel" - parents: "gen2fc2_gsample" - } - layer { - fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen2fc3_gsample" - data_layout: "model_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2_gsample" - } - layer { - relu { - } - name: "gen2relu3_gsample" - data_layout: "model_parallel" - parents: "gen2fc3_gsample" - } - layer { - fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4_gsample" - data_layout: "model_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3_gsample" - } - layer { - name: "cycx_minus_x" - data_layout: "model_parallel" - parents: "gen2fc4_gsample param_data_id" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - abs { - } - name: "L_cyc_x" - data_layout: "model_parallel" - device_allocation: "cpu" - parents: "cycx_minus_x" - } - layer { - name: "L_cyc_x_eval" - data_layout: "model_parallel" - parents: "L_cyc_x" - evaluation { - } - } - layer { - name: "gsample2_minus_x" - data_layout: "model_parallel" - parents: "gen2fc4_y param_data_id" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - l2_norm2 { - } - name: "l_l2_x" - data_layout: "model_parallel" - device_allocation: "cpu" - parents: "gsample2_minus_x" - } - weights { - name: "gen2fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc4linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc1linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc2linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc4linearity" - he_normal_initializer { - } - } - mini_batch_size: 256 - callback { - print { - interval: 1 - } - } - callback { timer {} } - #callback { - # dump_outputs { - # directory: "/dir/to/dump_x_activations/" - # layers: "param_data_id gen2fc4_1" - # execution_modes: "test" - # } - #} - callback { - save_model { - dir: "model" - disable_save_after_training: true - } - } - 
block_size: 256 - super_steps: 20000 - num_batches: 1 -} diff --git a/model_zoo/models/jag/ae_cycle_gan/data_reader_jag_conduit_lassen.prototext b/model_zoo/models/jag/ae_cycle_gan/data_reader_jag_conduit_lassen.prototext deleted file mode 100644 index b0376077b5e..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/data_reader_jag_conduit_lassen.prototext +++ /dev/null @@ -1,112 +0,0 @@ -######################################################################## -# The JAG normalization values were computed over the 10M + 1MA + 1MB random -# pulls from the 100M data set. They are valid for the directories: -# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B) -# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) -# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B -# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) -######################################################################## - -data_reader { - requires_data_set_metadata: true - - reader { - name: "jag_conduit" - role: "train" - shuffle: true - # change to a lustre path - data_filedir: "/p/gpfs1/brainusr/datasets/10MJAG/1M_A/" - index_list: "index.txt" - index_list_per_trainer: false - index_list_per_model: false - - validation_percent: 0 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - disable_responses: true - disable_labels: true - - num_labels: 5 - - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 64 - raw_height: 64 - raw_num_channels: 4 - - normalizer { - disable: true - scale: false - subtract_mean: false - unit_variance: false - z_score: true - } - - subtractor { - disable: true - } - - cropper { - disable: true - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - } - } - - reader { - name: "jag_conduit" - role: "test" - shuffle: true - # change to a lustre path - data_filedir: "/p/gpfs1/brainusr/datasets/10MJAG/1M_B" - index_list: "index.txt" - index_list_per_trainer: false - index_list_per_model: false - - validation_percent: 0 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - disable_responses: true - disable_labels: true - - num_labels: 5 - - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 64 - raw_height: 64 - raw_num_channels: 4 - - normalizer { - disable: true - scale: false - subtract_mean: false - unit_variance: false - z_score: true - } - - subtractor { - disable: true - } - - cropper { - disable: true - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - } - } -} diff --git a/model_zoo/models/jag/ae_cycle_gan/data_reader_jag_conduit_lustre.prototext b/model_zoo/models/jag/ae_cycle_gan/data_reader_jag_conduit_lustre.prototext deleted file mode 100644 index 81467ae6970..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/data_reader_jag_conduit_lustre.prototext +++ /dev/null @@ -1,112 +0,0 @@ -######################################################################## -# The JAG normalization values were computed over the 10M + 1MA + 1MB random -# pulls from the 100M data set. 
They are valid for the directories: -# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B) -# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) -# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B -# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) -######################################################################## - -data_reader { - requires_data_set_metadata: true - - reader { - name: "jag_conduit" - role: "train" - shuffle: true - # change to a lustre path - data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/" - index_list: "index.txt" - index_list_per_trainer: false - index_list_per_model: false - - validation_percent: 0 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - disable_responses: true - disable_labels: true - - num_labels: 5 - - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 64 - raw_height: 64 - raw_num_channels: 4 - - normalizer { - disable: true - scale: false - subtract_mean: false - unit_variance: false - z_score: true - } - - subtractor { - disable: true - } - - cropper { - disable: true - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - } - } - - reader { - name: "jag_conduit" - role: "test" - shuffle: true - # change to a lustre path - data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_B/" - index_list: "index.txt" - index_list_per_trainer: false - index_list_per_model: false - - validation_percent: 0 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - disable_responses: true - disable_labels: true - - num_labels: 5 - - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 64 - raw_height: 64 - raw_num_channels: 4 - - normalizer { - disable: true - scale: false - subtract_mean: false - unit_variance: false - z_score: true - } - - subtractor { - disable: true - } - - cropper { - disable: true - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - } - } -} diff --git a/model_zoo/models/jag/ae_cycle_gan/jag10k_data.prototext b/model_zoo/models/jag/ae_cycle_gan/jag10k_data.prototext deleted file mode 100644 index f97b43e3031..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/jag10k_data.prototext +++ /dev/null @@ -1,25 +0,0 @@ -data_reader { - reader { - name: "merge_features" - format: "numpy" - role: "train" - shuffle: true - data_file_pattern: "/p/lscratchh/brainusr/datasets/jag/multichannel/jag10K_multi_train_*.npy" - validation_percent: 0 - percent_of_data_to_use: 1.0 - disable_responses: true - disable_labels: true - } - reader { - name: "merge_features" - format: "numpy" - role: "test" - shuffle: false - data_file_pattern: "/p/lscratchh/brainusr/datasets/jag/multichannel/jag10K_multi_test_*.npy" - validation_percent: 0 - #test first 16 samples only to match TF version - absolute_sample_count: 100 - disable_responses: true - disable_labels: true - } -} diff --git a/model_zoo/models/jag/ae_cycle_gan/jag_100M_metadata.prototext b/model_zoo/models/jag/ae_cycle_gan/jag_100M_metadata.prototext deleted file mode 100644 index 1643b6db51a..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/jag_100M_metadata.prototext +++ /dev/null @@ -1,115 +0,0 @@ -######################################################################## -# The JAG normalization values were computed over the 10M + 1MA + 1MB random -# pulls from the 100M data set. The image normalization values were updated -# on 1/30/2019 using the per-channel average of the pixel values -# across all views. 
-# They are valid for the directories: -# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B) -# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) -# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B -# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) -######################################################################## - -data_set_metadata { - schema { - split_jag_image_channels: true - - # JAG_Image, JAG_Scalar, JAG_Input - independent: [ { pieces: [ JAG_Image, JAG_Scalar ] }, { pieces: [ JAG_Input ] } ] - dependent: [ { pieces: [ JAG_Input ] } ] - - image_prefix: "/outputs/images/" - - jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] - - scalar_prefix: "/outputs/scalars/" - - # An empty list indicates to use all - # The commented out variables are not on the Jim's original list but used in the numpy-based format - jag_scalar_keys: - [ "BWx", - "BT", - "tMAXt", # absent in Jim's list - "BWn", - "MAXpressure", - #"BAte", - #"MAXtion", - "tMAXpressure", - "BAt", # absent in Jim's list - "Yn", - "Ye", - "Yx", - #"tMAXte", # absent in Jim's list - #"BAtion", - #"MAXte", - #"tMAXtion", # absent in Jim's list - "BTx", - "MAXt", # absent in Jim's list - #"BTn", - "BApressure", - "tMINradius", - "MINradius" # absent in Jim's list - ] - - # When using all the keys without explicit selection, key filters can be used - # to explicitly exclude the particular variables with keys that matches a filter. - # 'jag_scalar_filters' and 'jag_input_filters' rely on exact key string matching. - # 'jag_scalar_prefix_filters' and 'jag_input_prefix_filters' define a filter as - # the pair of a prefix substring and the minimum key length. - # For example, with the example below, any key that has a length no shorter - # than 26 and starts with the substring "image_(" is excluded. 
- - jag_scalar_prefix_filters: [ { key_prefix: "image_(" min_len: 26} ] - jag_scalar_filters: [ "iBT" ] - - input_prefix: "/inputs/" - - jag_input_keys: ["shape_model_initial_modes:(4,3)", - "betti_prl15_trans_u", - "betti_prl15_trans_v", - "shape_model_initial_modes:(2,1)", - "shape_model_initial_modes:(1,0)"]; - } - - normalization { - jag_scalar_normalization_params: [ - { scale: 7.610738e+00 bias: -4.075375e-01 }, #BWx - { scale: 1.459875e+00 bias: -3.427656e+00 }, #BT - { scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXt - { scale: 4.375123e+01 bias: -1.593477e+00 }, #BWn - { scale: 1.685576e-06 bias: -5.330971e-01 }, #MAXpressure - #{ scale: 2.636422e-01 bias: -9.762907e-01 }, #BAte - #{ scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXtion - { scale: 1.430615e+00 bias: -3.351173e+00 }, #tMAXpressure - { scale: 2.636422e-01 bias: -9.762907e-01 }, #BAt - { scale: 7.154074e-18 bias: -1.864709e-02 }, #Yn - { scale: 3.166824e-03 bias: -1.864709e-02 }, #Ye - { scale: 2.102178e-02 bias: -3.071955e-01 }, #Yx - #{ scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXte - #{ scale: 2.636422e-01 bias: -9.762907e-01 }, #BAtion - #{ scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXte - #{ scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXtion - { scale: 1.346439e+00 bias: -3.118446e+00 }, #BTx - { scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXt - #{ scale: 1.459875e+00 bias: -3.427656e+00 }, #BTn - { scale: 2.061877e-06 bias: -5.213394e-01 }, #BApressure - { scale: 1.392544e+00 bias: -3.239921e+00 }, #tMINradius - { scale: 6.266253e-02 bias: -1.384504e+00 } #MINradius - ] - - jag_input_normalization_params: [ - { scale: 1.666672e+00 bias: 5.000000e-01 }, #shape_model_initial_modes:(4,3) - { scale: 1.000002e+00 bias: -1.603483e-07 }, #betti_prl15_trans_u - { scale: 1.000001e+00 bias: -1.406672e-06 }, #betti_prl15_trans_v - { scale: 1.666675e+00 bias: 4.999992e-01 }, #shape_model_initial_modes:(2,1) - { scale: 1.666669e+00 bias: 5.000008e-01 } #shape_model_initial_modes:(1,0) - ] - - jag_image_normalization_params: [ - { scale: 2.9258502e+01 bias: 0.0e+00 }, # avg = 0.0341781 - { scale: 8.5826596e+02 bias: 0.0e+00 }, # avg = 0.00116514 - { scale: 1.0004872e+05 bias: 0.0e+00 }, # avg = 9.99513e-06 - { scale: 4.8072070e+06 bias: 0.0e+00 } # avg = 2.08021e-07 - ] - } -} diff --git a/model_zoo/models/jag/ae_cycle_gan/vae1.prototext b/model_zoo/models/jag/ae_cycle_gan/vae1.prototext deleted file mode 100644 index 1646bdd0298..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/vae1.prototext +++ /dev/null @@ -1,459 +0,0 @@ -#Example taken from: https://lc.llnl.gov/bitbucket/users/jjayaram/repos/deep-latent-spaces/browse/codes/dev/VAE-FCN/vae_fcn.py and -#https://lc.llnl.gov/bitbucket/users/jjayaram/repos/deep-latent-spaces/browse/codes/dev/VAE-FCN/run_vae.py -#Timestamp 02/26/2018 8:45AM -model { - name: "ae_model" - shareable_training_data_reader: false - serialize_io: true - data_layout: "model_parallel" - mini_batch_size: 256 - block_size: 256 - num_epochs: 4 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "binary_cross_entropy" } - layer_term { layer: "kl_divergence" } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { - layer_metric { - name: "mean squared error" - layer: 
"mean_squared_error" - } - } - - ################################################### - # Callbacks - ################################################### - callback { - print { - interval: 1 - } - } - callback { timer {} } - - ################################################### - # start of layers - ################################################### - - ###################### - # Data - ###################### - #layer { - # name: "data" - # children: "encode1 reconstruction" - # data_layout: "model_parallel" - # input { - # target_mode: "reconstruction" - # } - #} - - layer { - input { - target_mode: "N/A" - } - name: "input" - data_layout: "data_parallel" - children: "data dummy" - } - layer { - parents: "input" - name: "data" - data_layout: "data_parallel" - split {} - } - layer { - parents: "input" - name: "dummy" - data_layout: "data_parallel" - dummy {} - } - layer { - name: "slice_data" - data_layout: "model_parallel" - parents: "data" - children: "image_data_dummy param_data_id" - slice { - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - name: "image_data_dummy" - data_layout: "model_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "model_parallel" - parents: "slice_data" - } - ###################### - # Encoder - ###################### - - # encode1 - layer { - #parents: "data" - parents: "image_data_dummy" - name: "encode1" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode1" - name: "encode1_elu" - data_layout: "model_parallel" - device_allocation: "cpu" - elu {} - } - layer { - parents: "encode1_elu" - name: "encode1_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode2 - layer { - parents: "encode1_dropout" - name: "encode2" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode2" - name: "encode2_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "encode2_tanh" - name: "encode2_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode3 - layer { - parents: "encode2_dropout" - name: "encode3" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode3" - name: "encode3_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "encode3_tanh" - name: "encode3_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - ###################### - # Latent space - ###################### - - layer { - parents: "encode3_dropout" - name: "z_mean" - data_layout: "model_parallel" - fully_connected { - num_neurons:20 - has_bias: true - } - } - layer { - parents: "encode3_dropout" - name: "z_log_sigma" - data_layout: "model_parallel" - fully_connected { - num_neurons:20 - has_bias: true - } - } - - ###################### - # KL divergence - ###################### - - layer { - name: "kl_one" - data_layout: "model_parallel" - constant { - value: 1.0 - num_neurons: "20" - } - } - layer { - parents: "z_mean" - name: "kl_z_mean2" - device_allocation: "cpu" - data_layout: "model_parallel" - square {} - } - layer { - parents: "z_log_sigma" - name: "kl_exp" - data_layout: "model_parallel" - device_allocation: "cpu" - exp {} - } - layer { - parents: "kl_one z_log_sigma kl_z_mean2 kl_exp" - name: "kl_full" - data_layout: "model_parallel" - weighted_sum { - scaling_factors: 
"-0.5 -0.5 0.5 0.5" - } - } - layer { - parents: "kl_full" - name: "kl_sum" - data_layout: "data_parallel" - reduction { - mode: "sum" - } - } - layer { - parents: "kl_sum" - name: "kl_divergence" - data_layout: "data_parallel" - evaluation {} - } - - ###################### - # Sample from latent space - ###################### - - layer { - parents: "z_log_sigma" - name: "sample_half" - data_layout: "model_parallel" - weighted_sum { - scaling_factors: "0.5" - } - } - layer { - parents: "sample_half" - name: "sample_exp" - data_layout: "model_parallel" - device_allocation: "cpu" - exp {} - } - layer { - name: "sample_noise" - data_layout: "model_parallel" - gaussian { - mean: 0.0 - stdev: 1.0 - neuron_dims: "20" - } - } - layer { - parents: "sample_exp sample_noise" - name: "sample_exp_noise" - data_layout: "model_parallel" - hadamard {} - } - layer { - parents: "z_mean sample_exp_noise" - name: "sample" - data_layout: "model_parallel" - sum {} - } - - ###################### - # Decoder - ###################### - - # decode3 - layer { - parents: "sample" - name: "decode3" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "decode3" - name: "decode3_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "decode3_tanh" - name: "decode3_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode2 - layer { - parents: "decode3_dropout" - name: "decode2" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "decode2" - name: "decode2_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "decode2_tanh" - name: "decode2_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode1 - layer { - parents: "decode2_dropout" - name: "decode1" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "decode1" - name: "decode1_elu" - data_layout: "model_parallel" - device_allocation: "cpu" - elu { - } - } - layer { - parents: "decode1_elu" - name: "decode1_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode0 - layer { - parents: "decode1_dropout" - name: "decode0" - data_layout: "model_parallel" - fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] - has_bias: true - } - } - layer { - parents: "decode0" - name: "sigmoid" - data_layout: "model_parallel" - sigmoid {} - } - - ###################### - # Reconstruction - ###################### - - layer { - parents: "sigmoid" - name: "reconstruction" - data_layout: "model_parallel" - split {} - } - layer { - parents: "reconstruction image_data_dummy" - name: "binary_cross_entropy" - data_layout: "model_parallel" - binary_cross_entropy {} - } - layer { - parents: "reconstruction image_data_dummy" - name: "mean_squared_error" - data_layout: "model_parallel" - mean_squared_error {} - } - ####For metric, loss per individual sample - layer { - name: "ae_err" - data_layout: "model_parallel" - parents: "param_data_id mean_squared_error" - concatenation { - } - } - callback { - dump_outputs { - #directory: "/p/lscratchh/brainusr/jacobs32/EuroViz/ae_loss" - #directory: "/p/gpfs1/jacobs32/EuroViz2/ae_loss/" - directory: "ae_loss/" - layers: "ae_err" - execution_modes: "test" - format: "npy" - } - } - ####For metric, loss per individual sample - #layer { - # parents: "reconstruction 
image_data_dummy" - # name: "squared_error" - # data_layout: "model_parallel" - # squared_difference {} - #} - - #callback { - # dump_outputs { - # directory:"/p/lscratchh/brainusr/jacobs32/EuroViz/ae_loss" - # layers: "squared_error" - # execution_modes: "test" - # } - #} - callback { save_model { dir: "model" } } - ################################################### - # end of layers - ################################################### -} diff --git a/model_zoo/models/jag/ae_cycle_gan/vae_cyc.prototext b/model_zoo/models/jag/ae_cycle_gan/vae_cyc.prototext deleted file mode 100644 index d5d3deca580..00000000000 --- a/model_zoo/models/jag/ae_cycle_gan/vae_cyc.prototext +++ /dev/null @@ -1,555 +0,0 @@ -#Example taken from: https://lc.llnl.gov/bitbucket/users/jjayaram/repos/deep-latent-spaces/browse/codes/dev/VAE-FCN/vae_fcn.py and -#https://lc.llnl.gov/bitbucket/users/jjayaram/repos/deep-latent-spaces/browse/codes/dev/VAE-FCN/run_vae.py -#Timestamp 02/26/2018 8:45AM -model { - name: "ae_cycgan_model" - shareable_training_data_reader: true - serialize_io: true - data_layout: "model_parallel" - mini_batch_size: 256 - block_size: 256 - num_epochs: 4 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "binary_cross_entropy" } - layer_term { layer: "kl_divergence" } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { - layer_metric { - name: "mean squared error" - layer: "mean_squared_error" - } - } - - ################################################### - # Callbacks - ################################################### - callback { - print { - interval: 1 - } - } - callback { timer {} } - # callback { - # save_images { - # image_prefix: "vae_fcn_images_" - # image_format: "jpg" - # } - # } - - ################################################### - # start of layers - ################################################### - - ###################### - # Data - ###################### - #Layer from cycle GAN - layer { - input { - target_mode: "N/A" - } - name: "input" - data_layout: "data_parallel" - children: "data dummy" - } - layer { - parents: "input" - name: "data" - data_layout: "data_parallel" - split {} - } - layer { - parents: "input" - name: "dummy" - data_layout: "data_parallel" - dummy {} - } - layer { - name: "slice_data" - data_layout: "model_parallel" - parents: "data" - children: "image_data_id param_data_id" - slice { - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - name: "image_data_id" - data_layout: "model_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "model_parallel" - parents: "slice_data" - } - layer { - fully_connected { - num_neurons: 256 - has_bias: true - } - name: "gen1fc1_1" - data_layout: "model_parallel" - weights: "gen1fc1linearity" - parents: "param_data_id" - } - layer { - relu { - } - name: "gen1relu1_1" - data_layout: "model_parallel" - parents: "gen1fc1_1" - } - layer { - fully_connected { - num_neurons: 2048 - has_bias: true - } - name: "gen1fc2_1" - data_layout: "model_parallel" - weights: "gen1fc2linearity" - parents: "gen1relu1_1" - } - layer { - relu { - } - name: "gen1relu2_1" - data_layout: "model_parallel" - parents: "gen1fc2_1" - } - layer { - dropout { 
- keep_prob: 0.8 - } - name: "gen1dropout1_1" - data_layout: "model_parallel" - parents: "gen1relu2_1" - } - layer { - fully_connected { - num_neurons: 8192 - has_bias: true - } - name: "gen1fc3_1" - data_layout: "model_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1_1" - } - layer { - relu { - } - name: "gen1relu3_1" - data_layout: "model_parallel" - parents: "gen1fc3_1" - } - layer { - fully_connected { - #num_neurons: 16384 - #latent_dim - num_neurons: 20 - has_bias: true - } - name: "gen1fc4_1" - data_layout: "model_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3_1" - } - - weights { - name: "gen1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc4linearity" - he_normal_initializer { - } - } - - ###################### - # Encoder - ###################### - #Encoder not really used here - # encode1 - layer { - parents: "image_data_id" - name: "encode1" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode1" - name: "encode1_elu" - data_layout: "model_parallel" - device_allocation: "cpu" - elu {} - } - layer { - parents: "encode1_elu" - name: "encode1_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode2 - layer { - parents: "encode1_dropout" - name: "encode2" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode2" - name: "encode2_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "encode2_tanh" - name: "encode2_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # encode3 - layer { - parents: "encode2_dropout" - name: "encode3" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "encode3" - name: "encode3_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "encode3_tanh" - name: "encode3_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - ###################### - # Latent space - ###################### - layer { - parents: "encode3_dropout" - name: "z_mean" - data_layout: "model_parallel" - fully_connected { - num_neurons:20 - has_bias: true - } - } - layer { - parents: "encode3_dropout" - name: "z_log_sigma" - data_layout: "model_parallel" - fully_connected { - num_neurons:20 - has_bias: true - } - } - - ###################### - # KL divergence - ###################### - - layer { - name: "kl_one" - data_layout: "model_parallel" - constant { - value: 1.0 - num_neurons: "20" - } - } - layer { - parents: "z_mean" - name: "kl_z_mean2" - data_layout: "model_parallel" - device_allocation: "cpu" - square {} - } - layer { - parents: "z_log_sigma" - name: "kl_exp" - data_layout: "model_parallel" - device_allocation: "cpu" - exp {} - } - layer { - parents: "kl_one z_log_sigma kl_z_mean2 kl_exp" - name: "kl_full" - data_layout: "model_parallel" - weighted_sum { - scaling_factors: "-0.5 -0.5 0.5 0.5" - } - } - layer { - parents: "kl_full" - name: "kl_sum" - data_layout: "data_parallel" - reduction { - mode: "sum" - } - } - layer { - parents: "kl_sum" - name: "kl_divergence" - data_layout: "data_parallel" - evaluation {} - } - - ###################### - # Sample from latent space - ###################### - - layer { - parents: "z_log_sigma" - name: 
"sample_half" - data_layout: "model_parallel" - weighted_sum { - scaling_factors: "0.5" - } - } - layer { - parents: "sample_half" - name: "sample_exp" - data_layout: "model_parallel" - device_allocation: "cpu" - exp {} - } - layer { - name: "sample_noise" - data_layout: "model_parallel" - gaussian { - mean: 0.0 - stdev: 1.0 - neuron_dims: "20" - } - } - layer { - parents: "sample_exp sample_noise" - name: "sample_exp_noise" - data_layout: "model_parallel" - hadamard {} - } - layer { - parents: "z_mean sample_exp_noise" - #name: "sample" - name: "image_data_dummy" - data_layout: "model_parallel" - sum {} - } - ####output of encoder not used, dangling - ###################### - # Decoder - ###################### - - # decode3 - layer { - #parents: "sample" - parents: "gen1fc4_1" - name: "decode3" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "decode3" - name: "decode3_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "decode3_tanh" - name: "decode3_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode2 - layer { - parents: "decode3_dropout" - name: "decode2" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "decode2" - name: "decode2_tanh" - data_layout: "model_parallel" - tanh {} - } - layer { - parents: "decode2_tanh" - name: "decode2_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode1 - layer { - parents: "decode2_dropout" - name: "decode1" - data_layout: "model_parallel" - fully_connected { - num_neurons: 256 - has_bias: true - } - } - layer { - parents: "decode1" - name: "decode1_elu" - data_layout: "model_parallel" - device_allocation: "cpu" - elu { - } - } - layer { - parents: "decode1_elu" - name: "decode1_dropout" - data_layout: "model_parallel" - dropout { - keep_prob: 0.95 - } - } - - # decode0 - layer { - parents: "decode1_dropout" - name: "decode0" - data_layout: "model_parallel" - fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] - has_bias: true - } - } - layer { - parents: "decode0" - name: "sigmoid" - data_layout: "model_parallel" - sigmoid {} - } - - ###################### - # Reconstruction - ###################### - - layer { - parents: "sigmoid" - name: "reconstruction" - data_layout: "model_parallel" - split {} - } - layer { - parents: "reconstruction image_data_id" - name: "binary_cross_entropy" - data_layout: "model_parallel" - binary_cross_entropy {} - } - layer { - parents: "reconstruction image_data_id" - name: "mean_squared_error" - data_layout: "model_parallel" - mean_squared_error {} - } - ####For metric, loss per individual sample - layer { - name: "fw_out_loss" - data_layout: "model_parallel" - parents: "param_data_id mean_squared_error" - concatenation { - } - } - callback { - dump_outputs { - #directory: "/p/lscratchh/jacobs32/EuroViz/fw_out_loss/" - #directory: "/p/gpfs1/jacobs32/EuroViz2/fw_out_loss/" - directory: "fw_out_loss/" - layers: "fw_out_loss" - execution_modes: "test" - format: "npy" - } - } - - ####For metric, loss per individual sample - #layer { - # parents: "reconstruction image_data_id" - # name: "squared_error" - # data_layout: "model_parallel" - # squared_difference {} - #} - - #callback { - # dump_outputs { - # directory:"/p/lscratchh/brainusr/jacobs32/EuroViz/" - # layers: "squared_error" - # execution_modes: "test" - # } - #} - callback { - 
save_model { - dir: "model" - disable_save_after_training: true - } - } - ################################################### - # end of layers - ################################################### -} diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m1.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m1.prototext deleted file mode 100644 index 574fa83ed20..00000000000 --- a/model_zoo/models/jag/cycle_gan/cycgan_m1.prototext +++ /dev/null @@ -1,547 +0,0 @@ -model { - objective_function { - l2_weight_regularization { - scale_factor: 0.0001 - } - layer_term { - scale_factor: 1.0 - layer: "disc1_real_eval" - } - layer_term { - scale_factor: 1.0 - layer: "disc1_fake_eval" - } - layer_term { - scale_factor: 1.0 - layer: "disc2_real_eval" - } - layer_term { - scale_factor: 1.0 - layer: "disc2_fake_eval" - } - } - num_epochs: 1 - data_layout: "data_parallel" - layer { - input { - target_mode: "N/A" - } - name: "data" - data_layout: "data_parallel" - parents: " " - } - layer { - name: "zero" - data_layout: "data_parallel" - constant { - value: 0.0 - num_neurons: "1" - } - } - layer { - name: "one" - data_layout: "data_parallel" - constant { - value: 1.0 - num_neurons: "1" - } - } - layer { - name: "slice_data" - data_layout: "data_parallel" - parents: "data" - children: "image_data_dummy param_data_id" - slice { - # slice_points: "0 49174 49179" - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - name: "image_data_dummy" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - freeze: true - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "gen1fc1" - data_layout: "data_parallel" - weights: "gen1fc1linearity" - parents: "param_data_id" - } - layer { - relu { - } - name: "gen1relu1" - data_layout: "data_parallel" - parents: "gen1fc1" - } - layer { - freeze: true - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "gen1fc2" - data_layout: "data_parallel" - weights: "gen1fc2linearity" - parents: "gen1relu1" - } - layer { - relu { - } - name: "gen1relu2" - data_layout: "data_parallel" - parents: "gen1fc2" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1" - data_layout: "data_parallel" - parents: "gen1relu2" - } - layer { - freeze: true - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen1fc3" - data_layout: "data_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1" - } - layer { - relu { - } - name: "gen1relu3" - data_layout: "data_parallel" - parents: "gen1fc3" - } - layer { - freeze: true - fully_connected { - # num_neurons: 49174 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] - has_bias: true - } - name: "gen1fc4" - data_layout: "data_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3" - } - layer { - freeze: true - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "gen2fc1" - data_layout: "data_parallel" - weights: "gen2fc1linearity" - parents: "image_data_dummy" - } - layer { - relu { - } - name: "gen2relu1" - data_layout: "data_parallel" - parents: "gen2fc1" - } - layer { - freeze: true - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "gen2fc2" - data_layout: "data_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1" - } - layer { - relu { - } - name: "gen2relu2" - data_layout: "data_parallel" - parents: "gen2fc2" - } - layer { - freeze: true - fully_connected 
{ - num_neurons: 512 - has_bias: true - } - name: "gen2fc3" - data_layout: "data_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2" - } - layer { - relu { - } - name: "gen2relu3" - data_layout: "data_parallel" - parents: "gen2fc3" - } - layer { - freeze: true - fully_connected { - # num_neurons: 5 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4" - data_layout: "data_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3" - } - layer { - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "disc1fc1_real" - data_layout: "data_parallel" - weights: "disc1fc1linearity" - parents: "data" - } - layer { - relu { - } - name: "disc1relu1_real" - data_layout: "data_parallel" - parents: "disc1fc1_real" - } - layer { - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "disc1fc2_real" - data_layout: "data_parallel" - weights: "disc1fc2linearity" - parents: "disc1relu1_real" - } - layer { - relu { - } - name: "disc1relu2_real" - data_layout: "data_parallel" - parents: "disc1fc2_real" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "disc1fc3_real" - data_layout: "data_parallel" - weights: "disc1fc3linearity" - parents: "disc1relu2_real" - } - layer { - name: "concat_gsample_n_param" - data_layout: "data_parallel" - parents: "gen1fc4 param_data_id" - concatenation { - } - } - layer { - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "disc1fc1_fake" - data_layout: "data_parallel" - weights: "disc1fc1linearity" - parents: "concat_gsample_n_param" - } - layer { - relu { - } - name: "disc1relu1_fake" - data_layout: "data_parallel" - parents: "disc1fc1_fake" - } - layer { - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "disc1fc2_fake" - data_layout: "data_parallel" - weights: "disc1fc2linearity" - parents: "disc1relu1_fake" - } - layer { - relu { - } - name: "disc1relu2_fake" - data_layout: "data_parallel" - parents: "disc1fc2_fake" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "disc1fc3_fake" - data_layout: "data_parallel" - weights: "disc1fc3linearity" - parents: "disc1relu2_fake" - } - layer { - name: "concat_param_n_img" - data_layout: "data_parallel" - parents: "param_data_id image_data_dummy" - concatenation { - } - } - layer { - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "disc2fc1_real" - data_layout: "data_parallel" - weights: "disc2fc1linearity" - parents: "concat_param_n_img" - } - layer { - relu { - } - name: "disc2relu1_real" - data_layout: "data_parallel" - parents: "disc2fc1_real" - } - layer { - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "disc2fc2_real" - data_layout: "data_parallel" - weights: "disc2fc2linearity" - parents: "disc2relu1_real" - } - layer { - relu { - } - name: "disc2relu2_real" - data_layout: "data_parallel" - parents: "disc2fc2_real" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "disc2fc3_real" - data_layout: "data_parallel" - weights: "disc2fc3linearity" - parents: "disc2relu2_real" - } - layer { - name: "concat_gsample2_n_img" - data_layout: "data_parallel" - parents: "gen2fc4 image_data_dummy" - concatenation { - } - } - layer { - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "disc2fc1_fake" - data_layout: "data_parallel" - weights: "disc2fc1linearity" - parents: "concat_gsample2_n_img" - } - layer { - relu { - } - name: "disc2relu1_fake" - 
data_layout: "data_parallel" - parents: "disc2fc1_fake" - } - layer { - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "disc2fc2_fake" - data_layout: "data_parallel" - weights: "disc2fc2linearity" - parents: "disc2relu1_fake" - } - layer { - relu { - } - name: "disc2relu2_fake" - data_layout: "data_parallel" - parents: "disc2fc2_fake" - } - layer { - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "disc2fc3_fake" - data_layout: "data_parallel" - weights: "disc2fc3linearity" - parents: "disc2relu2_fake" - } - layer { - name: "disc1_real_bce" - data_layout: "data_parallel" - parents: "disc1fc3_real one" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "disc1_real_eval" - data_layout: "data_parallel" - parents: "disc1_real_bce" - evaluation { - } - } - layer { - name: "disc1_fake_bce" - data_layout: "data_parallel" - parents: "disc1fc3_fake zero" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "disc1_fake_eval" - data_layout: "data_parallel" - parents: "disc1_fake_bce" - evaluation { - } - } - layer { - name: "disc2_real_bce" - data_layout: "data_parallel" - parents: "disc2fc3_real one" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "disc2_real_eval" - data_layout: "data_parallel" - parents: "disc2_real_bce" - evaluation { - } - } - layer { - name: "disc2_fake_bce" - data_layout: "data_parallel" - parents: "disc2fc3_fake zero" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "disc2_fake_eval" - data_layout: "data_parallel" - parents: "disc2_fake_bce" - evaluation { - } - } - weights { - name: "gen1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc4linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc4linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc1linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc2linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc3linearity" - he_normal_initializer { - } - } - mini_batch_size: 64 - callback { - print { - interval: 1 - } - } - block_size: 256 - super_steps: 10000 - num_batches: 1 -} diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m1_template.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m1_template.prototext deleted file mode 100644 index 98a6745c3da..00000000000 --- a/model_zoo/models/jag/cycle_gan/cycgan_m1_template.prototext +++ /dev/null @@ -1,66 +0,0 @@ -model { - data_layout: "data_parallel" - mini_batch_size: 64 - block_size: 256 - super_steps: 10000 - num_batches: 1 - num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { - scale_factor: 1.0 - layer: "disc1_real_eval" - } - layer_term { - scale_factor: 1.0 - layer: "disc1_fake_eval" - } - layer_term { - scale_factor: 1.0 - layer: "disc2_real_eval" - } - 
layer_term { - scale_factor: 1.0 - layer: "disc2_fake_eval" - } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - #metric { - # layer_metric { - # layer: "dis_eval_t" - # } - # layer_metric { - # layer: "dis_eval_f" - # } - #} - - ################################################### - # Callbacks - ################################################### - callback { - print { - interval: 1 - } - } - #callback { timer {} } - - - - ################################################### - # start of layers - ################################################### - -} diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m2.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m2.prototext deleted file mode 100644 index 6fd5b2caa07..00000000000 --- a/model_zoo/models/jag/cycle_gan/cycgan_m2.prototext +++ /dev/null @@ -1,535 +0,0 @@ -model { - objective_function { - l2_weight_regularization { - scale_factor: 0.0001 - } - layer_term { - scale_factor: 0.05 - layer: "g_adv1_eval" - } - layer_term { - scale_factor: 0.025 - layer: "l_l2_y" - } - } - num_epochs: 1 - data_layout: "data_parallel" - layer { - input { - target_mode: "N/A" - } - name: "data" - data_layout: "data_parallel" - parents: " " - } - layer { - name: "slice_data" - data_layout: "data_parallel" - parents: "data" - children: "image_data_dummy param_data_id" - slice { - # slice_points: "0 49174 49179" - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - name: "image_data_dummy" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "gen1fc1_1" - data_layout: "data_parallel" - weights: "gen1fc1linearity" - parents: "param_data_id" - } - layer { - relu { - } - name: "gen1relu1_1" - data_layout: "data_parallel" - parents: "gen1fc1_1" - } - layer { - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "gen1fc2_1" - data_layout: "data_parallel" - weights: "gen1fc2linearity" - parents: "gen1relu1_1" - } - layer { - relu { - } - name: "gen1relu2_1" - data_layout: "data_parallel" - parents: "gen1fc2_1" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1_1" - data_layout: "data_parallel" - parents: "gen1relu2_1" - } - layer { - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen1fc3_1" - data_layout: "data_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1_1" - } - layer { - relu { - } - name: "gen1relu3_1" - data_layout: "data_parallel" - parents: "gen1fc3_1" - } - layer { - fully_connected { - # num_neurons: 49174 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] - has_bias: true - } - name: "gen1fc4_1" - data_layout: "data_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3_1" - } - layer { - name: "concat_gsample_n_param" - data_layout: "data_parallel" - parents: "gen1fc4_1 param_data_id" - concatenation { - } - } - layer { - freeze: true - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "disc1fc1_real" - data_layout: "data_parallel" - weights: "disc1fc1linearity" - parents: "concat_gsample_n_param" - } - layer { - relu { - } - name: "disc1relu1_real" - data_layout: "data_parallel" - parents: "disc1fc1_real" - } - layer { - freeze: true - fully_connected { - num_neurons: 16 - 
has_bias: true - } - name: "disc1fc2_real" - data_layout: "data_parallel" - weights: "disc1fc2linearity" - parents: "disc1relu1_real" - } - layer { - relu { - } - name: "disc1relu2_real" - data_layout: "data_parallel" - parents: "disc1fc2_real" - } - layer { - freeze: true - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "disc1fc3_real" - data_layout: "data_parallel" - weights: "disc1fc3linearity" - parents: "disc1relu2_real" - } - layer { - name: "one" - data_layout: "data_parallel" - constant { - value: 1.0 - num_neurons: "1" - } - } - layer { - name: "g_adv1_bce" - data_layout: "data_parallel" - parents: "disc1fc3_real one" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "g_adv1_eval" - data_layout: "data_parallel" - parents: "g_adv1_bce" - evaluation { - } - } - layer { - freeze: true - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "gen2fc1_y" - data_layout: "data_parallel" - weights: "gen2fc1linearity" - parents: "image_data_dummy" - } - layer { - relu { - } - name: "gen2relu1_y" - data_layout: "data_parallel" - parents: "gen2fc1_y" - } - layer { - freeze: true - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "gen2fc2_y" - data_layout: "data_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1_y" - } - layer { - relu { - } - name: "gen2relu2_y" - data_layout: "data_parallel" - parents: "gen2fc2_y" - } - layer { - freeze: true - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen2fc3_y" - data_layout: "data_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2_y" - } - layer { - relu { - } - name: "gen2relu3_y" - data_layout: "data_parallel" - parents: "gen2fc3_y" - } - layer { - freeze: true - fully_connected { - # num_neurons: 5 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4_y" - data_layout: "data_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3_y" - } - layer { - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "gen1fc1_2" - data_layout: "data_parallel" - weights: "gen1fc1linearity" - parents: "gen2fc4_y" - } - layer { - relu { - } - name: "gen1relu1_2" - data_layout: "data_parallel" - parents: "gen1fc1_2" - } - layer { - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "gen1fc2_2" - data_layout: "data_parallel" - weights: "gen1fc2linearity" - parents: "gen1relu1_2" - } - layer { - relu { - } - name: "gen1relu2_2" - data_layout: "data_parallel" - parents: "gen1fc2_2" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1_2" - data_layout: "data_parallel" - parents: "gen1relu2_2" - } - layer { - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen1fc3_2" - data_layout: "data_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1_2" - } - layer { - relu { - } - name: "gen1relu3_2" - data_layout: "data_parallel" - parents: "gen1fc3_2" - } - layer { - fully_connected { - # num_neurons: 49174 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] - has_bias: true - } - name: "gen1fc4_2" - data_layout: "data_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3_2" - } - layer { - name: "cycy_minus_y" - data_layout: "data_parallel" - parents: "gen1fc4_2 image_data_dummy" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - abs { - } - name: "L_cyc_y" - data_layout: "data_parallel" - device_allocation: "cpu" - parents: "cycy_minus_y" - } - layer { - name: 
"L_cyc_y_eval" - data_layout: "data_parallel" - parents: "L_cyc_y" - evaluation { - } - } - layer { - freeze: true - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "gen2fc1_gsample" - data_layout: "data_parallel" - weights: "gen2fc1linearity" - parents: "gen1fc4_1" - } - layer { - relu { - } - name: "gen2relu1_gsample" - data_layout: "data_parallel" - parents: "gen2fc1_gsample" - } - layer { - freeze: true - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "gen2fc2_gsample" - data_layout: "data_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1_gsample" - } - layer { - relu { - } - name: "gen2relu2_gsample" - data_layout: "data_parallel" - parents: "gen2fc2_gsample" - } - layer { - freeze: true - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen2fc3_gsample" - data_layout: "data_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2_gsample" - } - layer { - relu { - } - name: "gen2relu3_gsample" - data_layout: "data_parallel" - parents: "gen2fc3_gsample" - } - layer { - freeze: true - fully_connected { - # num_neurons: 5 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4_gsample" - data_layout: "data_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3_gsample" - } - layer { - name: "cycx_minus_x" - data_layout: "data_parallel" - parents: "gen2fc4_gsample param_data_id" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - abs { - } - name: "L_cyc_x" - device_allocation: "cpu" - data_layout: "data_parallel" - parents: "cycx_minus_x" - } - layer { - name: "L_cyc_x_eval" - data_layout: "data_parallel" - parents: "L_cyc_x" - evaluation { - } - } - layer { - name: "gsample_minus_y" - data_layout: "data_parallel" - parents: "gen1fc4_1 image_data_dummy" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - l2_norm2 { - } - name: "l_l2_y" - device_allocation: "cpu" - data_layout: "data_parallel" - parents: "gsample_minus_y" - } - weights { - name: "gen1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc4linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "disc1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc4linearity" - he_normal_initializer { - } - } - mini_batch_size: 64 - callback { - print { - interval: 1 - } - } - #callback { - # dump_outputs { - # directory: "/dir/to/dump_y_activations/" - # batch_interval: 100 - # layers: "image_data_dummy gen1fc4_1 l_l2_y" - # execution_modes: "test" - # } - #} - block_size: 256 - super_steps: 10000 - num_batches: 1 -} diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m2_template.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m2_template.prototext deleted file mode 100644 index a5afc2959d5..00000000000 --- a/model_zoo/models/jag/cycle_gan/cycgan_m2_template.prototext +++ /dev/null @@ -1,65 +0,0 @@ -model { - data_layout: "data_parallel" - mini_batch_size: 64 - block_size: 256 - super_steps: 10000 - 
num_batches: 1 - num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { - scale_factor: 0.05 - layer: "g_adv1_eval" - } - layer_term { - scale_factor: 0.025 - layer: "l_l2_y" - } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - #metric { - # layer_metric { - # layer: "dis_eval_t" - # } - # layer_metric { - # layer: "dis_eval_f" - # } - #} - - ################################################### - # Callbacks - ################################################### - callback { - print { - interval: 1 - } - } - #callback { timer {} } - - callback { - dump_outputs { - directory: "/dir/to/dump_y_activations/" - layers: "image_data_dummy gen1fc4_1" - execution_modes: "test" - } - } - - - ################################################### - # start of layers - ################################################### - -} diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m3.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m3.prototext deleted file mode 100644 index 6917f1767a1..00000000000 --- a/model_zoo/models/jag/cycle_gan/cycgan_m3.prototext +++ /dev/null @@ -1,597 +0,0 @@ -model { - objective_function { - l2_weight_regularization { - scale_factor: 0.0001 - } - layer_term { - scale_factor: 0.05 - layer: "g_adv2_eval" - } - layer_term { - scale_factor: 0.025 - layer: "l_l2_x" - } - } - num_epochs: 1 - data_layout: "data_parallel" - layer { - input { - target_mode: "N/A" - } - name: "data" - data_layout: "data_parallel" - parents: " " - } - layer { - name: "slice_data" - data_layout: "data_parallel" - parents: "data" - children: "image_data_dummy param_data_id" - slice { - # slice_points: "0 49174 49179" - get_slice_points_from_reader: "independent" - } - } - layer { - identity { - } - name: "image_data_dummy" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - identity { - } - name: "param_data_id" - data_layout: "data_parallel" - parents: "slice_data" - } - layer { - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "gen2fc1_1" - data_layout: "data_parallel" - weights: "gen2fc1linearity" - parents: "image_data_dummy" - } - layer { - relu { - } - name: "gen2relu1_1" - data_layout: "data_parallel" - parents: "gen2fc1_1" - } - layer { - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "gen2fc2_1" - data_layout: "data_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1_1" - } - layer { - relu { - } - name: "gen2relu2_1" - data_layout: "data_parallel" - parents: "gen2fc2_1" - } - layer { - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen2fc3_1" - data_layout: "data_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2_1" - } - layer { - relu { - } - name: "gen2relu3_1" - data_layout: "data_parallel" - parents: "gen2fc3_1" - } - layer { - fully_connected { - # num_neurons: 5 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4_1" - data_layout: "data_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3_1" - } - layer { - name: "concat_gsample2_n_img" - data_layout: "data_parallel" - parents: "gen2fc4_1 image_data_dummy" - concatenation { - } - } - layer { - freeze: true - fully_connected { - num_neurons: 128 - has_bias: 
true - } - name: "disc2fc1_real" - data_layout: "data_parallel" - weights: "disc2fc1linearity" - parents: "concat_gsample2_n_img" - } - layer { - relu { - } - name: "disc2relu1_real" - data_layout: "data_parallel" - parents: "disc2fc1_real" - } - layer { - freeze: true - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "disc2fc2_real" - data_layout: "data_parallel" - weights: "disc2fc2linearity" - parents: "disc2relu1_real" - } - layer { - relu { - } - name: "disc2relu2_real" - data_layout: "data_parallel" - parents: "disc2fc2_real" - } - layer { - freeze: true - fully_connected { - num_neurons: 1 - has_bias: true - } - name: "disc2fc3_real" - data_layout: "data_parallel" - weights: "disc2fc3linearity" - parents: "disc2relu2_real" - } - layer { - name: "one" - data_layout: "data_parallel" - constant { - value: 1.0 - num_neurons: "1" - } - } - layer { - name: "g_adv2_bce" - data_layout: "data_parallel" - parents: "disc2fc3_real one" - sigmoid_binary_cross_entropy { - } - } - layer { - name: "g_adv2_eval" - data_layout: "data_parallel" - parents: "g_adv2_bce" - evaluation { - } - } - layer { - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "gen2fc1_y" - data_layout: "data_parallel" - weights: "gen2fc1linearity" - parents: "image_data_dummy" - } - layer { - relu { - } - name: "gen2relu1_y" - data_layout: "data_parallel" - parents: "gen2fc1_y" - } - layer { - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "gen2fc2_y" - data_layout: "data_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1_y" - } - layer { - relu { - } - name: "gen2relu2_y" - data_layout: "data_parallel" - parents: "gen2fc2_y" - } - layer { - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen2fc3_y" - data_layout: "data_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2_y" - } - layer { - relu { - } - name: "gen2relu3_y" - data_layout: "data_parallel" - parents: "gen2fc3_y" - } - layer { - fully_connected { - # num_neurons: 5 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4_y" - data_layout: "data_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3_y" - } - layer { - freeze: true - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "gen1fc1_2" - data_layout: "data_parallel" - weights: "gen1fc1linearity" - parents: "gen2fc4_y" - } - layer { - relu { - } - name: "gen1relu1_2" - data_layout: "data_parallel" - parents: "gen1fc1_2" - } - layer { - freeze: true - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "gen1fc2_2" - data_layout: "data_parallel" - weights: "gen1fc2linearity" - parents: "gen1relu1_2" - } - layer { - relu { - } - name: "gen1relu2_2" - data_layout: "data_parallel" - parents: "gen1fc2_2" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1_2" - data_layout: "data_parallel" - parents: "gen1relu2_2" - } - layer { - freeze: true - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen1fc3_2" - data_layout: "data_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1_2" - } - layer { - relu { - } - name: "gen1relu3_2" - data_layout: "data_parallel" - parents: "gen1fc3_2" - } - layer { - freeze: true - fully_connected { - # num_neurons: 49174 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] - has_bias: true - } - name: "gen1fc4_2" - data_layout: "data_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3_2" - } - 
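The layers that follow build the cycle-consistency residual for y: gen1fc4_2 applies the frozen gen1* stack to gen2fc4_y, so cycy_minus_y computes G1(G2(y)) - y and L_cyc_y is its elementwise absolute value (evaluated through L_cyc_y_eval but, in this sub-model, not added to the objective). A minimal numpy sketch, with G1 and G2 standing in for the gen1*/gen2* fully connected stacks:

import numpy as np

# Minimal sketch of the cycle term formed by the next layers; G1/G2 stand in
# for the gen1*/gen2* stacks.
def l_cyc_y(G1, G2, y):
    cyc_y = G1(G2(y))                  # gen1fc4_2 on top of gen2fc4_y
    residual = 1.0 * cyc_y - 1.0 * y   # weighted_sum with scaling_factors "1 -1"
    return np.abs(residual)            # abs layer -> L_cyc_y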
layer { - name: "cycy_minus_y" - data_layout: "data_parallel" - parents: "gen1fc4_2 image_data_dummy" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - abs { - } - name: "L_cyc_y" - data_layout: "data_parallel" - device_allocation: "cpu" - parents: "cycy_minus_y" - } - layer { - name: "L_cyc_y_eval" - data_layout: "data_parallel" - parents: "L_cyc_y" - evaluation { - } - } - layer { - freeze: true - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "gen1fc1_1" - data_layout: "data_parallel" - weights: "gen1fc1linearity" - parents: "param_data_id" - } - layer { - relu { - } - name: "gen1relu1_1" - data_layout: "data_parallel" - parents: "gen1fc1_1" - } - layer { - freeze: true - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "gen1fc2_1" - data_layout: "data_parallel" - weights: "gen1fc2linearity" - parents: "gen1relu1_1" - } - layer { - relu { - } - name: "gen1relu2_1" - data_layout: "data_parallel" - parents: "gen1fc2_1" - } - layer { - dropout { - keep_prob: 0.8 - } - name: "gen1dropout1_1" - data_layout: "data_parallel" - parents: "gen1relu2_1" - } - layer { - freeze: true - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen1fc3_1" - data_layout: "data_parallel" - weights: "gen1fc3linearity" - parents: "gen1dropout1_1" - } - layer { - relu { - } - name: "gen1relu3_1" - data_layout: "data_parallel" - parents: "gen1fc3_1" - } - layer { - freeze: true - fully_connected { - # num_neurons: 49174 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] - has_bias: true - } - name: "gen1fc4_1" - data_layout: "data_parallel" - weights: "gen1fc4linearity" - parents: "gen1relu3_1" - } - layer { - fully_connected { - num_neurons: 16 - has_bias: true - } - name: "gen2fc1_gsample" - data_layout: "data_parallel" - weights: "gen2fc1linearity" - parents: "gen1fc4_1" - } - layer { - relu { - } - name: "gen2relu1_gsample" - data_layout: "data_parallel" - parents: "gen2fc1_gsample" - } - layer { - fully_connected { - num_neurons: 128 - has_bias: true - } - name: "gen2fc2_gsample" - data_layout: "data_parallel" - weights: "gen2fc2linearity" - parents: "gen2relu1_gsample" - } - layer { - relu { - } - name: "gen2relu2_gsample" - data_layout: "data_parallel" - parents: "gen2fc2_gsample" - } - layer { - fully_connected { - num_neurons: 512 - has_bias: true - } - name: "gen2fc3_gsample" - data_layout: "data_parallel" - weights: "gen2fc3linearity" - parents: "gen2relu2_gsample" - } - layer { - relu { - } - name: "gen2relu3_gsample" - data_layout: "data_parallel" - parents: "gen2fc3_gsample" - } - layer { - fully_connected { - # num_neurons: 5 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] - has_bias: true - } - name: "gen2fc4_gsample" - data_layout: "data_parallel" - weights: "gen2fc4linearity" - parents: "gen2relu3_gsample" - } - layer { - name: "cycx_minus_x" - data_layout: "data_parallel" - parents: "gen2fc4_gsample param_data_id" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - abs { - } - name: "L_cyc_x" - data_layout: "data_parallel" - device_allocation: "cpu" - parents: "cycx_minus_x" - } - layer { - name: "L_cyc_x_eval" - data_layout: "data_parallel" - parents: "L_cyc_x" - evaluation { - } - } - layer { - name: "gsample2_minus_x" - data_layout: "data_parallel" - parents: "gen2fc4_y param_data_id" - weighted_sum { - scaling_factors: "1 -1" - } - } - layer { - l2_norm2 { - } - name: "l_l2_x" - data_layout: "data_parallel" - device_allocation: "cpu" 
- parents: "gsample2_minus_x" - } - weights { - name: "gen2fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen2fc4linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc1linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc2linearity" - he_normal_initializer { - } - } - weights { - name: "disc2fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc1linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc2linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc3linearity" - he_normal_initializer { - } - } - weights { - name: "gen1fc4linearity" - he_normal_initializer { - } - } - mini_batch_size: 64 - callback { - print { - interval: 1 - } - } - #callback { - # dump_outputs { - # directory: "/dir/to/dump_x_activations/" - # layers: "param_data_id gen2fc4_1" - # execution_modes: "test" - # } - #} - block_size: 256 - super_steps: 10000 - num_batches: 1 -} diff --git a/model_zoo/models/jag/cycle_gan/cycgan_m3_template.prototext b/model_zoo/models/jag/cycle_gan/cycgan_m3_template.prototext deleted file mode 100644 index e7dd209e092..00000000000 --- a/model_zoo/models/jag/cycle_gan/cycgan_m3_template.prototext +++ /dev/null @@ -1,65 +0,0 @@ -model { - data_layout: "data_parallel" - mini_batch_size: 64 - block_size: 256 - super_steps: 10000 - num_batches: 1 - num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { - scale_factor: 0.05 - layer: "g_adv2_eval" - } - layer_term { - scale_factor: 0.025 - layer: "l_l2_x" - } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - #metric { - # layer_metric { - # layer: "dis_eval_t" - # } - # layer_metric { - # layer: "dis_eval_f" - # } - #} - - ################################################### - # Callbacks - ################################################### - callback { - print { - interval: 1 - } - } - #callback { timer {} } - - callback { - dump_outputs { - directory: "/dir/to/dump_x_activations/" - layers: "param_data_id gen2fc4_1" - execution_modes: "test" - } - } - - - ################################################### - # start of layers - ################################################### - -} diff --git a/model_zoo/models/jag/cycle_gan/generate_cycgan_m1.py b/model_zoo/models/jag/cycle_gan/generate_cycgan_m1.py deleted file mode 100644 index c089a2a49ca..00000000000 --- a/model_zoo/models/jag/cycle_gan/generate_cycgan_m1.py +++ /dev/null @@ -1,253 +0,0 @@ -import sys -import os -import subprocess -import functools - -# Parameters -lbann_dir = subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).strip() -lbann_proto_dir = lbann_dir + '/src/proto/' -work_dir = lbann_dir + '/model_zoo/models/gan/jags/cycle_gan' -template_proto = lbann_dir + '/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext' -output_proto = lbann_dir + '/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext' - -# Convert a list into a space-separated string -def str_list(l): - if isinstance(l, list): - return ' '.join(str(i) for i in l) - elif isinstance(l, str): - return l - else: - raise 
TypeError('str_list expects a list or a string') - -# Construct a new layer and add it to the model -def new_layer(model, name, parents, layer_type, layout = 'data_parallel'): - l = model.layer.add() - l.name = name - l.data_layout = layout - l.parents = str_list(parents) - #l.device_allocation = device - exec('l.' + layer_type + '.SetInParent()') - return l - -# Construct a new set of weights and add it to the model -def new_weights(model, name, initializer = 'constant_initializer'): - w = model.weights.add() - w.name = name - exec('w.' + initializer + '.SetInParent()') - return w - -# Discriminator -#@todo: clean up, tag may not be needed -#Weight sharing on the same branch (D1) or (D2) -def add_discriminator(model,disc_input, prefix, freeze=False, add_weight=True, tag=''): - #Shared weights for same path (e.g. D1 fake and D1 real) - w1 = prefix+'fc1' - w2 = prefix+'fc2' - w3 = prefix+'fc3' - - fc1 = w1+tag - fc2 = w2+tag - fc3 = w3+tag - - - relu1 = prefix+'relu1'+tag - relu2 = prefix+'relu2'+tag - - l = new_layer(model, fc1, disc_input,'fully_connected') - l.fully_connected.num_neurons = 128 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w1 + 'linearity', 'he_normal_initializer') - l.weights = w1 + 'linearity' - - l = new_layer(model, relu1, fc1,'relu') - - - l = new_layer(model, fc2, relu1,'fully_connected') - l.fully_connected.num_neurons = 16 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w2 + 'linearity', 'he_normal_initializer') - l.weights = w2 + 'linearity' - - l = new_layer(model, relu2, fc2,'relu') - - l = new_layer(model, fc3, relu2, 'fully_connected') - l.fully_connected.num_neurons = 1 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w3 + 'linearity', 'he_normal_initializer') - l.weights = w3 + 'linearity' - return fc3 - - -#Generator -#Weight frozen, no weight sharing -#todo, handle weight sharing -def add_generator(model, gen_input, prefix, output_dim, freeze=False, add_dropout=True, tag=''): - #different weights - fc1 = prefix+'fc1'+tag - fc2 = prefix+'fc2'+tag - fc3 = prefix+'fc3'+tag - fc4 = prefix+'fc4'+tag - - relu1 = prefix+'relu1'+tag - relu2 = prefix+'relu2'+tag - relu3 = prefix+'relu3'+tag - - dropout1 = prefix+'dropout1'+tag - - l = new_layer(model, fc1, gen_input,'fully_connected') - l.fully_connected.num_neurons = 16 - l.fully_connected.has_bias = True - l.freeze = freeze - w = new_weights(model, fc1 + 'linearity', 'he_normal_initializer') - l.weights = fc1 + 'linearity' - - l = new_layer(model, relu1, fc1,'relu') - - l = new_layer(model, fc2, relu1,'fully_connected') - l.fully_connected.num_neurons = 128 - l.fully_connected.has_bias = True - l.freeze = freeze - w = new_weights(model, fc2 + 'linearity', 'he_normal_initializer') - l.weights = fc2 + 'linearity' - - l = new_layer(model, relu2, fc2,'relu') - next_parent = relu2 - if(add_dropout): - l = new_layer(model,dropout1,next_parent, 'dropout') - l.dropout.keep_prob = 0.8 - next_parent=dropout1 - - l = new_layer(model, fc3, next_parent, 'fully_connected') - l.fully_connected.num_neurons = 512 - l.fully_connected.has_bias = True - l.freeze = freeze - w = new_weights(model, fc3 + 'linearity', 'he_normal_initializer') - l.weights = fc3 + 'linearity' - - l = new_layer(model, relu3, fc3, 'relu') - - l = new_layer(model, fc4, relu3, 'fully_connected') - l.fully_connected.num_neurons = output_dim - l.fully_connected.has_bias = True - l.freeze = freeze - w = 
new_weights(model, fc4 + 'linearity', 'he_normal_initializer') - l.weights = fc4 + 'linearity' - - return fc4 - - -# Configure a prototext model (e.g. add layers) -def configure_model(model): - - #####INPUT DATA (including Slices) - ### Input data comes from merge features of image (Y) and param (X) - l = new_layer(model,'data',' ', 'input') - - slice_points = [0,2500,2511] - l = new_layer(model, 'slice_data','data', 'slice') - l.children = 'image_data_dummy param_data_id' - l.slice.slice_points = str_list(slice_points) - - #Useful constants - zero = new_layer(model,'zero','','constant') - zero.constant.value = 0.0 - zero.constant.num_neurons = '1' - one = new_layer(model,'one','','constant') - one.constant.value = 1.0 - one.constant.num_neurons = '1' - - #ID Image (Y) data - l = new_layer(model,'image_data_dummy','slice_data','identity') - - #ID parameter data (X) - l = new_layer(model,'param_data_id','slice_data','identity') - - #D_Loss1 branch - #Fake path - #def add_generator(model, gen_input, prefix, output_dim, freeze=False, add_dropout=True, tag=''): - #freeze generator = True - #g_sample=generator1(x) - g_sample = add_generator(model, 'param_data_id','gen1', 2500, True,True) - #g_sample2= generator(y) - g_sample2 = add_generator(model,'image_data_dummy','gen2', 11, True,False) - - #True path (share weights with fake path discriminator) - #discriminator(y,x) - #data = y + x - D_real = add_discriminator(model, 'data','disc1',False, True, '_real') - #CONCAT - # Gsample + x - # - l = new_layer(model, 'concat_gsample_n_param','','concatenation') - l.parents = g_sample+' param_data_id' - #discriminator false path - #question: how to deal with weight sharing? - #discriminator(g_sample,x) - D_fake = add_discriminator(model,'concat_gsample_n_param','disc1',False, False, '_fake') - - #obectives here (D_real, D_fake) - - #D_loss2 branch - #Reconcatenate x+y - l = new_layer(model, 'concat_param_n_img','param_data_id image_data_dummy','concatenation') - - #D_real2 = discriminator2(x,y) - D_real2 = add_discriminator(model,'concat_param_n_img','disc2',False, True, '_real') - - #D_fake2 = discriminator2(G_sample2,y) - l = new_layer(model, 'concat_gsample2_n_img',g_sample2+ ' image_data_dummy','concatenation') - D_fake2 = add_discriminator(model,'concat_gsample2_n_img','disc2', False, False, '_fake') - - #Objective and evaluation layers here - l = new_layer(model, 'disc1_real_bce', [D_real, one.name], 'sigmoid_binary_cross_entropy') - l = new_layer(model, 'disc1_real_eval','disc1_real_bce', 'evaluation') - - l = new_layer(model, 'disc1_fake_bce', [D_fake, zero.name], 'sigmoid_binary_cross_entropy') - l = new_layer(model, 'disc1_fake_eval','disc1_fake_bce', 'evaluation') - - l = new_layer(model, 'disc2_real_bce', [D_real2, one.name], 'sigmoid_binary_cross_entropy') - l = new_layer(model, 'disc2_real_eval','disc2_real_bce', 'evaluation') - - l = new_layer(model, 'disc2_fake_bce', [D_fake2, zero.name], 'sigmoid_binary_cross_entropy') - l = new_layer(model, 'disc2_fake_eval','disc2_fake_bce', 'evaluation') - - -if __name__ == "__main__": - - # Make sure protobuf Python implementation is built - host = subprocess.check_output('hostname').strip('\n1234567890') - protoc = lbann_dir + '/build/gnu.' + host + '.llnl.gov/install/bin/protoc' - proto_python_dir = lbann_dir + '/build/gnu.' 
+ host + '.llnl.gov/protobuf/src/python' - os.putenv('PROTOC', protoc) - subprocess.call('cd ' + proto_python_dir + '; ' - + sys.executable + ' ' - + proto_python_dir + '/setup.py build', - shell=True) - sys.path.append(proto_python_dir) - import google.protobuf.text_format as txtf - - # Compile LBANN protobuf - subprocess.call([protoc, - '-I=' + lbann_proto_dir, - '--python_out=' + work_dir, - lbann_proto_dir + '/lbann.proto']) - sys.path.append(work_dir) - global lbann_pb2 - import lbann_pb2 - - # Load template prototext - with open(template_proto, 'r') as f: - pb = txtf.Merge(f.read(), lbann_pb2.LbannPB()) - - # Configure prototext model - configure_model(pb.model) - - # Export prototext - with open(output_proto, 'w') as f: - f.write(txtf.MessageToString(pb)) diff --git a/model_zoo/models/jag/cycle_gan/generate_cycgan_m2.py b/model_zoo/models/jag/cycle_gan/generate_cycgan_m2.py deleted file mode 100644 index de8b704f877..00000000000 --- a/model_zoo/models/jag/cycle_gan/generate_cycgan_m2.py +++ /dev/null @@ -1,255 +0,0 @@ -import sys -import os -import subprocess -import functools - -# Parameters -lbann_dir = subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).strip() -lbann_proto_dir = lbann_dir + '/src/proto/' -work_dir = lbann_dir + '/model_zoo/models/gan/jags/cycle_gan' -template_proto = lbann_dir + '/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext' -output_proto = lbann_dir + '/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext' - -# Convert a list into a space-separated string -def str_list(l): - if isinstance(l, list): - return ' '.join(str(i) for i in l) - elif isinstance(l, str): - return l - else: - raise TypeError('str_list expects a list or a string') - -# Construct a new layer and add it to the model -def new_layer(model, name, parents, layer_type, layout = 'data_parallel'): - l = model.layer.add() - l.name = name - l.data_layout = layout - l.parents = str_list(parents) - #l.device_allocation = device - exec('l.' + layer_type + '.SetInParent()') - return l - -# Construct a new set of weights and add it to the model -def new_weights(model, name, initializer = 'constant_initializer'): - w = model.weights.add() - w.name = name - exec('w.' + initializer + '.SetInParent()') - return w - -# Discriminator -#@todo: clean up, tag may not be needed -#Weight sharing on the same branch (D1) or (D2) -def add_discriminator(model,disc_input, prefix, freeze=False, add_weight=True, tag=''): - #Shared weights for same path (e.g. 
D1 fake and D1 real) - w1 = prefix+'fc1' - w2 = prefix+'fc2' - w3 = prefix+'fc3' - - fc1 = w1+tag - fc2 = w2+tag - fc3 = w3+tag - - - relu1 = prefix+'relu1'+tag - relu2 = prefix+'relu2'+tag - - l = new_layer(model, fc1, disc_input,'fully_connected') - l.fully_connected.num_neurons = 128 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w1 + 'linearity', 'he_normal_initializer') - l.weights = w1 + 'linearity' - - l = new_layer(model, relu1, fc1,'relu') - - - l = new_layer(model, fc2, relu1,'fully_connected') - l.fully_connected.num_neurons = 16 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w2 + 'linearity', 'he_normal_initializer') - l.weights = w2 + 'linearity' - - l = new_layer(model, relu2, fc2,'relu') - - l = new_layer(model, fc3, relu2, 'fully_connected') - l.fully_connected.num_neurons = 1 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w3 + 'linearity', 'he_normal_initializer') - l.weights = w3 + 'linearity' - return fc3 - - -#Generator -#Weight frozen, no weight sharing -#todo, handle weight sharing -def add_generator(model, gen_input, prefix, output_dim, freeze=False, add_dropout=True, add_weight=True, tag=''): - - w1 = prefix+'fc1' - w2 = prefix+'fc2' - w3 = prefix+'fc3' - w4 = prefix+'fc4' - - fc1 = w1+tag - fc2 = w2+tag - fc3 = w3+tag - fc4 = w4+tag - - relu1 = prefix+'relu1'+tag - relu2 = prefix+'relu2'+tag - relu3 = prefix+'relu3'+tag - - dropout1 = prefix+'dropout1'+tag - - l = new_layer(model, fc1, gen_input,'fully_connected') - l.fully_connected.num_neurons = 16 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight): - w = new_weights(model, w1 + 'linearity', 'he_normal_initializer') - l.weights = w1 + 'linearity' - - l = new_layer(model, relu1, fc1,'relu') - - l = new_layer(model, fc2, relu1,'fully_connected') - l.fully_connected.num_neurons = 128 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight): - w = new_weights(model, w2 + 'linearity', 'he_normal_initializer') - l.weights = w2 + 'linearity' - - l = new_layer(model, relu2, fc2,'relu') - next_parent = relu2 - if(add_dropout): - l = new_layer(model,dropout1,next_parent, 'dropout') - l.dropout.keep_prob = 0.8 - next_parent=dropout1 - - l = new_layer(model, fc3, next_parent, 'fully_connected') - l.fully_connected.num_neurons = 512 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w3 + 'linearity', 'he_normal_initializer') - l.weights = w3 + 'linearity' - - l = new_layer(model, relu3, fc3, 'relu') - - l = new_layer(model, fc4, relu3, 'fully_connected') - l.fully_connected.num_neurons = output_dim - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w4 + 'linearity', 'he_normal_initializer') - l.weights = w4 + 'linearity' - - return fc4 - - -# Configure a prototext model (e.g. 
add layers) -def configure_model(model): - - #####INPUT DATA (including Slices) - ### Input data comes from merge features of image (Y) and param (X) - l = new_layer(model,'data',' ', 'input') - - slice_points = [0,2500,2511] - l = new_layer(model, 'slice_data','data', 'slice') - l.children = 'image_data_dummy param_data_id' - l.slice.slice_points = str_list(slice_points) - - #ID Image (Y) data - l = new_layer(model,'image_data_dummy','slice_data','identity') - - #ID parameter data (X) - l = new_layer(model,'param_data_id','slice_data','identity') - - #******************************************** - #g_sample=generator(x) - #do not freeze, train generator to confuse discriminator - #_1 => first generator1 to be added, to solve problem of all generator1 having the same name - g_sample = add_generator(model, 'param_data_id','gen1', 2500, False,True,True,'_1') - # g_adv1 = discriminator(g_sample,x) - l = new_layer(model, 'concat_gsample_n_param',g_sample+' param_data_id','concatenation') - #freeze discriminator, fake it as real - D_real = add_discriminator(model,'concat_gsample_n_param','disc1',True, True, '_real') - #objective function - one = new_layer(model,'one','','constant') - one.constant.value = 1.0 - one.constant.num_neurons = '1' - l = new_layer(model, 'g_adv1_bce', [D_real, one.name], 'sigmoid_binary_cross_entropy') - l = new_layer(model, 'g_adv1_eval','g_adv1_bce', 'evaluation') - - #************************************************ - #g_sample2= generator2(y) //freeze - g_sample2 = add_generator(model,'image_data_dummy','gen2', 11, True,False,True,'_y') - #G_cyc_y = generator(G_sample2) //same generator as line 167? shared weights? train - #Dont add weights, share weights with _1 - G_cyc_y = add_generator(model,g_sample2,'gen1',2500,False,True,False,'_2') - #G_cyc_y - y - l = new_layer(model,'cycy_minus_y',G_cyc_y + ' image_data_dummy','weighted_sum') - l.weighted_sum.scaling_factors = '1 -1' - #abs(x) x= G_cyc_y - y = cycy_minus_y - l = new_layer(model,'L_cyc_y', 'cycy_minus_y', 'abs') - l = new_layer(model, 'L_cyc_y_eval','L_cyc_y', 'evaluation') - #+++++++++++++ - #G_cyc_x = generator2(G_sample) //freeze, shared weights with previous but not name - G_cyc_x = add_generator(model,g_sample,'gen2', 11, True,False,False,'_gsample') - #G_cyc_x - x - l = new_layer(model,'cycx_minus_x',G_cyc_x + ' param_data_id','weighted_sum') - l.weighted_sum.scaling_factors = '1 -1' - #abs(x) x= G_cyc_x - x = cycx_minus_x - l = new_layer(model,'L_cyc_x', 'cycx_minus_x', 'abs') - l = new_layer(model, 'L_cyc_x_eval','L_cyc_x', 'evaluation') - - #****************************************************** - #L_cyc = L_cyc_y + L_cyc_x - #l = new_layer(model, 'L_cyc', 'L_cyc_y L_cyc_x', 'weighted_sum') - #l.weighted_sum.scaling_factors = '1 1' - #l = new_layer(model, 'L_cyc_eval','L_cyc', 'evaluation') - #****************************************************** - #****************************************************** - #l2_norm(gsample - y) - l = new_layer(model, 'gsample_minus_y', g_sample+' image_data_dummy','weighted_sum') - l.weighted_sum.scaling_factors = '1 -1' - - l = new_layer(model, 'l_l2_y', 'gsample_minus_y', 'l2_norm2') - -if __name__ == "__main__": - - # Make sure protobuf Python implementation is built - host = subprocess.check_output('hostname').strip('\n1234567890') - protoc = lbann_dir + '/build/gnu.' + host + '.llnl.gov/install/bin/protoc' - proto_python_dir = lbann_dir + '/build/gnu.' 
+ host + '.llnl.gov/protobuf/src/python' - os.putenv('PROTOC', protoc) - subprocess.call('cd ' + proto_python_dir + '; ' - + sys.executable + ' ' - + proto_python_dir + '/setup.py build', - shell=True) - sys.path.append(proto_python_dir) - import google.protobuf.text_format as txtf - - # Compile LBANN protobuf - subprocess.call([protoc, - '-I=' + lbann_proto_dir, - '--python_out=' + work_dir, - lbann_proto_dir + '/lbann.proto']) - sys.path.append(work_dir) - global lbann_pb2 - import lbann_pb2 - - # Load template prototext - with open(template_proto, 'r') as f: - pb = txtf.Merge(f.read(), lbann_pb2.LbannPB()) - - # Configure prototext model - configure_model(pb.model) - - # Export prototext - with open(output_proto, 'w') as f: - f.write(txtf.MessageToString(pb)) diff --git a/model_zoo/models/jag/cycle_gan/generate_cycgan_m3.py b/model_zoo/models/jag/cycle_gan/generate_cycgan_m3.py deleted file mode 100644 index 3a14b8b6da5..00000000000 --- a/model_zoo/models/jag/cycle_gan/generate_cycgan_m3.py +++ /dev/null @@ -1,257 +0,0 @@ -import sys -import os -import subprocess -import functools - -# Parameters -lbann_dir = subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).strip() -lbann_proto_dir = lbann_dir + '/src/proto/' -work_dir = lbann_dir + '/model_zoo/models/gan/jags/cycle_gan' -template_proto = lbann_dir + '/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext' -output_proto = lbann_dir + '/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext' - -# Convert a list into a space-separated string -def str_list(l): - if isinstance(l, list): - return ' '.join(str(i) for i in l) - elif isinstance(l, str): - return l - else: - raise TypeError('str_list expects a list or a string') - -# Construct a new layer and add it to the model -def new_layer(model, name, parents, layer_type, layout = 'data_parallel'): - l = model.layer.add() - l.name = name - l.data_layout = layout - l.parents = str_list(parents) - #l.device_allocation = device - exec('l.' + layer_type + '.SetInParent()') - return l - -# Construct a new set of weights and add it to the model -def new_weights(model, name, initializer = 'constant_initializer'): - w = model.weights.add() - w.name = name - exec('w.' + initializer + '.SetInParent()') - return w - -# Discriminator -#@todo: clean up, tag may not be needed -#Weight sharing on the same branch (D1) or (D2) -def add_discriminator(model,disc_input, prefix, freeze=False, add_weight=True, tag=''): - #Shared weights for same path (e.g. 
D1 fake and D1 real) - w1 = prefix+'fc1' - w2 = prefix+'fc2' - w3 = prefix+'fc3' - - fc1 = w1+tag - fc2 = w2+tag - fc3 = w3+tag - - - relu1 = prefix+'relu1'+tag - relu2 = prefix+'relu2'+tag - - l = new_layer(model, fc1, disc_input,'fully_connected') - l.fully_connected.num_neurons = 128 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w1 + 'linearity', 'he_normal_initializer') - l.weights = w1 + 'linearity' - - l = new_layer(model, relu1, fc1,'relu') - - - l = new_layer(model, fc2, relu1,'fully_connected') - l.fully_connected.num_neurons = 16 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w2 + 'linearity', 'he_normal_initializer') - l.weights = w2 + 'linearity' - - l = new_layer(model, relu2, fc2,'relu') - - l = new_layer(model, fc3, relu2, 'fully_connected') - l.fully_connected.num_neurons = 1 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w3 + 'linearity', 'he_normal_initializer') - l.weights = w3 + 'linearity' - return fc3 - - -#Generator -#Weight frozen, no weight sharing -#todo, handle weight sharing -def add_generator(model, gen_input, prefix, output_dim, freeze=False, add_dropout=True, add_weight=True, tag=''): - - w1 = prefix+'fc1' - w2 = prefix+'fc2' - w3 = prefix+'fc3' - w4 = prefix+'fc4' - - fc1 = w1+tag - fc2 = w2+tag - fc3 = w3+tag - fc4 = w4+tag - - relu1 = prefix+'relu1'+tag - relu2 = prefix+'relu2'+tag - relu3 = prefix+'relu3'+tag - - dropout1 = prefix+'dropout1'+tag - - l = new_layer(model, fc1, gen_input,'fully_connected') - l.fully_connected.num_neurons = 16 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight): - w = new_weights(model, w1 + 'linearity', 'he_normal_initializer') - l.weights = w1 + 'linearity' - - l = new_layer(model, relu1, fc1,'relu') - - l = new_layer(model, fc2, relu1,'fully_connected') - l.fully_connected.num_neurons = 128 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight): - w = new_weights(model, w2 + 'linearity', 'he_normal_initializer') - l.weights = w2 + 'linearity' - - l = new_layer(model, relu2, fc2,'relu') - next_parent = relu2 - if(add_dropout): - l = new_layer(model,dropout1,next_parent, 'dropout') - l.dropout.keep_prob = 0.8 - next_parent=dropout1 - - l = new_layer(model, fc3, next_parent, 'fully_connected') - l.fully_connected.num_neurons = 512 - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w3 + 'linearity', 'he_normal_initializer') - l.weights = w3 + 'linearity' - - l = new_layer(model, relu3, fc3, 'relu') - - l = new_layer(model, fc4, relu3, 'fully_connected') - l.fully_connected.num_neurons = output_dim - l.fully_connected.has_bias = True - l.freeze = freeze - if(add_weight) : - w = new_weights(model, w4 + 'linearity', 'he_normal_initializer') - l.weights = w4 + 'linearity' - - return fc4 - - -# Configure a prototext model (e.g. 
add layers) -def configure_model(model): - - #####INPUT DATA (including Slices) - ### Input data comes from merge features of image (Y) and param (X) - l = new_layer(model,'data',' ', 'input') - - slice_points = [0,2500,2511] - l = new_layer(model, 'slice_data','data', 'slice') - l.children = 'image_data_dummy param_data_id' - l.slice.slice_points = str_list(slice_points) - - #ID Image (Y) data - l = new_layer(model,'image_data_dummy','slice_data','identity') - - #ID parameter data (X) - l = new_layer(model,'param_data_id','slice_data','identity') - - #******************************************** - #g_sample2=generator2(y) - #do not freeze, train generator to confuse discriminator - #_1 => first generator1 to be added, to solve problem of all generator2 having the same name - g_sample2 = add_generator(model, 'image_data_dummy','gen2', 11, False,False,True,'_1') - # g_adv21 = discriminator2(g_sample2,y) - l = new_layer(model, 'concat_gsample2_n_img',g_sample2+' image_data_dummy','concatenation') - #freeze discriminator, fake it as real - D_real = add_discriminator(model,'concat_gsample2_n_img','disc2',True, True, '_real') - #objective function - one = new_layer(model,'one','','constant') - one.constant.value = 1.0 - one.constant.num_neurons = '1' - l = new_layer(model, 'g_adv2_bce', [D_real, one.name], 'sigmoid_binary_cross_entropy') - l = new_layer(model, 'g_adv2_eval','g_adv2_bce', 'evaluation') - - #************************************************ - #g_sample2= generator2(y) //train - g_sample2 = add_generator(model,'image_data_dummy','gen2', 11, False,False,False,'_y') - #G_cyc_y = generator(G_sample2) //same generator as line 167? shared weights? train - #Dont add weights, share weights with _1 - G_cyc_y = add_generator(model,g_sample2,'gen1',2500,True,True,False,'_2') - #G_cyc_y - y - l = new_layer(model,'cycy_minus_y',G_cyc_y + ' image_data_dummy','weighted_sum') - l.weighted_sum.scaling_factors = '1 -1' - #abs(x) x= G_cyc_y - y = cycy_minus_y - l = new_layer(model,'L_cyc_y', 'cycy_minus_y', 'abs') - l = new_layer(model, 'L_cyc_y_eval','L_cyc_y', 'evaluation') - #+++++++++++++ - #g_sample=generator(x) - g_sample = add_generator(model,'param_data_id','gen1',2500,True,True,True,'_1') - #G_cyc_x = generator2(G_sample) //freeze, shared weights with previous but not name - G_cyc_x = add_generator(model,g_sample,'gen2', 11, False,False,False,'_gsample') - #G_cyc_x - x - l = new_layer(model,'cycx_minus_x',G_cyc_x + ' param_data_id','weighted_sum') - l.weighted_sum.scaling_factors = '1 -1' - #abs(x) x= G_cyc_x - x = cycx_minus_x - l = new_layer(model,'L_cyc_x', 'cycx_minus_x', 'abs') - l = new_layer(model, 'L_cyc_x_eval','L_cyc_x', 'evaluation') - - #****************************************************** - #L_cyc = L_cyc_y + L_cyc_x - #l = new_layer(model, 'L_cyc', 'L_cyc_y L_cyc_x', 'weighted_sum') - #l.weighted_sum.scaling_factors = '1 1' - #l = new_layer(model, 'L_cyc_eval','L_cyc', 'evaluation') - #****************************************************** - #****************************************************** - #l2_norm(gsample2 - x) - l = new_layer(model, 'gsample2_minus_x', g_sample2+' param_data_id','weighted_sum') - l.weighted_sum.scaling_factors = '1 -1' - - l = new_layer(model, 'l_l2_x', 'gsample2_minus_x', 'l2_norm2') - -if __name__ == "__main__": - - # Make sure protobuf Python implementation is built - host = subprocess.check_output('hostname').strip('\n1234567890') - protoc = lbann_dir + '/build/gnu.' 
+ host + '.llnl.gov/install/bin/protoc' - proto_python_dir = lbann_dir + '/build/gnu.' + host + '.llnl.gov/protobuf/src/python' - os.putenv('PROTOC', protoc) - subprocess.call('cd ' + proto_python_dir + '; ' - + sys.executable + ' ' - + proto_python_dir + '/setup.py build', - shell=True) - sys.path.append(proto_python_dir) - import google.protobuf.text_format as txtf - - # Compile LBANN protobuf - subprocess.call([protoc, - '-I=' + lbann_proto_dir, - '--python_out=' + work_dir, - lbann_proto_dir + '/lbann.proto']) - sys.path.append(work_dir) - global lbann_pb2 - import lbann_pb2 - - # Load template prototext - with open(template_proto, 'r') as f: - pb = txtf.Merge(f.read(), lbann_pb2.LbannPB()) - - # Configure prototext model - configure_model(pb.model) - - # Export prototext - with open(output_proto, 'w') as f: - f.write(txtf.MessageToString(pb)) diff --git a/model_zoo/models/jag/cycle_gan/jag_data.prototext b/model_zoo/models/jag/cycle_gan/jag_data.prototext deleted file mode 100644 index d6d1065206f..00000000000 --- a/model_zoo/models/jag/cycle_gan/jag_data.prototext +++ /dev/null @@ -1,25 +0,0 @@ -data_reader { - reader { - name: "merge_features" - format: "numpy" - role: "train" - shuffle: true - data_file_pattern: "/p/lscratchf/brainusr/datasets/jag/jag_train_*.npy" - validation_percent: 0 - percent_of_data_to_use: 1.0 - disable_responses: true - disable_labels: true - } - reader { - name: "merge_features" - format: "numpy" - role: "test" - shuffle: false - data_file_pattern: "/p/lscratchf/brainusr/datasets/jag/jag_test_*.npy" - validation_percent: 0 - #test first 16 samples only to match TF version - absolute_sample_count: 16 - disable_responses: true - disable_labels: true - } -} From 4563e4ba290daa33da52ac847235ef82f1820b25 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Mon, 10 Jun 2019 13:21:15 -0700 Subject: [PATCH 065/634] Initialize weights in the alphabetical order of their names for run-to-run reproducibility --- src/models/model.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/models/model.cpp b/src/models/model.cpp index 7d8f163d920..7a377b6da24 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -736,6 +736,14 @@ void model::setup_weights() { m_weights.end()); } + // For run-to-run reproducibility, make sure the weights are + // initialized in the same order no matter how they are ordered in + // the prototext file. 
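+  // (Illustration, not in the upstream patch: std::string::compare gives
+  // a lexicographic order, so a weight named "disc1fc1linearity" is always
+  // initialized before one named "gen1fc1linearity", no matter where each
+  // appears in the prototext.)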
+  std::sort(m_weights.begin(), m_weights.end(),
+            [](weights* const &x, weights* const &y) {
+              return x->get_name().compare(y->get_name()) < 0;
+            });
+
   // Setup weights
   for (auto* w : m_weights) { w->setup(); }

From 27162b257539b41a6fe23137d7578d863ca2caaa Mon Sep 17 00:00:00 2001
From: Sam Ade Jacobs
Date: Mon, 10 Jun 2019 15:55:19 -0700
Subject: [PATCH 066/634] Restore jag data and meta data reader, they now live
 in jag/wae_cycle_gan directory, update your experiment script as appropriate

---
 .../data_reader_jag_conduit_lassen.prototext  | 112 +++++++++++++++++
 .../data_reader_jag_conduit_lustre.prototext  | 116 ++++++++++++++++++
 .../jag/wae_cycle_gan/jag10k_data.prototext   |  25 ++++
 .../wae_cycle_gan/jag_100M_metadata.prototext | 115 +++++++++++++++++
 4 files changed, 368 insertions(+)
 create mode 100644 model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lassen.prototext
 create mode 100644 model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext
 create mode 100644 model_zoo/models/jag/wae_cycle_gan/jag10k_data.prototext
 create mode 100644 model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext

diff --git a/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lassen.prototext b/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lassen.prototext
new file mode 100644
index 00000000000..b0376077b5e
--- /dev/null
+++ b/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lassen.prototext
@@ -0,0 +1,112 @@
+########################################################################
+# The JAG normalization values were computed over the 10M + 1MA + 1MB random
+# pulls from the 100M data set. They are valid for the directories:
+# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B)
+# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B)
+# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B
+# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B)
+########################################################################
+
+data_reader {
+  requires_data_set_metadata: true
+
+  reader {
+    name: "jag_conduit"
+    role: "train"
+    shuffle: true
+    # change to a lustre path
+    data_filedir: "/p/gpfs1/brainusr/datasets/10MJAG/1M_A/"
+    index_list: "index.txt"
+    index_list_per_trainer: false
+    index_list_per_model: false
+
+    validation_percent: 0
+    absolute_sample_count: 0
+    percent_of_data_to_use: 1.0
+    disable_responses: true
+    disable_labels: true
+
+    num_labels: 5
+
+    image_preprocessor {
+      # assume fixed size of input images if cropper is not used
+      raw_width: 64
+      raw_height: 64
+      raw_num_channels: 4
+
+      normalizer {
+        disable: true
+        scale: false
+        subtract_mean: false
+        unit_variance: false
+        z_score: true
+      }
+
+      subtractor {
+        disable: true
+      }
+
+      cropper {
+        disable: true
+      }
+
+      colorizer {
+        disable: true
+      }
+
+      augmenter {
+        disable: true
+      }
+    }
+  }
+
+  reader {
+    name: "jag_conduit"
+    role: "test"
+    shuffle: true
+    # change to a lustre path
+    data_filedir: "/p/gpfs1/brainusr/datasets/10MJAG/1M_B"
+    index_list: "index.txt"
+    index_list_per_trainer: false
+    index_list_per_model: false
+
+    validation_percent: 0
+    absolute_sample_count: 0
+    percent_of_data_to_use: 1.0
+    disable_responses: true
+    disable_labels: true
+
+    num_labels: 5
+
+    image_preprocessor {
+      # assume fixed size of input images if cropper is not used
+      raw_width: 64
+      raw_height: 64
+      raw_num_channels: 4
+
+      normalizer {
+        disable: true
+        scale: false
+        subtract_mean: false
+        unit_variance: false
+        z_score: true
+      }
+
+      subtractor {
+        disable: true
+      }
+
+
cropper { + disable: true + } + + colorizer { + disable: true + } + + augmenter { + disable: true + } + } + } +} diff --git a/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext b/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext new file mode 100644 index 00000000000..82ac04f28e5 --- /dev/null +++ b/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext @@ -0,0 +1,116 @@ +######################################################################## +# The JAG normalization values were computed over the 10M + 1MA + 1MB random +# pulls from the 100M data set. They are valid for the directories: +# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B) +# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B +# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +######################################################################## + +data_reader { + requires_data_set_metadata: true + + reader { + name: "jag_conduit" + role: "train" + shuffle: true + # change to a lustre path + #data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/" + #index_list: "index.txt" + data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K4trainers/" + index_list: "100Kindex.txt" + index_list_per_trainer: false + index_list_per_model: false + + validation_percent: 0 + absolute_sample_count: 0 + percent_of_data_to_use: 1.0 + disable_responses: true + disable_labels: true + + num_labels: 5 + + image_preprocessor { + # assume fixed size of input images if cropper is not used + raw_width: 64 + raw_height: 64 + raw_num_channels: 4 + + normalizer { + disable: true + scale: false + subtract_mean: false + unit_variance: false + z_score: true + } + + subtractor { + disable: true + } + + cropper { + disable: true + } + + colorizer { + disable: true + } + + augmenter { + disable: true + } + } + } + + reader { + name: "jag_conduit" + role: "test" + shuffle: true + # change to a lustre path + data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_B/" + index_list: "index.txt" + #data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K16trainers/" + #index_list: "t1_sample_list.txt" + index_list_per_trainer: false + index_list_per_model: false + + validation_percent: 0 + absolute_sample_count: 0 + percent_of_data_to_use: 0.0005 + disable_responses: true + disable_labels: true + + num_labels: 5 + + image_preprocessor { + # assume fixed size of input images if cropper is not used + raw_width: 64 + raw_height: 64 + raw_num_channels: 4 + + normalizer { + disable: true + scale: false + subtract_mean: false + unit_variance: false + z_score: true + } + + subtractor { + disable: true + } + + cropper { + disable: true + } + + colorizer { + disable: true + } + + augmenter { + disable: true + } + } + } +} diff --git a/model_zoo/models/jag/wae_cycle_gan/jag10k_data.prototext b/model_zoo/models/jag/wae_cycle_gan/jag10k_data.prototext new file mode 100644 index 00000000000..f97b43e3031 --- /dev/null +++ b/model_zoo/models/jag/wae_cycle_gan/jag10k_data.prototext @@ -0,0 +1,25 @@ +data_reader { + reader { + name: "merge_features" + format: "numpy" + role: "train" + shuffle: true + data_file_pattern: "/p/lscratchh/brainusr/datasets/jag/multichannel/jag10K_multi_train_*.npy" + validation_percent: 0 + percent_of_data_to_use: 1.0 + disable_responses: true + disable_labels: true + } + reader { + name: "merge_features" + format: "numpy" + role: "test" + shuffle: false + data_file_pattern: 
"/p/lscratchh/brainusr/datasets/jag/multichannel/jag10K_multi_test_*.npy" + validation_percent: 0 + #test first 16 samples only to match TF version + absolute_sample_count: 100 + disable_responses: true + disable_labels: true + } +} diff --git a/model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext b/model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext new file mode 100644 index 00000000000..1643b6db51a --- /dev/null +++ b/model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext @@ -0,0 +1,115 @@ +######################################################################## +# The JAG normalization values were computed over the 10M + 1MA + 1MB random +# pulls from the 100M data set. The image normalization values were updated +# on 1/30/2019 using the per-channel average of the pixel values +# across all views. +# They are valid for the directories: +# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B) +# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B +# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +######################################################################## + +data_set_metadata { + schema { + split_jag_image_channels: true + + # JAG_Image, JAG_Scalar, JAG_Input + independent: [ { pieces: [ JAG_Image, JAG_Scalar ] }, { pieces: [ JAG_Input ] } ] + dependent: [ { pieces: [ JAG_Input ] } ] + + image_prefix: "/outputs/images/" + + jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] + + scalar_prefix: "/outputs/scalars/" + + # An empty list indicates to use all + # The commented out variables are not on the Jim's original list but used in the numpy-based format + jag_scalar_keys: + [ "BWx", + "BT", + "tMAXt", # absent in Jim's list + "BWn", + "MAXpressure", + #"BAte", + #"MAXtion", + "tMAXpressure", + "BAt", # absent in Jim's list + "Yn", + "Ye", + "Yx", + #"tMAXte", # absent in Jim's list + #"BAtion", + #"MAXte", + #"tMAXtion", # absent in Jim's list + "BTx", + "MAXt", # absent in Jim's list + #"BTn", + "BApressure", + "tMINradius", + "MINradius" # absent in Jim's list + ] + + # When using all the keys without explicit selection, key filters can be used + # to explicitly exclude the particular variables with keys that matches a filter. + # 'jag_scalar_filters' and 'jag_input_filters' rely on exact key string matching. + # 'jag_scalar_prefix_filters' and 'jag_input_prefix_filters' define a filter as + # the pair of a prefix substring and the minimum key length. + # For example, with the example below, any key that has a length no shorter + # than 26 and starts with the substring "image_(" is excluded. 
+ + jag_scalar_prefix_filters: [ { key_prefix: "image_(" min_len: 26} ] + jag_scalar_filters: [ "iBT" ] + + input_prefix: "/inputs/" + + jag_input_keys: ["shape_model_initial_modes:(4,3)", + "betti_prl15_trans_u", + "betti_prl15_trans_v", + "shape_model_initial_modes:(2,1)", + "shape_model_initial_modes:(1,0)"]; + } + + normalization { + jag_scalar_normalization_params: [ + { scale: 7.610738e+00 bias: -4.075375e-01 }, #BWx + { scale: 1.459875e+00 bias: -3.427656e+00 }, #BT + { scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXt + { scale: 4.375123e+01 bias: -1.593477e+00 }, #BWn + { scale: 1.685576e-06 bias: -5.330971e-01 }, #MAXpressure + #{ scale: 2.636422e-01 bias: -9.762907e-01 }, #BAte + #{ scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXtion + { scale: 1.430615e+00 bias: -3.351173e+00 }, #tMAXpressure + { scale: 2.636422e-01 bias: -9.762907e-01 }, #BAt + { scale: 7.154074e-18 bias: -1.864709e-02 }, #Yn + { scale: 3.166824e-03 bias: -1.864709e-02 }, #Ye + { scale: 2.102178e-02 bias: -3.071955e-01 }, #Yx + #{ scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXte + #{ scale: 2.636422e-01 bias: -9.762907e-01 }, #BAtion + #{ scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXte + #{ scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXtion + { scale: 1.346439e+00 bias: -3.118446e+00 }, #BTx + { scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXt + #{ scale: 1.459875e+00 bias: -3.427656e+00 }, #BTn + { scale: 2.061877e-06 bias: -5.213394e-01 }, #BApressure + { scale: 1.392544e+00 bias: -3.239921e+00 }, #tMINradius + { scale: 6.266253e-02 bias: -1.384504e+00 } #MINradius + ] + + jag_input_normalization_params: [ + { scale: 1.666672e+00 bias: 5.000000e-01 }, #shape_model_initial_modes:(4,3) + { scale: 1.000002e+00 bias: -1.603483e-07 }, #betti_prl15_trans_u + { scale: 1.000001e+00 bias: -1.406672e-06 }, #betti_prl15_trans_v + { scale: 1.666675e+00 bias: 4.999992e-01 }, #shape_model_initial_modes:(2,1) + { scale: 1.666669e+00 bias: 5.000008e-01 } #shape_model_initial_modes:(1,0) + ] + + jag_image_normalization_params: [ + { scale: 2.9258502e+01 bias: 0.0e+00 }, # avg = 0.0341781 + { scale: 8.5826596e+02 bias: 0.0e+00 }, # avg = 0.00116514 + { scale: 1.0004872e+05 bias: 0.0e+00 }, # avg = 9.99513e-06 + { scale: 4.8072070e+06 bias: 0.0e+00 } # avg = 2.08021e-07 + ] + } +} From b9945c6675e6f7988ce01e57e73dee0b49bf9cf3 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Tue, 11 Jun 2019 09:13:09 -0700 Subject: [PATCH 067/634] drivers to test ingest speed for JAG and HYDRA, and to convert HYDRA to contain only the data required by LBANN --- model_zoo/jag_utils/CMakeLists.txt | 7 + model_zoo/jag_utils/convert.cpp | 189 ++++++++++ model_zoo/jag_utils/test_reading_speed.cpp | 380 +++++++++++++++++++++ 3 files changed, 576 insertions(+) create mode 100644 model_zoo/jag_utils/convert.cpp create mode 100644 model_zoo/jag_utils/test_reading_speed.cpp diff --git a/model_zoo/jag_utils/CMakeLists.txt b/model_zoo/jag_utils/CMakeLists.txt index 9030bde2243..2a0140e2d32 100644 --- a/model_zoo/jag_utils/CMakeLists.txt +++ b/model_zoo/jag_utils/CMakeLists.txt @@ -54,3 +54,10 @@ target_link_libraries(generate_corrupt_samples-bin lbann ) set_target_properties(generate_corrupt_samples-bin PROPERTIES OUTPUT_NAME generate_corrupt_samples) + add_executable( test_reading_speed-bin test_reading_speed.cpp ) + target_link_libraries(test_reading_speed-bin lbann ) + set_target_properties(test_reading_speed-bin PROPERTIES OUTPUT_NAME test_reading_speed) + + add_executable( convert-bin convert.cpp ) + target_link_libraries(convert-bin lbann ) + set_target_properties(convert-bin PROPERTIES OUTPUT_NAME convert) diff --git a/model_zoo/jag_utils/convert.cpp b/model_zoo/jag_utils/convert.cpp new file mode 100644 index 00000000000..8710d695401 --- /dev/null +++ b/model_zoo/jag_utils/convert.cpp @@ -0,0 +1,189 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann_config.hpp" + +#include "conduit/conduit.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_io_hdf5.hpp" +#include +#include +#include +#include +#include +#include "lbann/lbann.hpp" +#include "lbann/utils/jag_utils.hpp" +#include +#include + +using namespace lbann; +using namespace std; + +vector get_input_names_jag(); +vector get_scalar_names_jag(); +vector get_image_names_jag(); +vector get_input_names_hydra(); +vector get_scalar_names_hydra(); +vector get_image_names_hydra(); +void test_hydra(string filename); +void test_jag(string filename); + +//========================================================================== +#define MAX_SAMPLES 10000 + +int main(int argc, char *argv[]) { + int random_seed = lbann_default_random_seed; + world_comm_ptr comm = initialize(argc, argv, random_seed); + + options *opts = options::get(); + opts->init(argc, argv); + + if (!(opts->has_string("filelist") && opts->has_string("output_dir") && opts->has_string("format"))) { + LBANN_ERROR("usage: test_speed_hydra_ --filelist= --output_dir= --format="); + } + + string filelist = opts->get_string("filelist"); + string format = opts->get_string("format"); + string output_dir = opts->get_string("output_dir"); + stringstream s; + s << "mkdir -p " << output_dir; + system(s.str().c_str()); + + hid_t hdf5_file_hnd; + std::string key; + conduit::Node n_ok; + conduit::Node tmp; + + vector input_names = get_input_names_hydra(); + vector scalar_names = get_scalar_names_hydra(); + vector image_names = get_image_names_hydra(); + + int num_samples = 0; + int num_files = 0; + ifstream in(filelist.c_str()); + int sample_id = 0; + string filename; + while (!in.eof()) { + getline(in, filename); + if (filename.size() < 2) { + continue; + } + ++num_files; + conduit::Node node; + hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( filename.c_str() ); + cout << "reading: " << filename << endl; + + size_t k = filename.rfind("/"); + stringstream s2; + s2 << output_dir << "/" << filename.substr(k+1); + + std::vector cnames; + conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", cnames); + cout << "samples per file: " << cnames.size() << endl; + + for (size_t i=0; i get_input_names_hydra() { + vector f; + f.push_back("p_preheat"); + f.push_back("sc_peak"); + f.push_back("t_3rd"); + f.push_back("t_end"); + return f; +} + +vector get_scalar_names_hydra() { + vector f; + f.push_back("avg_rhor"); + f.push_back("peak_eprod"); + f.push_back("peak_tion_bw_DT"); + f.push_back("bt_tion_bw_DT"); + f.push_back("avg_tion_bw_DT"); + f.push_back("adiabat"); + f.push_back("bangt"); + f.push_back("burnwidth"); + f.push_back("bt_rhor"); + f.push_back("bt_eprodr"); + f.push_back("peak_eprodr"); + return f; +} + +vector get_image_names_hydra() { + vector f; + f.push_back("(90,0)/bang/image/data"); + f.push_back("(0,0)/bang/image/data"); + return f; +} + + + diff --git a/model_zoo/jag_utils/test_reading_speed.cpp b/model_zoo/jag_utils/test_reading_speed.cpp new file mode 100644 index 00000000000..3f16b2d8fc7 --- /dev/null +++ b/model_zoo/jag_utils/test_reading_speed.cpp @@ -0,0 +1,380 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. 
+// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann_config.hpp" + +#include "conduit/conduit.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_io_hdf5.hpp" +#include +#include +#include +#include +#include +#include "lbann/lbann.hpp" +#include "lbann/utils/jag_utils.hpp" +#include +#include + +using namespace lbann; +using namespace std; + +vector get_input_names_jag(); +vector get_scalar_names_jag(); +vector get_image_names_jag(); +vector get_input_names_hydra(); +vector get_scalar_names_hydra(); +vector get_image_names_hydra(); +void test_hydra(string filename); +void test_jag(string filename); + +//========================================================================== +#define MAX_SAMPLES 10000 + +int main(int argc, char *argv[]) { + int random_seed = lbann_default_random_seed; + world_comm_ptr comm = initialize(argc, argv, random_seed); + + options *opts = options::get(); + opts->init(argc, argv); + + if (!(opts->has_string("filelist") && opts->has_int("jag"))) { + LBANN_ERROR("usage: test_speed_hydra_ --filelist= --jag=<0|1>"); + } + + if (opts->get_int("jag")) { + test_jag(opts->get_string("filelist")); + } else { + test_hydra(opts->get_string("filelist")); + } + return EXIT_SUCCESS; +} + +vector get_input_names_hydra() { + vector f; + f.push_back("p_preheat"); + f.push_back("sc_peak"); + f.push_back("t_3rd"); + f.push_back("t_end"); + return f; +} + +vector get_scalar_names_hydra() { + vector f; + f.push_back("avg_rhor"); + f.push_back("peak_eprod"); + f.push_back("peak_tion_bw_DT"); + f.push_back("bt_tion_bw_DT"); + f.push_back("avg_tion_bw_DT"); + f.push_back("adiabat"); + f.push_back("bangt"); + f.push_back("burnwidth"); + f.push_back("bt_rhor"); + f.push_back("bt_eprodr"); + f.push_back("peak_eprodr"); + return f; +} + +vector get_image_names_hydra() { + vector f; + f.push_back("(90,0)/bang/image/data"); + f.push_back("(0,0)/bang/image/data"); + return f; +} + +vector get_input_names_jag() { + vector f; + f.push_back("shape_model_initial_modes:(4,3)"); + f.push_back("betti_prl15_trans_u"); + f.push_back("betti_prl15_trans_v"); + f.push_back("shape_model_initial_modes:(2,1)"); + f.push_back("shape_model_initial_modes:(1,0)"); + return f; +} + +vector get_scalar_names_jag() { + vector f; + f.push_back("BWx"); + f.push_back("BT"); + f.push_back("tMAXt"); + f.push_back("BWn"); + f.push_back("MAXpressure"); + f.push_back("BAte"); + f.push_back("MAXtion"); + f.push_back("tMAXpressure"); + f.push_back("BAt"); + f.push_back("Yn"); + f.push_back("Ye"); + f.push_back("Yx"); + f.push_back("tMAXte"); + f.push_back("BAtion"); + f.push_back("MAXte"); + f.push_back("tMAXtion"); + f.push_back("BTx"); + f.push_back("MAXt"); + 
f.push_back("BTn"); + f.push_back("BApressure"); + f.push_back("tMINradius"); + f.push_back("MINradius"); + return f; +} + +vector get_image_names_jag() { + vector f; + f.push_back("(0.0, 0.0)/0.0/emi"); + f.push_back("(90.0, 0.0)/0.0/emi"); + f.push_back("(90.0, 78.0)/0.0/emi"); + return f; +} + +void test_hydra(string filename) { + double tm1 = get_time(); + hid_t hdf5_file_hnd; + std::string key; + conduit::Node n_ok; + conduit::Node tmp; + + vector input_names = get_input_names_hydra(); + vector scalar_names = get_scalar_names_hydra(); + vector image_names = get_image_names_hydra(); + + int num_samples = 0; + int num_files = 0; + double total = 0; + double bytes = 0; + ifstream in(filename.c_str()); + long sample_size = 0; + while (!in.eof()) { + getline(in, filename); + if (filename.size() < 2) { + continue; + } + ++num_files; + + try { + hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( filename.c_str() ); + } catch (...) { + LBANN_ERROR("failed to open " + filename + " for reading"); + } + cout << "reading: " << filename << endl; + + std::vector cnames; + try { + conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", cnames); + } catch (...) { + LBANN_ERROR("exception hdf5_group_list_child_names; " + filename); + } + cout << "samples per file: " << cnames.size() << endl; + + for (size_t i=0; i= MAX_SAMPLES) { + goto FINISHED; + } + } catch (...) { + LBANN_ERROR("error reading " + key + " from file " + filename); + } + } + } + } + +FINISHED: + + double tm2 = get_time(); + cout << "========================================================\n" + << "hydra test:\n"; + cout << "bytes per sample: " << sample_size << endl; + cout << "time: " << tm2 - tm1 << " num samples: " << num_samples << " num files: " << num_files << "\n" + << "num inputs: " << input_names.size() << " scalars: " << scalar_names.size() << endl; + cout << "num bytes: " << bytes << " time to read 1M bytes: " << (tm2 - tm1)/(bytes/1000000) << endl; + +} + +void test_jag(string filename) { +cout << "starting test_jag; filename: " << filename << endl; + double tm1 = get_time(); + hid_t hdf5_file_hnd; + std::string key; + conduit::Node n_ok; + conduit::Node tmp; + + vector input_names = get_input_names_jag(); + vector scalar_names = get_scalar_names_jag(); + vector image_names = get_image_names_jag(); + + int num_samples = 0; + int num_files = 0; + double total = 0; + double bytes = 0; + ifstream in(filename.c_str()); + if (!in) { + LBANN_ERROR("failed to open " + filename + " for reading\n"); + } + long sample_size = 0; + int bad_samples = 0; + while (!in.eof()) { + getline(in, filename); + if (filename.size() < 2) { + continue; + } + ++num_files; + cout << "reading: " << filename << endl; + + try { + hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( filename.c_str() ); + } catch (...) { + LBANN_ERROR("failed to open " + filename + " for reading"); + } + + std::vector cnames; + try { + conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", cnames); + } catch (...) { + LBANN_ERROR("exception hdf5_group_list_child_names; " + filename); + } + cout << "samples per file: " << cnames.size() << " num samples: " << num_samples << endl; + + for (size_t i=0; i= MAX_SAMPLES) { + goto FINISHED; + } + + } catch (...) 
{ + conduit::Node node; + conduit::relay::io::load(filename, "hdf5", node); + const conduit::Schema *s = node.schema_ptr(); + cerr << "KEY: " << key << endl; + s->print(); + LBANN_ERROR("error reading " + key + " from file " + filename); + } + } else { + ++bad_samples; + } + } + } + +FINISHED: + + double tm2 = get_time(); + cout << "========================================================\n" + << "jag test:\n"; + cout << "bytes per sample: " << sample_size << endl; + cout << "num bad samples: " << bad_samples << endl; + cout << "time: " << tm2 - tm1 << " num samples: " << num_samples << " num files: " << num_files << "\n" + << "num inputs: " << input_names.size() << " scalars: " << scalar_names.size() << endl; + cout << "num bytes: " << bytes << " time to read 1M bytes: " << (tm2 - tm1)/(bytes/1000000) << endl; + +} From a1b92f28eacf8e0b3069a40dba86770b941965f4 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Tue, 11 Jun 2019 09:46:27 -0700 Subject: [PATCH 068/634] initial commit --- .../jag/ae_cycle_gan/hydra_metadata.prototext | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 model_zoo/models/jag/ae_cycle_gan/hydra_metadata.prototext diff --git a/model_zoo/models/jag/ae_cycle_gan/hydra_metadata.prototext b/model_zoo/models/jag/ae_cycle_gan/hydra_metadata.prototext new file mode 100644 index 00000000000..ec1701f9042 --- /dev/null +++ b/model_zoo/models/jag/ae_cycle_gan/hydra_metadata.prototext @@ -0,0 +1,114 @@ +######################################################################## +# The HYDRA normalization values were computed over the +# the 00008 set of 100 files (10K samples), June, 2019 +# John Field cautions that the HYDRA schema will change in +# the future +######################################################################## + +data_set_metadata { + schema { + split_jag_image_channels: false + + # JAG_Image, JAG_Scalar, JAG_Input + independent: [ { pieces: [ JAG_Image, JAG_Scalar ] }, { pieces: [ JAG_Input ] } ] + dependent: [ { pieces: [ JAG_Input ] } ] + + image_prefix: "/images/" + + ## all hydra image keys: + # "(90,0)/bang/image/data" + # "(90,0)/0.03/image/data" + # "(90,0)/0.02/image/data" + # "(90,0)/0.01/image/data" + # + # "(0,0)/bang/image/data" + # "(0,0)/0.03/image/data" + # "(0,0)/0.02/image/data" + # "(0,0)/0.01/image/data" + jag_image_keys: ["(90,0)/bang/image/data", "(0,0)/bang/image/data"] + + scalar_prefix: "/scalars/" + + # An empty list indicates to use all + # The commented out variables are not on the Jim's original list but used in the numpy-based format + jag_scalar_keys: + [ "avg_rhor", + "peak_eprod", + "peak_tion_bw_DT", + "bt_tion_bw_DT", + "avg_tion_bw_DT", + "adiabat", + "bangt", + "burnwidth", + "bt_rhor", + "bt_eprodr", + "peak_eprodr" + ] + + # When using all the keys without explicit selection, key filters can be used + # to explicitly exclude the particular variables with keys that matches a filter. + # 'jag_scalar_filters' and 'jag_input_filters' rely on exact key string matching. + # 'jag_scalar_prefix_filters' and 'jag_input_prefix_filters' define a filter as + # the pair of a prefix substring and the minimum key length. + # For example, with the example below, any key that has a length no shorter + # than 26 and starts with the substring "image_(" is excluded. 
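+    # (Assumed convention, mirroring the JAG metadata above: each scale/bias
+    # pair in this file is applied as value' = scale * value + bias, so e.g.
+    # bangt, with scale 522.423 and bias -3.80809, maps its average of
+    # 0.00814444 to roughly 0.45.)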
+ + jag_scalar_prefix_filters: [ { key_prefix: "image_(" min_len: 26} ] + jag_scalar_filters: [ "iBT" ] + + input_prefix: "/inputs/" + + jag_input_keys: ["preheat", + "sc_peak", + "t_3rd", + "t_end" + ] + } + + jag_input_normalization_params: [ + { scale: 0.0337373 bias: -0.0105617 }, #p_preheat avg= 15.4355 + { scale: 1.04127 bias: 0.49368 }, #sc_peak avg= 0.00650919 + { scale: 1.00482 bias: 0.499533 }, #t_3rd avg= -0.0241983 + { scale: 1.00725 bias: 0.496931 } #t_end avg= -0.00750582 + ] + + jag_scalar_normalization_params: [ + { scale: 1.82482 bias: -0.511432 }, #avg_rhor avg= 0.529763 + { scale: 0.681226 bias: -0.0150223 }, #peak_eprod avg= 0.201066 + { scale: 0.198934 bias: -0.801525 }, #peak_tion_bw_DT avg= 6.37529 + { scale: 0.244173 bias: -0.604468 }, #bt_tion_bw_DT avg= 4.0855 + { scale: 0.269391 bias: -0.656678 }, #avg_tion_bw_DT avg= 3.91583 + { scale: 0.0492209 bias: -0.186354 }, #adiabat avg= 10.6166 + { scale: 522.423 bias: -3.80809 }, #bangt avg= 0.00814444 + { scale: 3787.06 bias: -0.274563 }, #burnwidth avg= 0.000173271 + { scale: 1.68807 bias: -0.510794 }, #bt_rhor avg= 0.578218 + { scale: 5.27623e-05 bias: -0.00320741 }, #bt_eprodr avg= 1572.53 + { scale: 5.21263e-05 bias: -0.00322019 } #peak_eprodr avg= 1587.55 + ] + + # image data shape is (3,3,64,64) + # from John Field: sets of three: {absorption, emission forward, + # and emission back} # Since we are in 1D, forward and back emission + # are the same. + jag_image_normalization_params: [ + { scale: 1.31227 bias: -5.2241e-05 }, #(90,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 }, #(90,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 }, #(90,0)/bang/image/data + { scale: 1.28446 bias: -0.18841 }, #(90,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 }, #(90,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 }, #(90,0)/bang/image/data + { scale: 1.44979 bias: -0.289003 }, #(90,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 }, #(90,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 }, #(90,0)/bang/image/data + { scale: 1.31227 bias: -5.2241e-05 } #(0,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 } #(0,0)/bang/image/data + { scale: 1.5386e-05 bias: 8.4296e-05 } #(0,0)/bang/image/data + { scale: 1.28446 bias: -0.18841 } #(0,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 } #(0,0)/bang/image/data + { scale: 4.06761e-05 bias: 1.03167e-06 } #(0,0)/bang/image/data + { scale: 1.44979 bias: -0.289003 } #(0,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 } #(0,0)/bang/image/data + { scale: 0.00024344 bias: 7.96652e-08 } #(0,0)/bang/image/data + ] + +} From 3e1bce967324a6f0a527201e90f120b0222fe47b Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Wed, 12 Jun 2019 09:12:07 -0700 Subject: [PATCH 069/634] Unit test model and data reader for JAG --- .../unit_tests/prototext/jag_reader.prototext | 112 +++++++++++++++ .../prototext/jag_single_layer_ae.prototext | 133 ++++++++++++++++++ 2 files changed, 245 insertions(+) create mode 100644 bamboo/unit_tests/prototext/jag_reader.prototext create mode 100644 bamboo/unit_tests/prototext/jag_single_layer_ae.prototext diff --git a/bamboo/unit_tests/prototext/jag_reader.prototext b/bamboo/unit_tests/prototext/jag_reader.prototext new file mode 100644 index 00000000000..443809ca8e4 --- /dev/null +++ b/bamboo/unit_tests/prototext/jag_reader.prototext @@ -0,0 +1,112 @@ +######################################################################## +# The JAG normalization values were computed over the 10M 
+ 1MA + 1MB random +# pulls from the 100M data set. They are valid for the directories: +# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B) +# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B +# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +######################################################################## + +data_reader { + requires_data_set_metadata: true + + reader { + name: "jag_conduit" + role: "train" + shuffle: true + data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K4trainers/" + index_list: "100Kindex.txt" + index_list_per_trainer: false + index_list_per_model: false + + validation_percent: 0 + absolute_sample_count: 0 + #Use 1000 of 100K samples + percent_of_data_to_use: 0.001 + disable_responses: true + disable_labels: true + + num_labels: 5 + + image_preprocessor { + # assume fixed size of input images if cropper is not used + raw_width: 64 + raw_height: 64 + raw_num_channels: 4 + + normalizer { + disable: true + scale: false + subtract_mean: false + unit_variance: false + z_score: true + } + + subtractor { + disable: true + } + + cropper { + disable: true + } + + colorizer { + disable: true + } + + augmenter { + disable: true + } + } + } + + reader { + name: "jag_conduit" + role: "test" + shuffle: false + # change to a lustre path + data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K16trainers/" + index_list: "t1_sample_list.txt" + index_list_per_trainer: false + index_list_per_model: false + + validation_percent: 0 + absolute_sample_count: 0 + percent_of_data_to_use: 0.005 + disable_responses: true + disable_labels: true + + num_labels: 5 + + image_preprocessor { + # assume fixed size of input images if cropper is not used + raw_width: 64 + raw_height: 64 + raw_num_channels: 4 + + normalizer { + disable: true + scale: false + subtract_mean: false + unit_variance: false + z_score: true + } + + subtractor { + disable: true + } + + cropper { + disable: true + } + + colorizer { + disable: true + } + + augmenter { + disable: true + } + } + } +} diff --git a/bamboo/unit_tests/prototext/jag_single_layer_ae.prototext b/bamboo/unit_tests/prototext/jag_single_layer_ae.prototext new file mode 100644 index 00000000000..c171bbdfbb9 --- /dev/null +++ b/bamboo/unit_tests/prototext/jag_single_layer_ae.prototext @@ -0,0 +1,133 @@ +#Unit test for JAG model and (particularly) data reader +#Run time for this example is about 2s per epoch on 16 nodes (32 tasks) +#Example on how to run: +#srun --nodes=16 --ntasks=32 build/gnu.Release.catalyst.llnl.gov/lbann/build/model_zoo/lbann --model=bamboo/unit_tests/prototext/jag_single_layer_ae.prototext --optimizer=model_zoo/optimizers/opt_adam.prototext --reader=bamboo/unit_tests/prototext/jag_reader.prototext --metadata=model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext +model { + name: "ae_model" + shareable_training_data_reader:false + serialize_io: true + data_layout: "data_parallel" + mini_batch_size: 128 + block_size: 256 + num_epochs: 4 + num_parallel_readers: 0 + procs_per_trainer: 0 + + ################################################### + # Objective function + ################################################### + + objective_function { + layer_term { layer: "img_loss" } + l2_weight_regularization { + scale_factor: 1e-4 + } + } + + ################################################### + # Metrics + ################################################### + + metric { + layer_metric { + name: "reconstr_loss" + layer: 
"img_loss" + } + } + ################################################### + # Callbacks + ################################################### + callback { + print { + interval: 1 + } + } + callback { timer {} } + + ################################################### + # start of layers + ################################################### + + # Data + layer { + input { + io_buffer: "partitioned" + target_mode: "N/A" + } + name: "data" + data_layout: "data_parallel" + parents: " " + } + layer { + name: "slice_data" + data_layout: "data_parallel" + parents: "data" + children: "image_data_dummy param_data_id" + slice { + get_slice_points_from_reader: "independent" + } + } + #Y (images + scalar) + layer { + identity { + } + name: "image_data_dummy" + data_layout: "data_parallel" + parents: "slice_data" + } + # X (params not used) + layer { + identity { + } + name: "param_data_id" + data_layout: "data_parallel" + parents: "slice_data" + } + ## Hidden layer + layer { + fully_connected { + num_neurons: 1024 + has_bias: true + } + name: "encodefc" + data_layout: "data_parallel" + parents: "image_data_dummy" + } + layer { + parents: "encodefc" + name: "encodeelu" + data_layout: "data_parallel" + elu {} + } + layer { + parents: "encodeelu" + name: "encodedropout" + data_layout: "data_parallel" + dropout { + keep_prob: 0.9 + } + } + + #Y'(reconstructed images and scalar) + layer { + parents: "encodedropout" + name: "decode" + data_layout: "data_parallel" + fully_connected { + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 1 ] + has_bias: true + } + } + # Loss/Metric layer + layer { + parents: "decode image_data_dummy" + name: "img_loss" + data_layout: "data_parallel" + mean_squared_error {} + } + + ################################################### + # end of layers + ################################################### +} From 85741838c7492101d30d417656d67fea9ca92921 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Wed, 12 Jun 2019 14:42:24 -0500 Subject: [PATCH 070/634] New preprocessing pipeline (#1014) * Bump OpenCV to 4.1.0; drop highgui and old version support. * Kill the mnist_siamese data reader. * Kill the imagenet_patches data reader. * Kill unneeded tests. * Kill patch processing. * Kill ancient image preprocessor. * Kill image_utils. Will be restored; temporarily breaks things. * Kill instantiation of data_reader_multi_images. This is now an ABC of triplet and can be refactored out later. * Kill old preprocessing pipeline. * Kill old preprocessing pipeline prototext. * Do not need to replace old preprocessing pipeline in jag_conduit. * Add OpenCV utilities. * Fix issues with OpenCV/old preproc pipeline removal. * Add image_utils, for loading/saving images. * Restore save_image. * Add initial version of new preprocessing pipeline. * Add new preprocessing pipeline to prototext. * Add transform pipeline to data reader. * Fix bug in to_lbann_layout. * Add fused normalize/to LBANN layout transform. * Support non-in-place normalize. * Restore preprocessing for MNIST and CIFAR10 data readers. * Restore loading/preprocessing to ImageNet/triplet/multihead siamese data readers. * Update data reader prototexts. ImageNet data reader changes to better match proper normalization. * Remove unneeded preprocessing for JAG reader. * Add debug-mode argument checking to fast_rand_int. * Fix off-by-1 that could lead to infinite loops. * Move setting expected output dims to image data reader. 
  This fixes a memory corruption issue caused by multiple threads setting the same thing.
* Kill unused lbann_data_generator.cpp.
* Make non-contiguous LBANN matrices error out.
* Make transform random number helpers static.
* random_resized_crop -> random_resized_crop_with_fixed_aspect_ratio; random_resized_aspect_ratio_crop -> random_resized_crop.
* Clarify to_lbann_layout rescaling.
* Rename image_utils.cpp/hpp to image.cpp/hpp.
* Rename opencv_utils.hpp to opencv.hpp.
* Fix unit test.
* Fix return type for get_linearized_size.
* Remove unused save_image method.
* Remove unused methods in ImageNet data reader.
* Fixed-size crops throw an exception if the input image is too small.
* Fix bug in random_crop where the wrong dimension was used.
* Update some docs/comments.
* Remove unneeded implementations of the transform_pipeline move operator/constructor.
* Update the JAG Conduit reader to work with the new transform pipeline.
  Note that transformation from cv::Mat to CPUMat is currently failing in
  the to_lbann_layout transformation object.
* Improve non-contiguous check.
* Restore data store functionality for the ImageNet data reader. Note I
  haven't actually tested this.
* Use the I/O generator for transform RNGs. Fixes an issue where transforms
  done by different I/O threads would have the same random number sequence.
* Address @benson31's comments on image.hpp/cpp.
* Update the JAG data reader to use a simple transform to repack the JAG
  image data from HDF5/Conduit HWC to CHW format, then use the scale and
  translate transformation to divide each channel by the average and add
  the channel offset values for normalization. Add classes for both the
  HWC-to-CHW and scale-and-translate transformations.
* Clean up dead code.
* Add an explicit move for create_datum_views to ensure that the contents
  of the view are not deep copied. Also remove some of the dump_outputs
  callbacks from the test code.
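For reference, a minimal sketch of assembling the new pipeline, in the spirit of the unit tests added here. This is illustrative rather than authoritative: the class and header names come from this patch's file list, but the constructor arguments (height/width order, sizes) and the ownership convention (add_transform taking a std::unique_ptr) are assumptions to verify against the new headers.

    #include <memory>

    #include "lbann/transforms/transform_pipeline.hpp"
    #include "lbann/transforms/vision/center_crop.hpp"
    #include "lbann/transforms/vision/resize.hpp"
    #include "lbann/transforms/vision/to_lbann_layout.hpp"

    // Build an evaluation-style vision pipeline: resize, crop the center,
    // then repack the interleaved (HWC) OpenCV image into LBANN's planar
    // CHW floating-point layout. Signatures assumed, not confirmed.
    lbann::transform::transform_pipeline make_eval_pipeline() {
      lbann::transform::transform_pipeline p;
      p.add_transform(std::make_unique<lbann::transform::resize>(256, 256));
      p.add_transform(std::make_unique<lbann::transform::center_crop>(224, 224));
      p.add_transform(std::make_unique<lbann::transform::to_lbann_layout>());
      return p;
    }

A data reader owns one such pipeline and applies it to each sample on the I/O threads, which is why the RNG change above (drawing transform randomness from the per-thread I/O generators) matters.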
--- CMakeLists.txt | 2 + include/lbann/CMakeLists.txt | 1 + include/lbann/data_readers/CMakeLists.txt | 18 - include/lbann/data_readers/cv_augmenter.hpp | 114 ---- include/lbann/data_readers/cv_colorizer.hpp | 81 --- include/lbann/data_readers/cv_cropper.hpp | 121 ----- include/lbann/data_readers/cv_decolorizer.hpp | 84 --- .../lbann/data_readers/cv_mean_extractor.hpp | 157 ------ include/lbann/data_readers/cv_normalizer.hpp | 399 -------------- include/lbann/data_readers/cv_process.hpp | 166 ------ .../lbann/data_readers/cv_process_patches.hpp | 83 --- include/lbann/data_readers/cv_resizer.hpp | 103 ---- include/lbann/data_readers/cv_subtractor.hpp | 171 ------ include/lbann/data_readers/cv_transform.hpp | 221 -------- include/lbann/data_readers/cv_utils.hpp | 498 ------------------ include/lbann/data_readers/data_reader.hpp | 24 +- .../lbann/data_readers/data_reader_csv.hpp | 1 - .../lbann/data_readers/data_reader_image.hpp | 9 +- .../data_readers/data_reader_imagenet.hpp | 21 +- .../data_reader_imagenet_patches.hpp | 74 --- .../lbann/data_readers/data_reader_jag.hpp | 5 - .../data_readers/data_reader_jag_conduit.hpp | 32 +- .../lbann/data_readers/data_reader_mnist.hpp | 1 - .../data_reader_mnist_siamese.hpp | 126 ----- .../data_readers/data_reader_multi_images.hpp | 5 +- .../data_reader_multihead_siamese.hpp | 5 +- .../data_readers/data_reader_triplet.hpp | 3 +- .../lbann/data_readers/image_preprocessor.hpp | 209 -------- include/lbann/data_readers/image_utils.hpp | 86 --- include/lbann/data_readers/opencv.hpp | 68 --- .../lbann/data_readers/opencv_extensions.hpp | 233 -------- .../data_readers/patchworks/CMakeLists.txt | 11 - .../patchworks/patchworks_ROI.hpp | 153 ------ .../patchworks_patch_descriptor.hpp | 186 ------- .../patchworks/patchworks_stats.hpp | 93 ---- include/lbann/lbann.hpp | 2 - include/lbann/proto/factories.hpp | 9 + .../lbann/proto/init_image_data_readers.hpp | 1 - include/lbann/transforms/CMakeLists.txt | 15 + include/lbann/transforms/normalize.hpp | 72 +++ .../transforms/repack_HWC_to_CHW_layout.hpp | 56 ++ .../sample_normalize.hpp} | 42 +- include/lbann/transforms/scale.hpp | 54 ++ .../lbann/transforms/scale_and_translate.hpp | 57 ++ include/lbann/transforms/transform.hpp | 112 ++++ .../lbann/transforms/transform_pipeline.hpp | 95 ++++ .../lbann/transforms/vision/CMakeLists.txt | 19 + .../lbann/transforms/vision/center_crop.hpp | 40 +- include/lbann/transforms/vision/colorize.hpp | 48 ++ include/lbann/transforms/vision/grayscale.hpp | 48 ++ .../transforms/vision/horizontal_flip.hpp | 55 ++ .../vision/normalize_to_lbann_layout.hpp | 73 +++ .../lbann/transforms/vision/random_affine.hpp | 77 +++ .../lbann/transforms/vision/random_crop.hpp | 54 ++ .../transforms/vision/random_resized_crop.hpp | 75 +++ ...m_resized_crop_with_fixed_aspect_ratio.hpp | 62 +++ include/lbann/transforms/vision/resize.hpp | 54 ++ .../transforms/vision/resized_center_crop.hpp | 57 ++ .../vision/to_lbann_layout.hpp} | 59 +-- .../lbann/transforms/vision/vertical_flip.hpp | 55 ++ include/lbann/utils/CMakeLists.txt | 2 + include/lbann/utils/image.hpp | 75 +++ include/lbann/utils/opencv.hpp | 118 +++++ include/lbann/utils/random.hpp | 6 + .../data_reader_cifar10.prototext | 44 +- .../data_reader_imagenet.prototext | 93 +--- .../data_readers/data_reader_mnist.prototext | 42 +- ...ata_reader_mnist_numpy_npz_int16.prototext | 42 +- .../data_reader_jag_conduit_lassen.prototext | 62 --- .../wae_cycle_gan/jag_100M_metadata.prototext | 4 + .../data_reader_imagenet_patches.prototext | 130 ----- 
.../jag_single_layer_ae.prototext | 115 ++++ src/CMakeLists.txt | 1 + src/data_readers/CMakeLists.txt | 19 - src/data_readers/cv_augmenter.cpp | 253 --------- src/data_readers/cv_colorizer.cpp | 94 ---- src/data_readers/cv_cropper.cpp | 196 ------- src/data_readers/cv_decolorizer.cpp | 97 ---- src/data_readers/cv_mean_extractor.cpp | 168 ------ src/data_readers/cv_normalizer.cpp | 342 ------------ src/data_readers/cv_process.cpp | 312 ----------- src/data_readers/cv_process_patches.cpp | 109 ---- src/data_readers/cv_resizer.cpp | 117 ---- src/data_readers/cv_subtractor.cpp | 393 -------------- src/data_readers/cv_utils.cpp | 112 ---- src/data_readers/data_reader_cifar10.cpp | 8 +- src/data_readers/data_reader_image.cpp | 12 +- src/data_readers/data_reader_imagenet.cpp | 124 +---- .../data_reader_imagenet_patches.cpp | 175 ------ src/data_readers/data_reader_jag.cpp | 30 +- src/data_readers/data_reader_jag_conduit.cpp | 188 ++----- src/data_readers/data_reader_mnist.cpp | 8 +- .../data_reader_mnist_siamese.cpp | 296 ----------- src/data_readers/data_reader_multi_images.cpp | 31 +- .../data_reader_multihead_siamese.cpp | 36 +- src/data_readers/data_reader_triplet.cpp | 30 +- src/data_readers/image_preprocessor.cpp | 335 ------------ src/data_readers/image_utils.cpp | 370 ------------- src/data_readers/lbann_data_generator.cpp | 104 ---- src/data_readers/patchworks/patchworks.cpp | 182 ------- .../patchworks/patchworks_ROI.cpp | 153 ------ .../patchworks_patch_descriptor.cpp | 270 ---------- .../patchworks/patchworks_stats.cpp | 147 ------ src/proto/factories/CMakeLists.txt | 1 + src/proto/factories/transform_factory.cpp | 133 +++++ src/proto/init_image_data_readers.cpp | 425 ++------------- src/proto/lbann.proto | 200 +++---- src/proto/proto_common.cpp | 39 +- .../patchworks => transforms}/CMakeLists.txt | 12 +- src/transforms/normalize.cpp | 105 ++++ src/transforms/repack_HWC_to_CHW_layout.cpp | 84 +++ src/transforms/sample_normalize.cpp | 49 ++ src/transforms/scale.cpp | 48 ++ src/transforms/scale_and_translate.cpp | 48 ++ src/transforms/transform_pipeline.cpp | 110 ++++ src/transforms/unit_test/CMakeLists.txt | 9 + src/transforms/unit_test/normalize_test.cpp | 96 ++++ .../unit_test/sample_normalize_test.cpp | 36 ++ src/transforms/unit_test/scale_test.cpp | 32 ++ .../unit_test/transform_pipeline_test.cpp | 38 ++ src/transforms/vision/CMakeLists.txt | 19 + src/transforms/vision/center_crop.cpp | 65 +++ src/transforms/vision/colorize.cpp | 48 ++ src/transforms/vision/grayscale.cpp | 48 ++ src/transforms/vision/horizontal_flip.cpp | 44 ++ .../vision/normalize_to_lbann_layout.cpp | 94 ++++ src/transforms/vision/random_affine.cpp | 103 ++++ src/transforms/vision/random_crop.cpp | 64 +++ src/transforms/vision/random_resized_crop.cpp | 93 ++++ ...m_resized_crop_with_fixed_aspect_ratio.cpp | 71 +++ src/transforms/vision/resize.cpp | 45 ++ src/transforms/vision/resized_center_crop.cpp | 67 +++ src/transforms/vision/to_lbann_layout.cpp | 80 +++ .../vision/unit_test/CMakeLists.txt | 18 + .../vision/unit_test/center_crop_test.cpp | 76 +++ .../vision/unit_test/colorize_test.cpp | 66 +++ .../vision/unit_test/grayscale_test.cpp | 66 +++ src/transforms/vision/unit_test/helper.hpp | 58 ++ .../vision/unit_test/horizontal_flip_test.cpp | 80 +++ .../vision/unit_test/random_affine_test.cpp | 100 ++++ .../vision/unit_test/random_crop_test.cpp | 64 +++ .../unit_test/random_resized_crop_test.cpp | 118 +++++ ...ized_crop_with_fixed_aspect_ratio_test.cpp | 188 +++++++ .../vision/unit_test/resize_test.cpp | 118 
+++++ .../unit_test/resized_center_crop_test.cpp | 188 +++++++ .../vision/unit_test/to_lbann_layout_test.cpp | 82 +++ .../unit_test/transform_pipeline_test.cpp | 45 ++ .../vision/unit_test/vertical_flip_test.cpp | 80 +++ src/transforms/vision/vertical_flip.cpp | 44 ++ src/utils/CMakeLists.txt | 1 + src/utils/image.cpp | 241 +++++++++ src/utils/unit_test/CMakeLists.txt | 1 + src/utils/unit_test/image_test.cpp | 51 ++ superbuild/opencv/AddOpenCVOptions.cmake | 2 +- superbuild/opencv/CMakeLists.txt | 2 +- tests/test_img_pipeline/CMakeLists.txt | 81 --- tests/test_img_pipeline/Mat.hpp | 1 - tests/test_img_pipeline/README.txt | 42 -- tests/test_img_pipeline/include | 1 - tests/test_img_pipeline/lbann | 1 - tests/test_img_pipeline/lbann_config.hpp | 1 - tests/test_img_pipeline/main.cpp | 350 ------------ tests/test_img_pipeline/src | 1 - tests/test_patchworks/CMakeLists.txt | 81 --- tests/test_patchworks/Mat.hpp | 1 - tests/test_patchworks/README.txt | 21 - tests/test_patchworks/include | 1 - tests/test_patchworks/lbann | 1 - tests/test_patchworks/lbann_config.hpp | 1 - tests/test_patchworks/main.cpp | 124 ----- tests/test_patchworks/patchworks_image.cpp | 198 ------- tests/test_patchworks/patchworks_image.hpp | 120 ----- tests/test_patchworks/patchworks_utils.cpp | 78 --- tests/test_patchworks/patchworks_utils.hpp | 26 - tests/test_patchworks/src | 1 - 175 files changed, 5179 insertions(+), 10247 deletions(-) delete mode 100644 include/lbann/data_readers/cv_augmenter.hpp delete mode 100644 include/lbann/data_readers/cv_colorizer.hpp delete mode 100644 include/lbann/data_readers/cv_cropper.hpp delete mode 100644 include/lbann/data_readers/cv_decolorizer.hpp delete mode 100644 include/lbann/data_readers/cv_mean_extractor.hpp delete mode 100644 include/lbann/data_readers/cv_normalizer.hpp delete mode 100644 include/lbann/data_readers/cv_process.hpp delete mode 100644 include/lbann/data_readers/cv_process_patches.hpp delete mode 100644 include/lbann/data_readers/cv_resizer.hpp delete mode 100644 include/lbann/data_readers/cv_subtractor.hpp delete mode 100644 include/lbann/data_readers/cv_transform.hpp delete mode 100644 include/lbann/data_readers/cv_utils.hpp delete mode 100644 include/lbann/data_readers/data_reader_imagenet_patches.hpp delete mode 100644 include/lbann/data_readers/data_reader_mnist_siamese.hpp delete mode 100644 include/lbann/data_readers/image_preprocessor.hpp delete mode 100644 include/lbann/data_readers/image_utils.hpp delete mode 100644 include/lbann/data_readers/opencv.hpp delete mode 100644 include/lbann/data_readers/opencv_extensions.hpp delete mode 100644 include/lbann/data_readers/patchworks/CMakeLists.txt delete mode 100644 include/lbann/data_readers/patchworks/patchworks_ROI.hpp delete mode 100644 include/lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp delete mode 100644 include/lbann/data_readers/patchworks/patchworks_stats.hpp create mode 100644 include/lbann/transforms/CMakeLists.txt create mode 100644 include/lbann/transforms/normalize.hpp create mode 100644 include/lbann/transforms/repack_HWC_to_CHW_layout.hpp rename include/lbann/{data_readers/patchworks/patchworks.hpp => transforms/sample_normalize.hpp} (60%) create mode 100644 include/lbann/transforms/scale.hpp create mode 100644 include/lbann/transforms/scale_and_translate.hpp create mode 100644 include/lbann/transforms/transform.hpp create mode 100644 include/lbann/transforms/transform_pipeline.hpp create mode 100644 include/lbann/transforms/vision/CMakeLists.txt rename 
src/data_readers/cv_transform.cpp => include/lbann/transforms/vision/center_crop.hpp (59%) create mode 100644 include/lbann/transforms/vision/colorize.hpp create mode 100644 include/lbann/transforms/vision/grayscale.hpp create mode 100644 include/lbann/transforms/vision/horizontal_flip.hpp create mode 100644 include/lbann/transforms/vision/normalize_to_lbann_layout.hpp create mode 100644 include/lbann/transforms/vision/random_affine.hpp create mode 100644 include/lbann/transforms/vision/random_crop.hpp create mode 100644 include/lbann/transforms/vision/random_resized_crop.hpp create mode 100644 include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp create mode 100644 include/lbann/transforms/vision/resize.hpp create mode 100644 include/lbann/transforms/vision/resized_center_crop.hpp rename include/lbann/{data_readers/patchworks/patchworks_common.hpp => transforms/vision/to_lbann_layout.hpp} (54%) create mode 100644 include/lbann/transforms/vision/vertical_flip.hpp create mode 100644 include/lbann/utils/image.hpp create mode 100644 include/lbann/utils/opencv.hpp delete mode 100644 model_zoo/models/siamese/siamese_alexnet/data_reader_imagenet_patches.prototext create mode 100644 model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext delete mode 100644 src/data_readers/cv_augmenter.cpp delete mode 100644 src/data_readers/cv_colorizer.cpp delete mode 100644 src/data_readers/cv_cropper.cpp delete mode 100644 src/data_readers/cv_decolorizer.cpp delete mode 100644 src/data_readers/cv_mean_extractor.cpp delete mode 100644 src/data_readers/cv_normalizer.cpp delete mode 100644 src/data_readers/cv_process.cpp delete mode 100644 src/data_readers/cv_process_patches.cpp delete mode 100644 src/data_readers/cv_resizer.cpp delete mode 100644 src/data_readers/cv_subtractor.cpp delete mode 100644 src/data_readers/cv_utils.cpp delete mode 100644 src/data_readers/data_reader_imagenet_patches.cpp delete mode 100644 src/data_readers/data_reader_mnist_siamese.cpp delete mode 100644 src/data_readers/image_preprocessor.cpp delete mode 100644 src/data_readers/image_utils.cpp delete mode 100644 src/data_readers/lbann_data_generator.cpp delete mode 100644 src/data_readers/patchworks/patchworks.cpp delete mode 100644 src/data_readers/patchworks/patchworks_ROI.cpp delete mode 100644 src/data_readers/patchworks/patchworks_patch_descriptor.cpp delete mode 100644 src/data_readers/patchworks/patchworks_stats.cpp create mode 100644 src/proto/factories/transform_factory.cpp rename src/{data_readers/patchworks => transforms}/CMakeLists.txt (52%) create mode 100644 src/transforms/normalize.cpp create mode 100644 src/transforms/repack_HWC_to_CHW_layout.cpp create mode 100644 src/transforms/sample_normalize.cpp create mode 100644 src/transforms/scale.cpp create mode 100644 src/transforms/scale_and_translate.cpp create mode 100644 src/transforms/transform_pipeline.cpp create mode 100644 src/transforms/unit_test/CMakeLists.txt create mode 100644 src/transforms/unit_test/normalize_test.cpp create mode 100644 src/transforms/unit_test/sample_normalize_test.cpp create mode 100644 src/transforms/unit_test/scale_test.cpp create mode 100644 src/transforms/unit_test/transform_pipeline_test.cpp create mode 100644 src/transforms/vision/CMakeLists.txt create mode 100644 src/transforms/vision/center_crop.cpp create mode 100644 src/transforms/vision/colorize.cpp create mode 100644 src/transforms/vision/grayscale.cpp create mode 100644 src/transforms/vision/horizontal_flip.cpp create mode 100644 
src/transforms/vision/normalize_to_lbann_layout.cpp create mode 100644 src/transforms/vision/random_affine.cpp create mode 100644 src/transforms/vision/random_crop.cpp create mode 100644 src/transforms/vision/random_resized_crop.cpp create mode 100644 src/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.cpp create mode 100644 src/transforms/vision/resize.cpp create mode 100644 src/transforms/vision/resized_center_crop.cpp create mode 100644 src/transforms/vision/to_lbann_layout.cpp create mode 100644 src/transforms/vision/unit_test/CMakeLists.txt create mode 100644 src/transforms/vision/unit_test/center_crop_test.cpp create mode 100644 src/transforms/vision/unit_test/colorize_test.cpp create mode 100644 src/transforms/vision/unit_test/grayscale_test.cpp create mode 100644 src/transforms/vision/unit_test/helper.hpp create mode 100644 src/transforms/vision/unit_test/horizontal_flip_test.cpp create mode 100644 src/transforms/vision/unit_test/random_affine_test.cpp create mode 100644 src/transforms/vision/unit_test/random_crop_test.cpp create mode 100644 src/transforms/vision/unit_test/random_resized_crop_test.cpp create mode 100644 src/transforms/vision/unit_test/random_resized_crop_with_fixed_aspect_ratio_test.cpp create mode 100644 src/transforms/vision/unit_test/resize_test.cpp create mode 100644 src/transforms/vision/unit_test/resized_center_crop_test.cpp create mode 100644 src/transforms/vision/unit_test/to_lbann_layout_test.cpp create mode 100644 src/transforms/vision/unit_test/transform_pipeline_test.cpp create mode 100644 src/transforms/vision/unit_test/vertical_flip_test.cpp create mode 100644 src/transforms/vision/vertical_flip.cpp create mode 100644 src/utils/image.cpp create mode 100644 src/utils/unit_test/image_test.cpp delete mode 100644 tests/test_img_pipeline/CMakeLists.txt delete mode 120000 tests/test_img_pipeline/Mat.hpp delete mode 100644 tests/test_img_pipeline/README.txt delete mode 120000 tests/test_img_pipeline/include delete mode 120000 tests/test_img_pipeline/lbann delete mode 120000 tests/test_img_pipeline/lbann_config.hpp delete mode 100644 tests/test_img_pipeline/main.cpp delete mode 120000 tests/test_img_pipeline/src delete mode 100644 tests/test_patchworks/CMakeLists.txt delete mode 120000 tests/test_patchworks/Mat.hpp delete mode 100644 tests/test_patchworks/README.txt delete mode 120000 tests/test_patchworks/include delete mode 120000 tests/test_patchworks/lbann delete mode 120000 tests/test_patchworks/lbann_config.hpp delete mode 100644 tests/test_patchworks/main.cpp delete mode 100644 tests/test_patchworks/patchworks_image.cpp delete mode 100644 tests/test_patchworks/patchworks_image.hpp delete mode 100644 tests/test_patchworks/patchworks_utils.cpp delete mode 100644 tests/test_patchworks/patchworks_utils.hpp delete mode 120000 tests/test_patchworks/src diff --git a/CMakeLists.txt b/CMakeLists.txt index 72062f30c9a..d689bc0817c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -446,6 +446,8 @@ if (LBANN_WITH_UNIT_TESTING) include(CTest) include(Catch) add_subdirectory(src/utils/unit_test) + add_subdirectory(src/transforms/unit_test) + add_subdirectory(src/transforms/vision/unit_test) # Add this one last add_subdirectory(unit_test) diff --git a/include/lbann/CMakeLists.txt b/include/lbann/CMakeLists.txt index 28123a8350b..3bff6abb1b3 100644 --- a/include/lbann/CMakeLists.txt +++ b/include/lbann/CMakeLists.txt @@ -17,6 +17,7 @@ add_subdirectory(models) add_subdirectory(objective_functions) add_subdirectory(optimizers) add_subdirectory(proto) 
+add_subdirectory(transforms) add_subdirectory(utils) add_subdirectory(weights) diff --git a/include/lbann/data_readers/CMakeLists.txt b/include/lbann/data_readers/CMakeLists.txt index f6d513de63a..aecf829df39 100644 --- a/include/lbann/data_readers/CMakeLists.txt +++ b/include/lbann/data_readers/CMakeLists.txt @@ -1,23 +1,12 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS compound_data_reader.hpp - cv_augmenter.hpp - cv_colorizer.hpp - cv_decolorizer.hpp - cv_cropper.hpp - cv_mean_extractor.hpp - cv_normalizer.hpp - cv_process.hpp - cv_process_patches.hpp - cv_transform.hpp - cv_utils.hpp data_reader.hpp data_reader_ascii.hpp data_reader_cifar10.hpp data_reader_csv.hpp data_reader_image.hpp data_reader_imagenet.hpp - data_reader_imagenet_patches.hpp data_reader_merge_features.hpp data_reader_merge_samples.hpp data_reader_mnist.hpp @@ -28,15 +17,8 @@ set_full_path(THIS_DIR_HEADERS data_reader_pilot2_molecular.hpp data_reader_python.hpp data_reader_synthetic.hpp - image_preprocessor.hpp - image_utils.hpp - opencv.hpp - opencv_extensions.hpp data_reader_multihead_siamese.hpp ) -# Add the subdirectories -add_subdirectory(patchworks) - # Propagate the files up the tree set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/data_readers/cv_augmenter.hpp b/include/lbann/data_readers/cv_augmenter.hpp deleted file mode 100644 index ba584ab18fe..00000000000 --- a/include/lbann/data_readers/cv_augmenter.hpp +++ /dev/null @@ -1,114 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_augmenter .cpp .hpp - Augmenting functions for images in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_AUGMENTER_HPP -#define LBANN_CV_AUGMENTER_HPP - -#include "cv_transform.hpp" -#include -#include -#include -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Supports the following transforms: - * - Random horizontal and vertical flips - * - Random rotations - * - Random horizontal and vertical shifts - * - Random shearing - */ -class cv_augmenter : public cv_transform { - protected: - // --- configuration variables --- - /** Whether to do horizontal flips. */ - bool m_do_horizontal_flip; - /** Whether to do vertical flips. */ - bool m_do_vertical_flip; - - /** Range in degrees for rotations (0-180). */ - float m_rotation_range; - /** Range (fraction of total width) for horizontal shifts. 
*/ - float m_horizontal_shift_range; - /** Range (fraction of total height) for vertical shifts. */ - float m_vertical_shift_range; - /** Shear angle (radians). */ - float m_shear_range; - - // --- state variables --- - /// Flip decision made - cv_flipping m_flip; // currently more of a configuration variable but can easily become a state variable - /// The rest of the affine tranformations determined - cv::Mat_ m_trans; - - /// Check if there is a reason to enable. (i.e., any option set) - bool check_to_enable() const override; - - public: - cv_augmenter(); - cv_augmenter(const cv_augmenter& rhs); - cv_augmenter& operator=(const cv_augmenter& rhs); - cv_augmenter* clone() const override; - - ~cv_augmenter() override {} - - /// Set the parameters all at once - void set(const bool hflip, const bool vflip, const float rot, - const float hshift, const float vshift, const float shear); - - /// Clear the states of the previous transform applied - void reset() override; - - /** - * Construct an affine transformation matrix based on the options and random - * numbers. If successful, the tranform is enabled. If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// Augmentation is irreversible. Thus, this has no effect. - bool determine_inverse_transform() override { return false; } - - /** - * Apply the transformation determined. - * As this method is executed, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "augmenter"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_AUGMENTER_HPP diff --git a/include/lbann/data_readers/cv_colorizer.hpp b/include/lbann/data_readers/cv_colorizer.hpp deleted file mode 100644 index 7d667f9cca5..00000000000 --- a/include/lbann/data_readers/cv_colorizer.hpp +++ /dev/null @@ -1,81 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// cv_colorizer .cpp .hpp - transform a non-color (grayscale) image into a -// 3-channel color image -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_COLORIZER_HPP -#define LBANN_CV_COLORIZER_HPP - -#include "cv_transform.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -class cv_colorizer : public cv_transform { - protected: - // --- state variables --- - bool m_gray; ///< whether an image is monochrome or not - - public: - cv_colorizer() : cv_transform(), m_gray(false) {} - cv_colorizer(const cv_colorizer& rhs); - cv_colorizer& operator=(const cv_colorizer& rhs); - cv_colorizer *clone() const override; - - ~cv_colorizer() override {} - - void set() { reset(); } - void reset() override { - m_enabled = false; - m_gray = false; - } - - /** - * If a given image is in grayscale, the tranform is enabled, and not otherwise. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// convert back to color image if it used to be a grayscale image - bool determine_inverse_transform() override; - - /** - * Apply color conversion if enabled. - * As it is applied, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "colorizer"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_COLORIZER_HPP diff --git a/include/lbann/data_readers/cv_cropper.hpp b/include/lbann/data_readers/cv_cropper.hpp deleted file mode 100644 index 651e7945d5b..00000000000 --- a/include/lbann/data_readers/cv_cropper.hpp +++ /dev/null @@ -1,121 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_cropper .cpp .hpp - Functions to crop images -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_CROPPER_HPP -#define LBANN_CV_CROPPER_HPP - -#include "lbann/data_readers/cv_transform.hpp" -#include -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * If the size of a region of interest (ROI) is defined, use the area at the - * center of a given image. Otherwise, use the entire image. - * Zoom in/out the image if necessary to cover the ROI. 
Then, crop out an area - * of the desired size from the region either randomly within the ROI or at the - * center depending on the given specification. - */ -class cv_cropper : public cv_transform { - protected: - // --- configuration variables --- - unsigned int m_width; ///< desired width of an image - unsigned int m_height; ///< desired height of an image - /// randomize the center position of the area of interest - bool m_rand_crop; - /// indicate if a specific ROI is set or supposed to use whole image - bool m_is_roi_set; - /// The size of the initial region of interest to crop from - std::pair m_roi_size; - - // --- state variables --- - double m_zoom; ///< zoom factor to prepare the initial region for a given image - /** Three modes of pixel interpolation: INTER_LINEAR, INTER_AREA, and INTER_LINEAR - * The first choice is the default when not adaptive. The other two are used when - * interpolatng adaptively. The second is when shrinking, and the third is when enlarging - */ - static const int m_interpolation_choices[3]; - int m_interpolation; ///< id of the channel value interpolation method used - bool m_adaptive_interpolation; ///< whether to use adaptive interpolation - - void unset_roi(); - - public: - cv_cropper(); - cv_cropper(const cv_cropper& rhs) = default; - cv_cropper& operator=(const cv_cropper& rhs) = default; - cv_cropper *clone() const override; - ~cv_cropper() override {} - - /** - * Set the parameters all at once - * @param width desired width of the crop - * @param height desired height of the crop - * @param random_crop whether to crop randomly from the initial region of interest or at the center - * @param roi the size of the initial region of interest to crop from. Set (0,0) to use the full image. - * @param adaptive_interpolation whether to apply a different interpolation method depending on how an image is resized - */ - void set(const unsigned int width, const unsigned int height, - const bool random_crop = false, - const std::pair& roi = std::make_pair(0,0), - const bool adaptive_interpolation = false); - - unsigned int get_crop_width() const { return m_width; } - unsigned int get_crop_height() const { return m_height; } - - /// Clear the states of the previous transform applied - void reset() override; - - /** - * Construct transformation parameters based on the options and random - * numbers. If successful, the tranform is enabled.If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// Cropping is irreversible. Thus, this has no effect. - bool determine_inverse_transform() override { return false; } - - /** - * Apply the transformation determined. - * As this method is executed, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "cropper"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_CROPPER_HPP diff --git a/include/lbann/data_readers/cv_decolorizer.hpp b/include/lbann/data_readers/cv_decolorizer.hpp deleted file mode 100644 index 18e09aea0cf..00000000000 --- a/include/lbann/data_readers/cv_decolorizer.hpp +++ /dev/null @@ -1,84 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. 
-// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_decolorizer .cpp .hpp - transform a color image into a single-channel -// monochrome image -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_DECOLORIZER_HPP -#define LBANN_CV_DECOLORIZER_HPP - -#include "lbann_config.hpp" -#include "cv_transform.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -class cv_decolorizer : public cv_transform { - protected: - // --- state variables --- - bool m_color; ///< whether an image is color or not - /// Method to used: either pick one channel, or mix BGR channels (default) - bool m_pick_1ch; - - public: - cv_decolorizer() : cv_transform(), m_color(false), m_pick_1ch(false) {} - cv_decolorizer(const cv_decolorizer& rhs); - cv_decolorizer& operator=(const cv_decolorizer& rhs); - cv_decolorizer *clone() const override; - - ~cv_decolorizer() override {} - - void set(const bool pick_1ch); - void reset() override { - m_enabled = false; - m_color = false; - } - - /** - * If a given image is in color, the tranform is enabled, and not otherwise. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /// The decolorizing transform is irreversible. Thus, this has no effect. - bool determine_inverse_transform() override { return false; } - - /** - * Convert a color image to a monochrome image if enabled. - * As it is applied, the transform becomes deactivated. - * @return false if not successful. - */ - bool apply(cv::Mat& image) override; - - std::string get_type() const override { return "decolorizer"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_DECOLORIZER_HPP diff --git a/include/lbann/data_readers/cv_mean_extractor.hpp b/include/lbann/data_readers/cv_mean_extractor.hpp deleted file mode 100644 index eef53a0afa5..00000000000 --- a/include/lbann/data_readers/cv_mean_extractor.hpp +++ /dev/null @@ -1,157 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. 
-// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_mean_extractor .cpp .hpp - accumulate mean over the image set -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_MEAN_EXTRACTOR_HPP -#define LBANN_CV_MEAN_EXTRACTOR_HPP - -#include "cv_transform.hpp" -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Computes a cumulative pixel-wise average of a stream of images. - * It is assumed that the images have the same size and the same number of - * channels. However, they are not required to have the same channel depth. - * If a channel value is an integral type, it is normalized to a floating - * point number of type Float_T between 0 and 1 (inclusive at both ends). - * If a channel value is already in a floating point type, the value is used - * without normalization. - * Images accumulate per pixel and a mean image is obtained by dividing each - * pixel accumulation by the total number of images (if m_batch_size is larger - * than the number of all the images observed). The current mean of images can - * be obtained at any point during the operation by the member function - * extract(). This returns the image normalized to the range of - * channel type, Channel_T. For example, if Channel_T is uint8_t, the range of - * mean values from 0.0 to 1.0 maps to the range from 0 to 256. - * To cope with a large number of images, one might rely on semi-moving average - * method. Up to m_batch_size number of images accumulate aa a batch while the - * moving average of batches is computed upon request by calling extract(). - * This is particularly useful when Float_T is single precision with a limited - * number of bits to represent a wide range of numbers and the images have a - * large bit depth. 
- */ -class cv_mean_extractor : public cv_transform { - public: - /// type of image statistics value accumulated - using Float_T = double; - static const unsigned int m_default_batch_size = 65536u; - - protected: - // --- configuration variables --- - unsigned int m_batch_size; ///< number of samples per batch - - // --- state variables --- - unsigned int m_batch_cnt; ///< number of complete batches - unsigned int m_partial_cnt; ///< number of samples currently contributing towards a batch - /// OpenCv type code used to create m_sum and m_avg based on Float_T and the number of channels - int m_type_code; - cv::Mat m_sum; ///< partial batch accumulated so far - cv::Mat m_avg; ///< cumulative moving average - - /// create the matrices for accumulating image statistics - void create_matrices(const unsigned int width, const unsigned int height, const unsigned int n_ch); - - public: - cv_mean_extractor(); - cv_mean_extractor(const cv_mean_extractor& rhs); - cv_mean_extractor& operator=(const cv_mean_extractor& rhs); - cv_mean_extractor *clone() const override; - - ~cv_mean_extractor() override {} - - void set(const unsigned int width, const unsigned int height, const unsigned int n_ch, - const unsigned int batch_sz = cv_mean_extractor::m_default_batch_size); - void set(const unsigned int batch_sz); - void reset() override; - - bool determine_transform(const cv::Mat& image) override; - /// The transform does not modify the image. Thus, this has no effect. - bool determine_inverse_transform() override; - bool apply(cv::Mat& image) override; - - template - cv::Mat extract() const; - - std::string get_type() const override { return "mean extractor"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - -/** - * Convert the maxtrix representing the cumulative moving average of images - * observed so far into an image with the channel type 'Channel_T'. The default - * is uint8_t. If it is given as void, the matrix is returned as is. - */ -template -inline cv::Mat cv_mean_extractor::extract() const { - cv::Mat avg_so_far; - if (m_partial_cnt == 0u) { - avg_so_far = m_avg; - } else { - cv::addWeighted(m_avg, m_batch_cnt/static_cast(m_batch_cnt+1), - m_sum, 1/static_cast((m_batch_cnt + 1) * m_partial_cnt), - 0.0, avg_so_far, m_type_code); - } - - if (avg_so_far.empty()) return cv::Mat(); - - if (std::is_void::value) return avg_so_far; - - double minVal = 0.0; - double maxVal = 0.0; - cv::minMaxLoc(avg_so_far, &minVal, &maxVal, nullptr, nullptr); - //const double max_channel_type = std::numeric_limits::max(); - const double max_channel_type = depth_normalization::inverse_factor(); - - cv::Mat recovered; - if ((minVal < 0.0) || (maxVal > 1.0)) { - // This condition may rise either because of unnormalized images with raw - // floating point values or because of precision error. 
In these cases, - // the minimum value maps to 0 and the maximum value maps to the greatest - // value of Channel_T - const double range = maxVal-minVal; - if (range == 0.0) return cv::Mat(); - const double alpha = max_channel_type/range; - const double beta = - alpha*minVal; - avg_so_far.convertTo(recovered, cv_image_type::T(), - alpha, beta); - } else { - // In this case, 0 maps to 0, and 1 maps to the greatest value of Channel_T - avg_so_far.convertTo(recovered, cv_image_type::T(), - max_channel_type, 0.0); - } - - return recovered; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_MEAN_EXTRACTOR_HPP diff --git a/include/lbann/data_readers/cv_normalizer.hpp b/include/lbann/data_readers/cv_normalizer.hpp deleted file mode 100644 index dfaf2954f89..00000000000 --- a/include/lbann/data_readers/cv_normalizer.hpp +++ /dev/null @@ -1,399 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_cv_normalizer .cpp .hpp - Normalizing functions for images -// in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_NORMALIZER_HPP -#define LBANN_CV_NORMALIZER_HPP - -#include // typeid -#include "cv_transform.hpp" -#include "lbann/base.hpp" // DataType -#include "lbann/utils/mild_exception.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { -/** - * Modifies the channel values of each pixel according to the chosen normalization - * strategies: - * - Standardize to 0 mean - * - Standardize to unit variance - * - Scale to the range [0, 1] - * - Normalize via z-score - * - * Combine these strategies into a single per-pixel linear transform, and - * process them all at once. - * It tries to replace the values in place if possible, rather - * than creating a new copy of data, especially, if the channel data type of - * source image is the same as that of the resultant image. - */ -class cv_normalizer : public cv_transform { - public: - /** This is the interim type of input values computed from image data - * It does not have to be the same as the type of the values stored, i.e., DataType. - */ - using ComputeType = DataType; - //using ComputeType = double; - /** - * Define the type of normalization methods available. 
- * z-score method is essentially the combination of mean subtraction and unit variance - */ - enum normalization_type {_none=0, _u_scale=1, _mean_sub=2, _unit_var=4, _z_score=6}; - using channel_trans_t = std::pair; - - protected: - // --- configuration variables --- - /// Whether to normalize to 0 mean. - bool m_mean_subtraction; - /// Whether to normalize to unit variance. - bool m_unit_variance; - /// Whether to scale to [0, 1]. - bool m_unit_scale; - /// Whether to normalize via z-score. - bool m_z_score; - - - // --- state variables --- - /** - * The parameter to use for linearly transforming channel values of each pixel as: - * new_value[ch] = cv::saturate_cast(m_trans[ch].first*value[ch] + m_trans[ch].second) - */ - std::vector m_trans; - - - /// Set a normalization bit flag - normalization_type set_normalization_bits(const normalization_type ntype, const normalization_type flag) const { - return static_cast(static_cast(ntype) | static_cast(flag)); - } - - /// Mask normalization bits - normalization_type mask_normalization_bits(const normalization_type ntype, const normalization_type flag) const { - return static_cast(static_cast(ntype) & static_cast(flag)); - } - - /// Enable a particular normalization method - normalization_type& set_normalization_type(normalization_type& ntype, const normalization_type flag) const; - - /// Check if there is a reason to enable. (i.e., any option set) - bool check_to_enable() const override; - - public: - - cv_normalizer(); - cv_normalizer(const cv_normalizer& rhs); - cv_normalizer& operator=(const cv_normalizer& rhs); - cv_normalizer *clone() const override; - - ~cv_normalizer() override {} - - /// Set the parameters all at once - void set(const bool meansub, const bool unitvar, const bool unitscale, const bool zscore); - - /// Whether to subtract the per-channel and per-sample mean. - void subtract_mean(bool b) { - m_mean_subtraction = b; - } - /// Whether to normalize to unit variance, per-channel and per-sample. - void unit_variance(bool b) { - m_unit_variance = b; - } - /// Whether to scale to [0, 1] - void unit_scale(bool b) { - m_unit_scale = b; - } - /// Whether to normalize by z-scores, per-channel and per-sample. - void z_score(bool b) { - m_z_score = b; - } - - /// Set a pre-determined normalization transform. - void set_transform(const std::vector& t); - - /// Clear the states of the previous transform applied - void reset() override; - - /// Returns the channel-wise scaling parameter for normalization transform - std::vector transform() const { - return (m_enabled? m_trans : std::vector()); - } - - /** - * Combine the normalizations enabled and define a linear transform - * per pixel to address them all. If successful, the tranform is enabled. - * If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_transform(const cv::Mat& image) override; - - /** - * Reverse the normalization done as x' = alpha*x + beta by - * x = (x'- beta)/alpha - * If successful, the tranform is enabled. If not, it is disabled. - * @return false if not enabled or unsuccessful. - */ - bool determine_inverse_transform() override; - - /** - * Apply the normalization defined as a linear tranform per pixel. - * As this method is executed, the transform becomes deactivated. - * @return false if not successful. 
- */ - bool apply(cv::Mat& image) override; - - // utilities - template - static OutputIterator scale(InputIterator first, InputIterator last, OutputIterator result, - const std::vector trans); - - template - static bool scale_with_known_type(cv::Mat& image, const std::vector& trans); - - /** - * Scale an image using a set of parameters for linearly transforming channel - * values per pixel. - * The resultant image will contain channel values of LBANN's DataType. - */ - static bool scale(cv::Mat& image, const std::vector& trans); - - - template - static bool compute_mean_stddev_with_known_type(const cv::Mat& image, - std::vector& mean, std::vector& stddev, cv::InputArray mask); - - /// Compute the per-channel and per-sample mean and standard deviation - static bool compute_mean_stddev(const cv::Mat& image, - std::vector& mean, std::vector& stddev, - cv::InputArray mask=cv::noArray()); - - std::string get_type() const override { return "normalizer"; } - std::string get_description() const override; - std::ostream& print(std::ostream& os) const override; -}; - - -/** - * Linearly transform each value while copying it from one sequential container - * to another, which may be the same container if the type of the initial value - * and that of the result are the same. - * The transformation is alpha[ch]*input[ch] + beta[ch] -> output[ch] - * @param first The beginning of the input interator - * @param last The last of the input iterator - * @param result The beginning of the output iterator - * @param trans Parameters for linearly transforming channel values per pixel - * @return the last of output iterator - */ -template -inline OutputIterator cv_normalizer::scale( - InputIterator first, InputIterator last, OutputIterator result, - const std::vector trans) { - const size_t NCh = trans.size(); - bool trivial_alpha = true; - bool trivial_beta = true; - - for (size_t ch=0u; ch < NCh; ++ch) { - trivial_alpha = trivial_alpha && (trans[ch].first == 1.0); - trivial_beta = trivial_beta && (trans[ch].second == 0.0); - } - - if (trivial_alpha && trivial_beta) { - if ((typeid(*first) == typeid(*result)) && - (reinterpret_cast(&(*first)) == - reinterpret_cast(&(*result)))) - // This way, it works both for iterator and for pointer - { - std::advance(result, std::distance(first,last)); - return result; - } else { - return std::copy(first, last, result); - } - } - - using T = typename std::iterator_traits::value_type; - - // At this point NCh should not be zero because both alpha and beta are not trivial. - if (NCh == 1) { - const ComputeType a = trans[0].first; - const ComputeType b = trans[0].second; - - while (first != last) { - *result = cv::saturate_cast(a * (*first) + b); - ++result; - ++first; - } - } else { - size_t ch = 0u; - - while (first != last) { - *result = cv::saturate_cast(trans[ch].first * (*first) + trans[ch].second); - ++result; - ++first; - ++ch; - ch = (ch % NCh); - } - } - return result; -} - - -/** - * Linear transform image pixels by scaling parameters given for each channel - * The transformation is trans[ch].first*input[ch] + trans[ch].second -> output[ch]. - * The first template parameter is the channel value type of the input image. - * The second one is the channel value type desired for the output image. - * - * @param image The image to be modified, which is the input and also the ouput. - * @param trans Parameters for linearly transforming channel values per pixel - * @return true if successful. The input image will be modified to a new one. 
- */ -template -inline bool cv_normalizer::scale_with_known_type(cv::Mat& image, - const std::vector& trans) { - const auto Width = static_cast(image.cols); - const auto Height = static_cast(image.rows); - const auto NCh = static_cast(image.channels()); - if ((trans.size() > 0u) && (trans.size() != NCh)) { - return false; - } - - - // overwrite the storage of the source image if the source and the result have - // the same data type. Otherwise, create a new image for the result. The result - // will replace the image referenced by the input. - if (std::is_same::value) { - if (image.isContinuous()) { - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - reinterpret_cast(image.data), trans); - } else { - // TODO: Should we make this to copy to a new continuous block instead of - // updating the values in-place? - const unsigned int stride = Width*NCh; - for (unsigned int i = 0u; i < Height; ++i) { - auto *optr = reinterpret_cast(image.ptr(i)); - const Tsrc *iptr = optr; - scale(iptr, iptr+stride, optr, trans); - } - } - } else { - cv::Mat image_out = cv::Mat(Height, Width, CV_MAKETYPE(cv::DataType::depth, NCh)); - - if (image.isContinuous()) { - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - reinterpret_cast(image_out.data), trans); - } else { - const unsigned int stride = Width*NCh; - auto *ptr_out = reinterpret_cast(image_out.data); - for (unsigned int i = 0u; i < Height; ++i, ptr_out += stride) { - const Tsrc *ptr = reinterpret_cast(image.ptr(i)); - scale(ptr, ptr+stride, ptr_out, trans); - } - } - image = image_out; - } - return true; -} - - -/** - * Compute the per-channel and per-sample mean and standard deviation - * for a sample image of channel value type T - */ -template -inline bool cv_normalizer::compute_mean_stddev_with_known_type(const cv::Mat& image, - std::vector& mean, std::vector& stddev, cv::InputArray mask) { - mean.clear(); - stddev.clear(); - if (image.empty()) { - return false; - } - - const int NCh = image.channels(); - const int num_pixels = image.rows * image.cols; - ComputeType sum[NCh]; - ComputeType sqsum[NCh]; - ComputeType shift[NCh]; - - for (int ch = 0; ch < NCh; ++ch) { - sum[ch] = 0.0; - sqsum[ch] = 0.0; - const auto *ptr = reinterpret_cast(image.datastart); - shift[ch] = static_cast(*(ptr+ch)); - } - - mean.resize(NCh); - stddev.resize(NCh); - - if (image.isContinuous()) { - const auto *ptr = reinterpret_cast(image.datastart); - const auto *const ptrend = reinterpret_cast(image.dataend); - - int ch = 0; - do { - const ComputeType diff = (*ptr - shift[ch]); - sum[ch] += diff; - sqsum[ch] += diff*diff; - ++ch; - ch = ch % NCh; - } while ((++ptr) != ptrend); - - for (int c = 0; c < NCh; ++c) { - const ComputeType shifted_mean = sum[c] / num_pixels; - mean[c] = shifted_mean + shift[c]; - stddev[c] = sqrt(std::max(sqsum[c]/num_pixels - shifted_mean * shifted_mean, ComputeType(0))); - } - } else { - const int stride = image.cols*NCh; - const int Height = image.rows; - - for (int i = 0; i < Height; ++i) { - const auto *ptr = reinterpret_cast(image.ptr(i)); - const T *const ptrend = ptr + stride; - - int ch = 0; - do { - const ComputeType diff = (*ptr - shift[ch]); - sum[ch] += diff; - sqsum[ch] += diff*diff; - ++ch; - ch = ch % NCh; - } while ((++ptr) != ptrend); - } - - for (int ch = 0; ch < NCh; ++ch) { - const ComputeType shifted_mean = sum[ch] / num_pixels; - mean[ch] = shifted_mean + shift[ch]; - stddev[ch] = sqrt(std::max(sqsum[ch]/num_pixels - shifted_mean*shifted_mean, ComputeType(0))); - } - } - 
return true; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_NORMALIZER_HPP diff --git a/include/lbann/data_readers/cv_process.hpp b/include/lbann/data_readers/cv_process.hpp deleted file mode 100644 index ffc315016a4..00000000000 --- a/include/lbann/data_readers/cv_process.hpp +++ /dev/null @@ -1,166 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_process .cpp .hpp - structure that defines the operations -// on image data in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_PROCESS_HPP -#define LBANN_CV_PROCESS_HPP - -#include "cv_transform.hpp" -#include "cv_normalizer.hpp" -#include "cv_subtractor.hpp" -#include "cv_augmenter.hpp" -#include "cv_colorizer.hpp" -#include "cv_decolorizer.hpp" -#include "cv_cropper.hpp" -#include "cv_resizer.hpp" -#include "cv_mean_extractor.hpp" -#include -#include // std::numeric_limits - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** A structure packs the parameters for image pre-/post-processing that takes - * advantage of the OpenCV framework. 
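- *
- * A hypothetical composition sketch (editor's illustration; the 224x224
- * target size is an assumed parameter, not taken from this patch):
- * @code
- * cv_process pp;
- * auto resizer = std::make_unique<cv_resizer>();
- * resizer->set(224, 224); // cv_resizer is declared in cv_resizer.hpp below
- * pp.add_transform(std::move(resizer));
- * pp.add_normalizer(std::make_unique<cv_normalizer>());
- * pp.preprocess(image); // applies every registered transform in order
- * @endcode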
- */
-class cv_process {
- /// OpenCV flip codes: c<0 for top_left <-> bottom_right, c=0 for top<->down, and c>0 for left<->right
-
- protected:
- /// unique name for the processor
- std::string m_name;
- /// Whether to flip an image
- cv_transform::cv_flipping m_flip;
- /// Whether to split channels
- bool m_split;
- /// whether a normalizing transform is set or not
- bool m_is_normalizer_set;
- /// The index of the normalizing transform in the array of transforms
- unsigned int m_normalizer_idx;
-
- /// Array of transforms
- std::vector > m_transforms;
-
- /// Check if the last transform registered in the list is a normalizer and not a subtractor
- bool to_fuse_normalizer_with_copy() const;
-
- void set_normalizer_info();
-
- public:
- cv_process()
- : m_flip(cv_transform::_no_flip_), m_split(true), m_is_normalizer_set(false), m_normalizer_idx(0u) {}
-
- cv_process(const cv_process& rhs);
- cv_process& operator=(const cv_process& rhs);
-
- cv_process(const cv_transform::cv_flipping flip_code, const bool tosplit)
- : m_flip(flip_code), m_split(tosplit), m_is_normalizer_set(false), m_normalizer_idx(0u) {}
-
- virtual ~cv_process() {}
-
- std::string get_name() const { return m_name; }
- void set_name(const std::string& name) { m_name = name; }
-
- /// Reset all the transforms
- void reset();
-
- /// Check whether to flip
- bool to_flip() const {
- return (m_flip != cv_transform::_no_flip_);
- }
- /// Tell how to flip
- int how_to_flip() const {
- return static_cast(m_flip);
- }
- /**
- * Set the flipping behavior. This is to deal with a custom image format that
- * is not supported by OpenCV's builtin decoders and may impose a different
- * pixel coordinate system in its custom decoder.
- * It is not a substitute for random flipping in augmentation.
- */
- void set_to_flip(const cv_transform::cv_flipping f) {
- m_flip = f;
- }
- /// Check whether to split channels
- bool to_split() const {
- return m_split;
- }
-
- /// Export transform operator of normalizer to allow lazy application
- std::vector get_transform_normalize() const;
- /// Export transform operator of normalizer for a specific channel
- std::vector get_transform_normalize(const unsigned int ch) const;
-
- /// Turn off normalizer. This is useful to make sure it is off after potential lazy application
- void disable_lazy_normalizer();
-
- /// Turn off all transforms
- void disable_transforms();
-
- /// Add a transform
- bool add_transform(std::unique_ptr tr);
-
- /// Add a normalizing transform
- bool add_normalizer(std::unique_ptr tr);
- bool add_normalizer(std::unique_ptr tr);
-
- /// Allow access to the list of transforms registered
- const std::vector >& get_transforms() const {
- return m_transforms;
- }
-
- /// Allow read-only access to a particular transform indexed by idx
- const cv_transform* get_transform(const unsigned int idx) const;
-
- /// Allow read-write access to a particular transform indexed by idx
- cv_transform* get_transform(const unsigned int idx);
-
- /// Return the number of transforms registered
- unsigned int get_num_transforms() const { return m_transforms.size(); }
-
- /** Return the final image dimension {width, height} after all the transforms.
- * If a cropper is set, returns {crop_width, crop_height}. Otherwise, {0,0}.
- */ - std::vector get_data_dims() const; - - void determine_inverse_lazy_normalization(); - - /// Execute a range of transforms [tr_strart, tr_end) on the given image in order - bool preprocess(cv::Mat& image, unsigned int tr_start = 0u, - unsigned int tr_end = std::numeric_limits::max()); - /// Execute all the inverse transforms on the given image in the reverse order - bool postprocess(cv::Mat& image); - - virtual std::string get_type() const { return "cv_process"; } - virtual std::string get_description() const; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_PROCESS_HPP diff --git a/include/lbann/data_readers/cv_process_patches.hpp b/include/lbann/data_readers/cv_process_patches.hpp deleted file mode 100644 index b9c52ff955a..00000000000 --- a/include/lbann/data_readers/cv_process_patches.hpp +++ /dev/null @@ -1,83 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// cv_process_patches .cpp .hpp - structure that defines the operations -// on patches extracted from an image in the opencv format -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_PROCESS_PATCHES_HPP -#define LBANN_CV_PROCESS_PATCHES_HPP - -#include "cv_process.hpp" -#include "patchworks/patchworks_patch_descriptor.hpp" -#include // std::numeric_limits - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/// Similar to cv_process but works on patches that are extracted from an image -class cv_process_patches : public cv_process { - protected: - patchworks::patch_descriptor m_pd; - bool m_self_label; - unsigned int m_when_to_extract; - - public: - cv_process_patches(); - cv_process_patches(const bool self_label); - cv_process_patches(const cv_process_patches& rhs); - cv_process_patches(const cv_transform::cv_flipping flip_code, const bool tosplit); - cv_process_patches& operator=(const cv_process_patches& rhs); - - ~cv_process_patches() override {} - - void set_patch_descriptor(const patchworks::patch_descriptor& pd, - const unsigned int when_to_extract = - std::numeric_limits::max()); - patchworks::patch_descriptor& patch_descriptor() { - return m_pd; - } - const patchworks::patch_descriptor& patch_descriptor() const { - return m_pd; - } - unsigned int get_when_to_extract() const { return m_when_to_extract; } - bool is_self_labeling() const { return m_self_label; } - unsigned int get_num_labels() const { return m_pd.get_num_labels(); } - virtual unsigned int get_patch_label() const { return m_pd.get_last_label(); } - unsigned int get_num_patches() const { return m_pd.get_num_patches(); } - std::vector get_data_dims() const { - return {m_pd.get_num_patches(), m_pd.get_patch_width(), m_pd.get_patch_height()}; - } - - bool preprocess(cv::Mat& image, std::vector& patches); - - std::string get_type() const override { return "cv_process_patches"; } - std::string get_description() const override; -}; - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_PROCESS_PATCHES_HPP diff --git a/include/lbann/data_readers/cv_resizer.hpp b/include/lbann/data_readers/cv_resizer.hpp deleted file mode 100644 index 69555897d2c..00000000000 --- a/include/lbann/data_readers/cv_resizer.hpp +++ /dev/null @@ -1,103 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-//
-// cv_resizer .cpp .hpp - Functions to resize images
-////////////////////////////////////////////////////////////////////////////////
-
-#ifndef LBANN_CV_RESIZER_HPP
-#define LBANN_CV_RESIZER_HPP
-
-#include "lbann/data_readers/cv_transform.hpp"
-#include
-#include
-
-#ifdef LBANN_HAS_OPENCV
-namespace lbann {
-
-/**
- * Simple image resizing without maintaining the aspect ratio.
- */
-class cv_resizer : public cv_transform {
- protected:
- // --- configuration variables ---
- unsigned int m_width; ///< desired width of an image
- unsigned int m_height; ///< desired height of an image
-
- // --- state variables ---
- /** Three modes of pixel interpolation: INTER_LINEAR, INTER_AREA, and INTER_CUBIC.
- * The first choice is the default when not adaptive. The other two are used when
- * interpolating adaptively. The second is used when shrinking, and the third when enlarging.
- */
- static const int m_interpolation_choices[3];
- int m_interpolation; ///< id of the channel value interpolation method used
- bool m_adaptive_interpolation; ///< whether to use adaptive interpolation
-
- public:
- cv_resizer();
- cv_resizer(const cv_resizer& rhs) = default;
- cv_resizer& operator=(const cv_resizer& rhs) = default;
- cv_resizer *clone() const override;
- ~cv_resizer() override {}
-
- /**
- * Set the parameters all at once
- * @param width desired width
- * @param height desired height
- * @param adaptive_interpolation whether to apply a different interpolation method depending on how an image is resized
- */
- void set(const unsigned int width, const unsigned int height,
- const bool adaptive_interpolation = false);
-
- unsigned int get_width() const { return m_width; }
- unsigned int get_height() const { return m_height; }
-
- /// Clear the states of the previous transform applied
- void reset() override;
-
- /**
- * Determine whether to enable the transformation.
- * @return false if not enabled.
- */
- bool determine_transform(const cv::Mat& image) override;
-
- /// Determine whether to enable inverse transformation.
- bool determine_inverse_transform() override { return false; }
-
- /**
- * Apply the transformation.
- * As this method is executed, the transform becomes deactivated.
- * @return false if not successful.
- */
- bool apply(cv::Mat& image) override;
-
- std::string get_type() const override { return "resizer"; }
- std::string get_description() const override;
- std::ostream& print(std::ostream& os) const override;
-};
-
-} // end of namespace lbann
-#endif // LBANN_HAS_OPENCV
-
-#endif // LBANN_CV_RESIZER_HPP
diff --git a/include/lbann/data_readers/cv_subtractor.hpp b/include/lbann/data_readers/cv_subtractor.hpp
deleted file mode 100644
index 169181c4576..00000000000
--- a/include/lbann/data_readers/cv_subtractor.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
-// Produced at the Lawrence Livermore National Laboratory.
-// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
-// the CONTRIBUTORS file.
-//
-// LLNL-CODE-697807.
-// All rights reserved.
-//
-// This file is part of LBANN: Livermore Big Artificial Neural Network
-// Toolkit. For details, see http://software.llnl.gov/LBANN or
-// https://github.com/LLNL/LBANN.
-//
-// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
-// may not use this file except in compliance with the License.
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_subtractor .cpp .hpp - subtract channel values of an image (possibly the -// pixel-wise mean of dataset) from the corresponding values of another (input) -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CV_SUBTRACTOR_HPP -#define LBANN_CV_SUBTRACTOR_HPP - -#include "cv_transform.hpp" -#include "lbann/base.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Subtract channel values of an image from the corresponding values of another. - * The former is likely to carry pre-computed mean data per pixel and per channel. - * The latter is an input image. Both image needs to have the same size and the - * same number of channels. The subtracted result is represented in the scale - * between 0 and 1 (both inclusive). - * In the common current use case, a colorizer comes before a subtractor which is - * followed by a random cropper. In this scenario, the input images must be resized - * in advance to match the size of the mean image. - * In another scenario, where the random cropping is not used but resizing is done - * on-line, the subtractor can come after cropper without requiring the input images - * to be resized in advance. - * Alternatively, even a simpler approach is to use a mean image with uniform pixels. - * In this way, it does not need to know the size of input images, and is not impacted - * by random cropping or flipping augmentation. - */ -class cv_subtractor : public cv_transform { - protected: - // --- configuration variables --- - /** - * The image to subtract from an input image in the pixel-wise fashion. - * It has channel values of a floating point type, in the scale from 0 to 1. - * An input image will be mapped into the scale before subtraction by linearly - * mapping the smallest representative value to 0 and the largest representative - * value to 1. - */ - cv::Mat m_img_to_sub; - - /** - * The image to divide an input image in the pixel-wise fashion. - * It has channel values of a floating point type, in the scale from 0 to 1. - * An input image will be mapped into the scale before division. - */ - cv::Mat m_img_to_div; - - /** uniform mean per channel used for channel-wise mean-subtraction. - * This is used to construct the m_img_to_sub when the size of the image is known. - */ - std::vector m_channel_mean; - - /** uniform standard deviation per channel used for channel-wise z-score (division). - * This is used to construct the m_img_to_div when the size of the image is known. - */ - std::vector m_channel_stddev; - - // --- state variables --- - bool m_applied; ///< has been subtracted - - public: - cv_subtractor() : cv_transform(), m_applied(false) {} - cv_subtractor(const cv_subtractor& rhs); - cv_subtractor& operator=(const cv_subtractor& rhs); - cv_subtractor *clone() const override; - - ~cv_subtractor() override {} - - static cv::Mat read_binary_image_file(const std::string filename); - - /// Load and set the image to subtract from every input image. 
- void set_mean(const std::string name_of_img, const int depth_code = cv_image_type::T());
-
- /**
- * Set the mean fixed per channel for mean-subtracting each input image.
- * This supports an alternative method for mean subtraction given that the
- * mean per channel is uniform.
- */
- void set_mean(const std::vector channel_mean);
-
- /**
- * Set the dataset-wise mean image to subtract from each input image.
- * The image represents the pre-computed pixel-wise mean of the dataset.
- * If this image is not of a floating point type, it is converted to
- * one with the depth specified by depth_code.
- */
- void set_mean(const cv::Mat& img, const int depth_code = cv_image_type::T());
-
- /// Load and set the image to normalize the pixels of every input image.
- void set_stddev(const std::string name_of_img, const int depth_code = cv_image_type::T());
-
- /**
- * Set the dataset-wise standard deviation fixed per channel for normalizing
- * each input image.
- * This supports an alternative method for normalizing with stddev given that
- * it is uniform per channel.
- */
- void set_stddev(const std::vector channel_stddev);
-
- /**
- * Set the dataset-wise standard deviation to normalize each input image.
- * If this image is not of a floating point type, it is converted to
- * one with the depth specified by depth_code.
- */
- void set_stddev(const cv::Mat& img, const int depth_code = cv_image_type::T());
-
- void reset() override {
- m_enabled = false;
- m_applied = false;
- }
-
- /**
- * Determine whether to enable the transform for the given image.
- * @return false if not enabled or unsuccessful.
- */
- bool determine_transform(const cv::Mat& image) override;
-
- /// Determine whether to enable the inverse transform (i.e., undoing the subtraction)
- bool determine_inverse_transform() override;
-
- /**
- * Apply the subtraction (and division, if set) if enabled.
- * As it is applied, the transform becomes deactivated.
- * @return false if not successful.
- */
- bool apply(cv::Mat& image) override;
-
- /// true if both sub and div are channel-wise
- bool check_if_channel_wise() const;
-
- std::string get_type() const override { return "subtractor"; }
- std::string get_description() const override;
- std::ostream& print(std::ostream& os) const override;
-
- protected:
- /// Construct an image of the uniform channel values using the channel-wise mean.
- bool create_img_to_sub(int width, int height, int n_channels);
- /// Construct an image of the uniform channel values using the channel-wise stddev.
- bool create_img_to_div(int width, int height, int n_channels);
-};
-
-} // end of namespace lbann
-#endif // LBANN_HAS_OPENCV
-
-#endif // LBANN_CV_SUBTRACTOR_HPP
diff --git a/include/lbann/data_readers/cv_transform.hpp b/include/lbann/data_readers/cv_transform.hpp
deleted file mode 100644
index 72455fc8907..00000000000
--- a/include/lbann/data_readers/cv_transform.hpp
+++ /dev/null
@@ -1,221 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
-// Produced at the Lawrence Livermore National Laboratory.
-// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
-// the CONTRIBUTORS file.
-//
-// LLNL-CODE-697807.
-// All rights reserved.
-//
-// This file is part of LBANN: Livermore Big Artificial Neural Network
-// Toolkit. For details, see http://software.llnl.gov/LBANN or
-// https://github.com/LLNL/LBANN.
-//
-// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
-// may not use this file except in compliance with the License. You may
-// obtain a copy of the License at:
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-// implied. See the License for the specific language governing
-// permissions and limitations under the license.
-//
-// cv_transform .cpp .hpp - base class for the transformation
-// on image data in opencv format
-////////////////////////////////////////////////////////////////////////////////
-
-#ifndef LBANN_CV_TRANSFORM_HPP
-#define LBANN_CV_TRANSFORM_HPP
-
-#include "opencv.hpp"
-#include "opencv_extensions.hpp"
-
-#ifdef LBANN_HAS_OPENCV
-namespace lbann {
-
-class cv_transform {
- protected:
- // --- configuration variables ---
- // place for the variables to keep the configuration set during initialization
-
- std::string m_name;
-
- // --- state variables ---
- /// per-image indicator of whether to apply transform or not
- bool m_enabled;
-
- // transform prepared given the configuration (and the image)
- // m_trans;
-
- // Allows manually shutting the transform off without destroying it
- //bool m_manual_switch;
-
- /** Check if transform is configured to apply.
- * (e.g., if any of the augmentation methods is enabled)
- */
- virtual bool check_to_enable() const {
- return true;
- }
-
- public:
- enum cv_flipping {_both_axes_=-1, _vertical_=0, _horizontal_=1, _no_flip_=2};
- static const constexpr char* const cv_flip_desc[] = {"both_axes", "vertical", "horizontal", "none"};
- static std::string flip_desc(const cv_flipping flip_code) { return std::string(cv_flip_desc[static_cast(flip_code)+1]); }
-
- static const float pi;
-
-
- cv_transform();
- cv_transform(const cv_transform& rhs);
- cv_transform& operator=(const cv_transform& rhs);
- virtual cv_transform *clone() const;
-
- virtual ~cv_transform() {}
-
- // define a method to configure the transform
- // void set(args) { reset(); ... }
- /// Reset the transform state but do not alter the configuration variables
- virtual void reset() {
- m_enabled = false;
- // e.g., m_trans.clear();
- }
-
- virtual bool determine_transform(const cv::Mat& image);
- virtual bool determine_inverse_transform();
- virtual bool apply(cv::Mat& image) = 0;
-
- /// Turn transform on
- void enable() {
- m_enabled = true;
- }
- /// Turn transform off
- void disable() {
- m_enabled = false;
- }
- /// Check if transform is on
- bool is_enabled() const {
- return m_enabled;
- }
-
- //bool toggle_manual_switch() { return (m_manual_switch = !m_manual_switch); }
-
- // administrative methods
- /** Return this transform's type, e.g.: "augmenter," "normalizer," etc. */
- virtual std::string get_type() const = 0;
-
- /// Returns this transform's name
- std::string get_name() const { return m_name; }
-
- /** Sets this transform's name; this is an arbitrary string, e.g., assigned in a prototext file.
- */
- void set_name(const std::string& name) { m_name = name; }
-
- /** Returns a description of the parameters passed to the ctor */
- virtual std::string get_description() const;
-
- virtual std::ostream& print(std::ostream& os) const;
-};
-
-/// Default constructor
-inline cv_transform::cv_transform()
- : m_name(""), m_enabled(false)//, m_manual_switch(false)
-{}
-
-/// Deep-copying constructor
-inline cv_transform::cv_transform(const cv_transform& rhs)
- : m_name(rhs.m_name), m_enabled(rhs.m_enabled) {}
-
-/// Assignment operator. Deep-copies everything.
-inline cv_transform& cv_transform::operator=(const cv_transform& rhs) {
- m_enabled = rhs.m_enabled;
- m_name = rhs.m_name;
- return *this;
-}
-
-/** Prepare the transform for the given image as configured.
- * Then, check if it is valid, and turn the transform on if so.
- * The preparation includes as much precomputation as possible. For example,
- * if the transformation consists of constructing four affine transform matrices
- * and applying them to the given image in sequence, the transform matrices
- * will be reduced to one. Then, the following function apply(image) will
- * finally apply it to the image.
- */
-inline bool cv_transform::determine_transform(const cv::Mat& image) {
- // clear any transform state computed for previous image
- // reset()
- m_enabled = check_to_enable();
- // if (!m_enabled) return false;
- // compute m_trans for the image and the configuration of the transform
- // Here, some transforms may not be applicable to the given image.
- // In that case, set m_enabled = false (or further throw an exception).
- return m_enabled;
-}
-
-/** Prepare the inverse transform to undo preprocessing transforms if needed
- * for postprocessing. Not all transforms can be or need to be inverted.
- * Then, check if it is valid, and turn the transform on if so.
- * By default, turn this off as we do not need to undo in most of the cases.
- * If manual overriding is needed to enable/disable the inverse transform,
- * implement such logic in this function along with the interfaces to
- * enable/disable it.
- */
-inline bool cv_transform::determine_inverse_transform() {
- // In case of manual overriding, if (!m_manual_switch) return false;
- // If this transform, by design, cannot be or does not need to be inverted,
- // return (m_enabled = false);
- //
- // If the transform has not been applied (e.g., m_trans has not been set),
- // return (m_enabled = false);
- // Note that this cannot be determined by m_enabled as the transform is turned
- // off once applied.
- //
- // Compute the inverse of m_trans and overwrite m_trans;
- // set m_enabled to true;
- // return true;
- return false;
-}
-
-/** Apply the transform once and turn it off.
- * To conditionally apply the transform given an image,
- * determine_transform(image) or determine_inverse_transform() must be called
- * in advance. These will do as much precomputation as possible. For example,
- * if the transformation consists of constructing four affine transform matrices
- * and applying them to the given image in sequence, the transform matrices
- * will be reduced to one. Then, this function will finally apply it to the image.
- * There are three possible ways to implement condition checking as shown below,
- * but here the third option is preferred for minimizing the number of calls:
- * 1. checking m_enabled internally
- * 2. calling is_enabled() externally
- * 3. relying on the return value of determine_transform()/determine_inverse_transform()
- */
-inline bool cv_transform::apply(cv::Mat& image) {
- // As the transform is applied once, turn this off
- m_enabled = false;
- // Return the success of the transform
- return true;
-}
-
-/// Return the pointer of a newly copy-constructed object
-inline cv_transform *cv_transform::clone() const {
- return static_cast(nullptr);
-}
-
-//inline std::string cv_transform::get_type() { return "image transform"; }
-
-inline std::string cv_transform::get_description() const {
- return std::string {} + get_type();
-}
-
-inline std::ostream& cv_transform::print(std::ostream& os) const {
- os << get_description(); // Print out configuration variables
- // Additionally, print out state variables as well
- return os;
-}
-
-std::ostream& operator<<(std::ostream& os, const cv_transform& tr);
-
-} // end of namespace lbann
-#endif // LBANN_HAS_OPENCV
-
-#endif // LBANN_CV_TRANSFORM_HPP
diff --git a/include/lbann/data_readers/cv_utils.hpp b/include/lbann/data_readers/cv_utils.hpp
deleted file mode 100644
index fdac1bc77e3..00000000000
--- a/include/lbann/data_readers/cv_utils.hpp
+++ /dev/null
@@ -1,498 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
-// Produced at the Lawrence Livermore National Laboratory.
-// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
-// the CONTRIBUTORS file.
-//
-// LLNL-CODE-697807.
-// All rights reserved.
-//
-// This file is part of LBANN: Livermore Big Artificial Neural Network
-// Toolkit. For details, see http://software.llnl.gov/LBANN or
-// https://github.com/LLNL/LBANN.
-//
-// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
-// may not use this file except in compliance with the License. You may
-// obtain a copy of the License at:
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-// implied. See the License for the specific language governing
-// permissions and limitations under the license.
-//
-// cv_utils .cpp .hpp - operations related to opencv images
-////////////////////////////////////////////////////////////////////////////////
-
-#ifndef LBANN_CV_UTILS_HPP
-#define LBANN_CV_UTILS_HPP
-
-#include
-#include // operator typeid
-#include "opencv_extensions.hpp"
-#include "cv_process.hpp"
-#include "lbann/utils/mild_exception.hpp"
-
-
-#ifdef LBANN_HAS_OPENCV
-namespace lbann {
-
-class cv_utils {
- public:
-
- // copy_cvMat_to_buf (with a temporary buffer)
- template
- static bool copy_cvMat_to_buf_with_full_info(const cv::Mat& image, std::vector& buf, const cv_process& pp);
-
- template
- static bool copy_cvMat_to_buf_with_known_type(const cv::Mat& image, std::vector& buf, const cv_process& pp);
-
- /** Copy a cv::Mat image into a serialized buffer.
- * The argument pp specifies the parameters for image preprocessing that
- * takes advantage of the OpenCV framework. Returns true if successful.
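- *
- * Sketch of the intended call pattern (editor's illustration; the uint8_t
- * element type of buf is an assumption about the byte buffer used here):
- * @code
- * std::vector<uint8_t> buf;
- * cv_process pp; // default-constructed: split channels, no flip
- * if (!cv_utils::copy_cvMat_to_buf(image, buf, pp)) {
- *   // handle failure; buf contents are unspecified
- * }
- * @endcode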
- */
- static bool copy_cvMat_to_buf(const cv::Mat& image, std::vector& buf, const cv_process& pp);
-
-
- // copy_buf_to_cvMat (with a temporary buffer)
- template
- static cv::Mat copy_buf_to_cvMat_with_full_info(const std::vector& buf, const int Width, const int Height, const cv_process& pp);
-
- template
- static cv::Mat copy_buf_to_cvMat_with_known_type(const std::vector& buf, const int Width, const int Height, const cv_process& pp);
-
- /** Reconstruct a cv::Mat image from a serialized buffer.
- * The image size is specified by Width and Height. Type identifies the
- * OpenCV image type. The last argument pp specifies the parameters for
- * image postprocessing that takes advantage of the OpenCV framework.
- * Returns a reconstructed cv::Mat image if successful and an empty one
- * otherwise.
- */
- static cv::Mat copy_buf_to_cvMat(const std::vector& buf, const int Width, const int Height, const int Type, const cv_process& pp);
-
-
- // copy_cvMat_to_buf (with an El::Matrix block)
- template
- static bool copy_cvMat_to_buf_with_full_info(const cv::Mat& image, CPUMat& buf, const cv_process& pp);
-
- template
- static bool copy_cvMat_to_buf_with_known_type(const cv::Mat& image, CPUMat& buf, const cv_process& pp);
-
- /** Copy a cv::Mat image into an El::Matrix block.
- * The argument pp specifies the parameters for image preprocessing that
- * takes advantage of the OpenCV framework. Returns true if successful.
- */
- static bool copy_cvMat_to_buf(const cv::Mat& image, CPUMat& buf, const cv_process& pp);
-
-
- // copy_buf_to_cvMat (with an El::Matrix block)
- template
- static cv::Mat copy_buf_to_cvMat_with_full_info(const CPUMat& buf, const int Width, const int Height, const cv_process& pp);
-
- template
- static cv::Mat copy_buf_to_cvMat_with_known_type(const CPUMat& buf, const int Width, const int Height, const cv_process& pp);
-
- /** Reconstruct a cv::Mat image from an El::Matrix block.
- * The image size is specified by Width and Height. Type identifies the
- * OpenCV image type. The last argument pp specifies the parameters for
- * image postprocessing that takes advantage of the OpenCV framework.
- * Returns a reconstructed cv::Mat image if successful and an empty one
- * otherwise.
- */
- static cv::Mat copy_buf_to_cvMat(const CPUMat& buf, const int Width, const int Height, const int Type, const cv_process& pp);
-
- /**
- * Use cv::imdecode() to load image data instead of relying on cv::imread().
- * This avoids reading the image header to determine the decoder directly from
- * the file but allows doing so from memory.
- * The arguments are the same as the ones with cv::imread() as well as the
- * return type. Avoiding the extra access to the underlying filesystem may
- * result in better performance.
- */
- static cv::Mat lbann_imread(const std::string& img_file_path, int flags, std::vector& buf, cv::Mat* image = nullptr);
-};
-
-
-//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-// copy_cvMat_to_buf (vector)
-/**
- * Copy a cv::Mat image into a serialized buffer. This requires the type of
- * channel values and the number of channels in the image to be known at
- * compile time. The defaults for these are the type uint8_t and 3 channels.
- * The argument pp specifies the parameters for image preprocessing that
- * takes advantage of the OpenCV framework. Returns true if successful.
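- *
- * For instance (editor's illustration, not from the original source; assumes
- * an 8-bit, 3-channel image matching the stated defaults T = uint8_t, NCh = 3):
- * @code
- * std::vector<uint8_t> buf;
- * cv_process pp;
- * // T and NCh must agree with image.depth() and image.channels()
- * bool ok = cv_utils::copy_cvMat_to_buf_with_full_info<uint8_t, 3>(image, buf, pp);
- * @endcode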
- */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_full_info( - const cv::Mat& image, std::vector& buf, const cv_process& pp) { - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - const int Width = image.cols; - const int Height = image.rows; - const int sz = Height*Width; - - buf.resize(sz*NCh*sizeof(T)); - auto *Pixels = reinterpret_cast(&(buf[0])); - - if (pp.to_split()) { - // TODO: like the case with the output in El::Matrixi type, branch on whether the - // input channel type T is same as that of the output (especially ::DataType) - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", false); - std::vector channels(NCh); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), Pixels); - } - cv::split(image, channels); - - Pixels = reinterpret_cast(&(buf[0])); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - cv_normalizer:: - scale(Pixels, Pixels + sz, Pixels, {trans[ch]}); - } - } else { - if (image.isContinuous()) { - cv_normalizer:: - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - Pixels, pp.get_transform_normalize()); - } else { - const int stride = Width*NCh; - for (int i = 0; i < Height; ++i, Pixels += stride) { - const auto *ptr = reinterpret_cast(image.ptr(i)); - cv_normalizer:: - scale(ptr, ptr+stride, Pixels, pp.get_transform_normalize()); - } - } - } - - return true; -} - -/** - * Copy a cv::Mat image into a serialized buffer. This requires the type of - * channel values to be known at compile time. The default type is uint8_t. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. - */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_known_type( - const cv::Mat& image, std::vector& buf, const cv_process& pp) { - _SWITCH_CV_FUNC_KNOWN_TYPE_3PARAMS(image.channels(), T, \ - copy_cvMat_to_buf_with_full_info, \ - image, buf, pp) - return false; -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - - -//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// copy_buf_to_cvMat (vector) -/** - * Reconstruct a cv::Mat image from a serialized buffer. This requires the type - * of channel values and the number of channels in the image to be known at - * compile time. The default for these are the type uint8_t and 3 channels. - * The image size is specified by Width and Height. The argument pp specifies - * the parameters for image postprocessing that takes advantage of the OpenCV - * framework. Returns an empty image if unsuccessful. - */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_full_info( - const std::vector& buf, const int Width, const int Height, const cv_process& pp) { - - const int sz = Height*Width; - - _LBANN_MILD_EXCEPTION(sz*NCh*sizeof(T) != buf.size(), \ - "Size mismatch. 
Buffer has " << buf.size() << " items when " \ - << sz*NCh*sizeof(T) << " are expected.", \ - cv::Mat()) - - const auto *Pixels = reinterpret_cast(&(buf[0])); - - cv::Mat image = cv::Mat(Height, Width, CV_MAKETYPE(cv::DataType::depth, NCh)); - - if (pp.to_split()) { - // TODO: like the case with the output of El::Matrix type, branch on whether the - // input channel type T is same as that of the output (especially ::DataType) - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", cv::Mat()); - std::vector channels(NCh); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), const_cast(Pixels)); - } - - cv::merge(channels, image); - auto *optr = reinterpret_cast(image.data); - for(size_t ch=0; ch < NCh; ++ch, optr += sz) { - cv_normalizer:: - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - optr, {trans[ch]}); - } - } else { - cv_normalizer:: - scale(Pixels, Pixels + sz*NCh, reinterpret_cast(image.data), pp.get_transform_normalize()); - } - - return image; -} - -/** - * Reconstruct a cv::Mat image from a serialized buffer. This requires the type - * of channel values to be known at compile time. The default type is uint8_t. - * The image size is specified by Width and Height. The last argument pp - * specifies the parameters for image postprocessing that takes advantage of the - * OpenCV framework. Returns a reconstructed cv::Mat image if successful and an - * empty one otherwise. - */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_known_type( - const std::vector& buf, const int Width, const int Height, const cv_process& pp) { - _LBANN_MILD_EXCEPTION(buf.size() == 0u || Width == 0 || Height == 0, \ - "An empty image (" << Height << " x " << Width << ") or a buffer (" << buf.size() << ")", \ - cv::Mat()) - - const auto sz = static_cast(Width*Height*sizeof(T)); - const size_t NCh = buf.size()/sz; - - _LBANN_MILD_EXCEPTION(sz*NCh != buf.size(), \ - "Size mismatch. Buffer has " << buf.size() << " items when " << sz*NCh << " are expected.", \ - cv::Mat()) - - _SWITCH_CV_FUNC_KNOWN_TYPE_4PARAMS(NCh, T, \ - copy_buf_to_cvMat_with_full_info, \ - buf, Width, Height, pp); - - _LBANN_DEBUG_MSG(NCh << "-channel image is not supported."); - return cv::Mat(); -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - - - -//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// copy_cvMat_to_buf (Elemental) -/** - * Copy a cv::Mat image into a data block of El::Matrix type. This - * requires the type of channel values and the number of channels in the image - * to be known at compile time. The default for these are the DataType of LBANN - * and 3 channels. In case of copying a single image into a collection of - * images as an existing El::Matrix matrix, a sub-matrix View can be passed. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. - */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_full_info( - const cv::Mat& image, CPUMat& buf, const cv_process& pp) { - // NCh need not be a template parameter here. It can be a function argument. 
- // However, keeping it as a static parameter enables custom accesses on pixels - // For example, - // using Vec_T = cv::Vec; - // image.at(y, x) = newPixel; - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - const int Width = image.cols; - const int Height = image.rows; - const int sz = Height*Width; - - if (buf.Height() != sz*NCh) { -#if 0 - return false; -#else - //_LBANN_DEBUG_MSG("Resizing buffer height to " << sz*NCh); - buf.Resize(sz*NCh, ((buf.Width()<1)? 1 : buf.Width())); -#endif - } - - DataType *Pixels = buf.Buffer(); - - if (pp.to_split()) { - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", false); - std::vector channels(NCh); - - if (std::is_same::value) { - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - // create a separate image per channel aliasing the memory of buf - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), Pixels); - } - Pixels = buf.Buffer(); - - cv::split(image, channels); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - cv_normalizer:: - scale(Pixels, Pixels + sz, Pixels, {trans[ch]}); - } - } else { - cv::split(image, channels); - - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - cv_normalizer:: - scale(reinterpret_cast(channels[ch].datastart), - reinterpret_cast(channels[ch].dataend), - Pixels, {trans[ch]}); - } - } - } else { - if (image.isContinuous()) { - cv_normalizer:: - scale(reinterpret_cast(image.datastart), - reinterpret_cast(image.dataend), - Pixels, pp.get_transform_normalize()); - } else { - const int stride = Width*NCh; - for (int i = 0; i < Height; ++i, Pixels += stride) { - const auto *ptr = reinterpret_cast(image.ptr(i)); - cv_normalizer:: - scale(ptr, ptr+stride, Pixels, pp.get_transform_normalize()); - } - } - } - - return true; -} - -/** - * Copy a cv::Mat image into a data block of El::Matrix type. This - * requires the type of channel values in the image to be known at compile time. - * The default for these are the DataType of LBANN. - * The argument pp specifies the parameters for image preprocessing that - * takes advantage of the OpenCV framework. Returns true if successful. - */ -template -inline bool cv_utils::copy_cvMat_to_buf_with_known_type( - const cv::Mat& image, CPUMat& buf, const cv_process& pp) { - _SWITCH_CV_FUNC_KNOWN_TYPE_3PARAMS(image.channels(), T, \ - copy_cvMat_to_buf_with_full_info, \ - image, buf, pp) - return false; -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - - -//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// copy_buf_to_cvMat (Elemental) -/** - * Reconstruct a cv::Mat image from a data block of El::Matrix type. - * This requires the type of channel values and the number of channels in the - * image to be known at compile time. The default for these are DataType of - * LBANN and 3 channels. In case of copying a single image data in a matrix - * of multiple images, a sub-matrix View can be passed. - * The image size is specified by Width and Height. The argument pp specifies - * the parameters for image postprocessing that takes advantage of the OpenCV - * framework. Returns an empty image if unsuccessful. 
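- *
- * A reconstruction sketch (editor's illustration; X is assumed to be a CPUMat
- * view holding one Height x Width, 3-channel image in LBANN's DataType, and
- * the explicit template arguments reflect the stated defaults):
- * @code
- * cv::Mat img =
- *   cv_utils::copy_buf_to_cvMat_with_full_info<DataType, 3>(X, Width, Height, pp);
- * if (img.empty()) {
- *   // reconstruction failed
- * }
- * @endcode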
- */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_full_info( - const CPUMat& buf, const int Width, const int Height, const cv_process& pp) { - - const int sz = Height*Width; - _LBANN_MILD_EXCEPTION(sz*NCh != buf.Height(), \ - "Size mismatch. Buffer has " << buf.Height() << " items in a column when " \ - << sz*NCh << " are expected.", \ - cv::Mat()) - - const DataType *Pixels = buf.LockedBuffer(); - - cv::Mat image = cv::Mat(Height, Width, CV_MAKETYPE(cv::DataType::depth, NCh)); - - if (pp.to_split()) { - std::vector trans = pp.get_transform_normalize(); - if (trans.size() == 0u) { - trans.assign(NCh, cv_normalizer::channel_trans_t(1.0, 0.0)); - } - _LBANN_MILD_EXCEPTION((trans.size() != NCh), - "Incorrect number of channels in transform", cv::Mat()); - std::vector channels(NCh); - - if (std::is_same::value) { - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1), - const_cast(Pixels)); - } - - cv::merge(channels, image); - const auto *iptr = reinterpret_cast(image.data); - auto *optr = reinterpret_cast(image.data); - - cv_normalizer:: - scale(iptr, iptr+sz*NCh, optr, trans); - } else { - for(size_t ch=0; ch < NCh; ++ch, Pixels += sz) { - channels[ch] = cv::Mat(Height, Width, CV_MAKETYPE(image.depth(),1)); - cv_normalizer:: - scale(Pixels, Pixels+sz, - reinterpret_cast(channels[ch].data), {trans[ch]}); - } - cv::merge(channels, image); - } - } else { - cv_normalizer:: - scale(Pixels, Pixels + sz*NCh, - reinterpret_cast(image.data), - pp.get_transform_normalize()); - } - - return image; -} - -/** - * Reconstruct a cv::Mat image from a data block of El::Matrix type. - * This requires the type of channel values to be known at compile time. The - * default type is DataType. In this case, the new image may require conversion - * to an integer type during postprocessing such that it can be stored in an - * typical image file format. An image can sometimes be constructed even when - * T is different from DataType if the type casting of a DataType value into T - * is valid. - * The image size is specified by Width and Height. The last argument pp - * specifies the parameters for image postprocessing that takes advantage of the - * OpenCV framework. This returns a reconstructed cv::Mat image if successful - * and an empty one otherwise. - */ -template -inline cv::Mat cv_utils::copy_buf_to_cvMat_with_known_type( - const CPUMat& buf, const int Width, const int Height, const cv_process& pp) { - _LBANN_MILD_EXCEPTION(buf.Height() == 0u || buf.Width() == 0u || Width == 0 || Height == 0, \ - "An empty image (" << Height << " x " << Width << ") or a buffer (" \ - << buf.Height() << " x " << buf.Width() << ").", \ - cv::Mat()) - - const int sz = Height*Width; - const int NCh = buf.Height()/sz; - - _LBANN_MILD_EXCEPTION(sz*NCh != buf.Height(), \ - "Size mismatch. 
Buffer has " << buf.Height() << " items in a column when " \ - << sz*NCh << " are expected.", \ - cv::Mat()) - - _SWITCH_CV_FUNC_KNOWN_TYPE_4PARAMS(NCh, T, \ - copy_buf_to_cvMat_with_full_info, \ - buf, Width, Height, pp) - - _LBANN_DEBUG_MSG(NCh << "-channel image is not supported."); - return cv::Mat(); -} -//vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV - -#endif // LBANN_CV_UTILS_HPP diff --git a/include/lbann/data_readers/data_reader.hpp b/include/lbann/data_readers/data_reader.hpp index f5a8a4fe359..a20b6e49964 100644 --- a/include/lbann/data_readers/data_reader.hpp +++ b/include/lbann/data_readers/data_reader.hpp @@ -35,9 +35,9 @@ #include "lbann/comm.hpp" #include "lbann/io/file_io.hpp" #include "lbann/io/persist.hpp" -#include "lbann/data_readers/image_preprocessor.hpp" #include "lbann/utils/options.hpp" #include "lbann/utils/threads/thread_pool.hpp" +#include "lbann/transforms/transform_pipeline.hpp" #include #include #include @@ -62,7 +62,7 @@ class model; * classes should implement load and the appropriate subset of fetch_datum, * fetch_label, and fetch_response. */ -class generic_data_reader : public lbann_image_preprocessor { +class generic_data_reader { public: #define JAG_NOOP_VOID if (m_jag_partitioned) { return; } @@ -105,7 +105,7 @@ class generic_data_reader : public lbann_image_preprocessor { generic_data_reader(const generic_data_reader&) = default; generic_data_reader& operator=(const generic_data_reader&) = default; - ~generic_data_reader() override {} + virtual ~generic_data_reader() {} virtual generic_data_reader* copy() const = 0; /// set the comm object @@ -285,15 +285,6 @@ class generic_data_reader : public lbann_image_preprocessor { /// Fetch this mini-batch's responses into Y. virtual int fetch_responses(CPUMat& Y); - /** - * Save pixels to an image. The implementing data reader is responsible for - * handling format detection, conversion, etc. - */ - // TODO: This function needs to go away from here - void save_image(Mat& pixels, const std::string filename, - bool do_scale = true) override { - NOT_IMPLEMENTED("save_image"); - } /** * During the network's update phase, the data reader will * advanced the current position pointer. If the pointer wraps @@ -726,6 +717,11 @@ class generic_data_reader : public lbann_image_preprocessor { /// have identical shuffled indices virtual void post_update() {} + /** Set the transform pipeline this data reader will use. */ + void set_transform_pipeline(transform::transform_pipeline&& tp) { + m_transform_pipeline = std::move(tp); + } + protected: // For use with conduit when samples are corrupt. @@ -902,6 +898,10 @@ class generic_data_reader : public lbann_image_preprocessor { void set_jag_variables(int mb_size); model *m_model; + + /** Transform pipeline for preprocessing data. */ + transform::transform_pipeline m_transform_pipeline; + /// for use with data_store: issue a warning a single time if m_data_store != nullptr, /// but we're not retrieving a conduit::Node from the store. 
This typically occurs /// during the test phase diff --git a/include/lbann/data_readers/data_reader_csv.hpp b/include/lbann/data_readers/data_reader_csv.hpp index 58c55885c68..ae0ead7811f 100644 --- a/include/lbann/data_readers/data_reader_csv.hpp +++ b/include/lbann/data_readers/data_reader_csv.hpp @@ -30,7 +30,6 @@ #define LBANN_DATA_READER_CSV_HPP #include "data_reader.hpp" -#include "image_preprocessor.hpp" #include namespace lbann { diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index 67e78f78a36..3c9095af07c 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -30,8 +30,6 @@ #define IMAGE_DATA_READER_HPP #include "data_reader.hpp" -#include "image_preprocessor.hpp" -#include "cv_process.hpp" #include "lbann/data_store/data_store_conduit.hpp" namespace lbann { @@ -82,11 +80,6 @@ class image_data_reader : public generic_data_reader { return {m_image_num_channels, m_image_height, m_image_width}; } - void save_image(Mat& pixels, const std::string filename, bool do_scale = true) override { - internal_save_image(pixels, filename, m_image_height, m_image_width, - m_image_num_channels, do_scale); - } - /// Return the sample list of current minibatch std::vector get_image_list_of_current_mb() const; @@ -120,9 +113,9 @@ class image_data_reader : public generic_data_reader { int m_image_num_channels; ///< number of image channels int m_image_linearized_size; ///< linearized image size int m_num_labels; ///< number of labels - std::vector m_thread_cv_buffer; void load_conduit_node_from_file(int data_id, conduit::Node &node); + }; } // namespace lbann diff --git a/include/lbann/data_readers/data_reader_imagenet.hpp b/include/lbann/data_readers/data_reader_imagenet.hpp index afac32dbe24..ed674ff0d19 100644 --- a/include/lbann/data_readers/data_reader_imagenet.hpp +++ b/include/lbann/data_readers/data_reader_imagenet.hpp @@ -30,36 +30,29 @@ #define LBANN_DATA_READER_IMAGENET_HPP #include "data_reader_image.hpp" -#include "cv_process.hpp" namespace lbann { class imagenet_reader : public image_data_reader { public: - imagenet_reader(bool shuffle) = delete; - imagenet_reader(const std::shared_ptr& pp, bool shuffle = true); - imagenet_reader(const imagenet_reader&); - imagenet_reader(const imagenet_reader&, const std::vector& ds_sample_move_list);; - imagenet_reader(const imagenet_reader&, const std::vector& ds_sample_move_list, std::string role); - imagenet_reader& operator=(const imagenet_reader&); + imagenet_reader(bool shuffle = true); + imagenet_reader(const imagenet_reader&, + const std::vector& ds_sample_move_list); + imagenet_reader(const imagenet_reader&, + const std::vector& ds_sample_move_list, std::string role); + imagenet_reader(const imagenet_reader&) = default; + imagenet_reader& operator=(const imagenet_reader&) = default; ~imagenet_reader() override; imagenet_reader* copy() const override { return new imagenet_reader(*this); } - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; - std::string get_type() const override { return "imagenet_reader"; } protected: void set_defaults() override; - virtual bool replicate_processor(const cv_process& pp, const int nthreads); virtual CPUMat create_datum_view(CPUMat& X, const int mb_idx) const; bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - - /// preprocessor duplicated for each omp thread - std::vector > m_pps; - std::unique_ptr m_master_pps; }; } // namespace lbann diff --git 
a/include/lbann/data_readers/data_reader_imagenet_patches.hpp b/include/lbann/data_readers/data_reader_imagenet_patches.hpp deleted file mode 100644 index 49539429fab..00000000000 --- a/include/lbann/data_readers/data_reader_imagenet_patches.hpp +++ /dev/null @@ -1,74 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_data_reader_imagenet_patches .hpp .cpp - extract patches from ImageNet dataset -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_DATA_READER_IMAGENET_PATCHES_HPP -#define LBANN_DATA_READER_IMAGENET_PATCHES_HPP - -#include "data_reader_image.hpp" -#include "cv_process_patches.hpp" - -namespace lbann { -class imagenet_reader_patches : public image_data_reader { - public: - imagenet_reader_patches(bool shuffle) = delete; - imagenet_reader_patches(const std::shared_ptr& pp, bool shuffle = true); - imagenet_reader_patches(const imagenet_reader_patches&); - imagenet_reader_patches& operator=(const imagenet_reader_patches&); - ~imagenet_reader_patches() override; - - imagenet_reader_patches* copy() const override { return new imagenet_reader_patches(*this); } - - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; - - std::string get_type() const override { - return "imagenet_reader_patches"; - } - - int get_linearized_data_size() const override { - return m_image_linearized_size * m_num_patches; - } - const std::vector get_data_dims() const override { - return {m_num_patches*m_image_num_channels, m_image_height, m_image_width}; - } - - protected: - void set_defaults() override; - virtual bool replicate_processor(const cv_process_patches& pp, const int nthreads); - virtual std::vector create_datum_views(CPUMat& X, const int mb_idx) const; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - - protected: - int m_num_patches; ///< number of patches extracted - /// preprocessor for patches duplicated for each omp thread - std::vector > m_pps; - std::unique_ptr m_master_pps; -}; - -} // namespace lbann - -#endif // LBANN_DATA_READER_IMAGENET_PATCHES_HPP diff --git a/include/lbann/data_readers/data_reader_jag.hpp b/include/lbann/data_readers/data_reader_jag.hpp index c10daf0c9de..f437e6f8dbc 100644 --- a/include/lbann/data_readers/data_reader_jag.hpp +++ b/include/lbann/data_readers/data_reader_jag.hpp @@ -31,7 +31,6 @@ #include #include #include "lbann/base.hpp" -#include "lbann/data_readers/opencv.hpp" #include "data_reader.hpp" namespace lbann { 
@@ -106,8 +105,6 @@ class data_reader_jag : public generic_data_reader { /// Return the pointer to the raw image data data_t* get_image_ptr(const size_t i) const; - /// Return the image data as a 1-D vector of lbann::DataType - cv::Mat get_image(const size_t i) const; /// Return the pointer to the raw scalar data scalar_t* get_scalar_ptr(const size_t i) const; @@ -119,8 +116,6 @@ class data_reader_jag : public generic_data_reader { /// Return the input values of the simulation corresponding to the i-th sample std::vector<input_t> get_input(const size_t i) const; - void save_image(Mat& pixels, const std::string filename, bool do_scale = true) override; - protected: /// add data type for independent variable void add_independent_variable_type(const variable_t independent); diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index 7d91c89c300..71b03a883de 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -27,19 +27,18 @@ #ifndef _DATA_READER_JAG_CONDUIT_HPP_ #define _DATA_READER_JAG_CONDUIT_HPP_ -#include "lbann_config.hpp" +#include "lbann_config.hpp" -#include "lbann/data_readers/opencv.hpp" #include "data_reader.hpp" #include "conduit/conduit.hpp" #include "hdf5.h" -#include "lbann/data_readers/cv_process.hpp" #include #include #include #include #include #include "lbann/data_readers/sample_list_jag.hpp" +#include namespace lbann { @@ -75,8 +74,7 @@ class data_reader_jag_conduit : public generic_data_reader { /// Type to define a prefix string and the minimum length requirement to filter out a key using prefix_t = std::pair<std::string, size_t>; - data_reader_jag_conduit(bool shuffle = true) = delete; - data_reader_jag_conduit(const std::shared_ptr<cv_process>& pp, bool shuffle = true); + data_reader_jag_conduit(bool shuffle = true); data_reader_jag_conduit(const data_reader_jag_conduit&); data_reader_jag_conduit(const data_reader_jag_conduit&, const std::vector<int>& ds_sample_move_list); data_reader_jag_conduit& operator=(const data_reader_jag_conduit&); @@ -215,15 +213,6 @@ class data_reader_jag_conduit : public generic_data_reader { /// Show the description std::string get_description() const; - /// Return the image simulation output of the i-th sample - std::vector<cv::Mat> get_cv_images(const size_t i, conduit::Node& sample) const; - - /** - * Return the images of the i-th sample as a 1-D vector of lbann::DataType - * There is one image per view, each of which is taken at closest to the bang time.
- */ - std::vector get_images(const size_t i, conduit::Node& sample) const; - /// Return the scalar simulation output data of the i-th sample std::vector<scalar_t> get_scalars(const size_t i, conduit::Node& sample) const; @@ -233,13 +222,8 @@ class data_reader_jag_conduit : public generic_data_reader { template <typename S> static size_t add_val(const std::string key, const conduit::Node& n, std::vector<S>& vals); - void save_image(Mat& pixels, const std::string filename, bool do_scale = true) override; - void setup_data_store(int mini_batch_size); - /// A utility function to convert the pointer to image data into an opencv image - static cv::Mat cast_to_cvMat(const std::pair img, - const int height, const int num_ch=1); /// A utility function to convert a JAG variable type to name string static std::string to_string(const variable_t t); @@ -261,7 +245,6 @@ class data_reader_jag_conduit : public generic_data_reader { void preload_data_store() override; virtual void set_defaults(); - virtual bool replicate_processor(const cv_process& pp, const int nthreads); virtual void copy_members(const data_reader_jag_conduit& rhs, const std::vector<int>& ds_sample_move_list = std::vector<int>()); /// add data type for independent variable @@ -409,10 +392,6 @@ class data_reader_jag_conduit : public generic_data_reader { /// Keys to select a set of simulation input parameters to use. By default, use all. std::vector<std::string> m_input_keys; - /// preprocessor duplicated for each omp thread - std::vector<std::unique_ptr<cv_process>> m_pps; - std::unique_ptr<cv_process> m_master_pps; - /** * Set of keys that are associated with non_numerical values. * Such a variable requires a specific method for mapping to a numeric value. @@ -469,11 +448,6 @@ class data_reader_jag_conduit : public generic_data_reader { sample_list_jag m_sample_list; bool m_list_per_trainer; bool m_list_per_model; - - /** temporary image normalization * The inputs are the image to normalize, the image source id and the channel id. */ - void image_normalization(cv::Mat& img, size_t i, size_t ch) const; }; /** diff --git a/include/lbann/data_readers/data_reader_mnist.hpp b/include/lbann/data_readers/data_reader_mnist.hpp index 2d3b30e0ed6..ebd8df8ec27 100644 --- a/include/lbann/data_readers/data_reader_mnist.hpp +++ b/include/lbann/data_readers/data_reader_mnist.hpp @@ -30,7 +30,6 @@ #define LBANN_DATA_READER_MNIST_HPP #include "data_reader_image.hpp" -#include "image_preprocessor.hpp" namespace lbann { diff --git a/include/lbann/data_readers/data_reader_mnist_siamese.hpp b/include/lbann/data_readers/data_reader_mnist_siamese.hpp deleted file mode 100644 index 4536e3cebad..00000000000 --- a/include/lbann/data_readers/data_reader_mnist_siamese.hpp +++ /dev/null @@ -1,126 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License.
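With the preprocessor-taking constructor gone, the JAG conduit reader is built from the shuffle flag alone. A minimal sketch of the new construction path; the file name is illustrative, and set_data_filename comes from the generic_data_reader base class:

#include "lbann/data_readers/data_reader_jag_conduit.hpp"

void construct_jag_reader_example() {
  // Before this series the only public constructor required a cv_process;
  // after it, the reader no longer depends on OpenCV preprocessing.
  lbann::data_reader_jag_conduit reader(/*shuffle=*/true);
  reader.set_data_filename("sample_list.txt"); // illustrative path
}
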
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// data_reader_mnist_siamese .hpp .cpp - data reader class for mnist dataset -// employing two images per sample to feed siamese model -//////////////////////////////////////////////////////////////////////////////// - -#ifndef DATA_READER_MNIST_SIAMESE_HPP -#define DATA_READER_MNIST_SIAMESE_HPP - -#include "data_reader_multi_images.hpp" -#include "cv_process.hpp" -#include -#include -#include -#include - -namespace lbann { - -/** - * With MNIST dataset, there is no individual image file. All the images or - * labels are packed into a single binary file respectively. This reader - * pre-loads all the data into memory as minist_reader does. - * However, to feed a siamese model, this reader randomly chooses the paired - * input on-line. It maintains another data index list, 'm_shuffled_indices2'. - * It first copies the primary list maintined by the base class to the secondary - * list, and shuffles the secondary whenever the primary gets shuffled via the - * overridden shuffle_indices() method. - */ -class data_reader_mnist_siamese : public data_reader_multi_images { - public: - using label_t = unsigned char; - using sample_t = std::pair; - - data_reader_mnist_siamese(const std::shared_ptr& pp, bool shuffle = true); - data_reader_mnist_siamese(const data_reader_mnist_siamese&); - data_reader_mnist_siamese& operator=(const data_reader_mnist_siamese&); - ~data_reader_mnist_siamese() override; - - data_reader_mnist_siamese* copy() const override { - return new data_reader_mnist_siamese(*this); - } - - std::string get_type() const override { - return "data_reader_mnist_siamese"; - } - - /** Set up MNIST dataset-specific input parameters, which are pre-defined - * and also set as the default. This does not change the setup, but only - * preserves the default. - */ - void set_input_params(const int, const int, const int, const int) override; - - // dataset specific functions - void load() override; - - /// Fetch this mini-batch's samples into X by calling the new overloaded fetch_datum() - int fetch_data(CPUMat& X, El::Matrix& indices_fetched) override; - /// Fetch this mini-batch's labels into Y by calling the new overloaded fetch_label() - int fetch_labels(CPUMat& Y) override; - - protected: - /** - * Set the default configuration such as the width, height, and number of - * channels of the image sample. - */ - void set_defaults() override; - - // unused virtual interfaces replaced by the new interfaces that taks a pair - // of indices to sample list. - using data_reader_multi_images::fetch_datum; - using data_reader_multi_images::fetch_label; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - /** - * Fetch two data items identified by the pair of indices to the pre-loaded data list, - * and put them into the column mb_idx of matrix x. - */ - virtual bool fetch_datum(CPUMat& X, std::pair data_id, int mb_idx); - /** - * Take a pair of indices to the preloaded sample list, and compare the labels - * of the corresponding samples. 
Store 1 if equal or 0 at the column mb_idx of - * the given matrix Y. - */ - virtual bool fetch_label(CPUMat& Y, std::pair data_id, int mb_idx); - - /** - * Shuffle the second index list added in this class as well as the one in the - * base class whenever the latter gets shuffled. - */ - void shuffle_indices() override; - - protected: - using generic_data_reader::m_shuffled_indices; - /// To randomly choose the siamese pair input online - std::vector m_shuffled_indices2; - /// Store the preloaded data - std::vector> m_image_data; -}; - -} // namespace lbann - -#endif // DATA_READER_MNIST_SIAMESE_HPP diff --git a/include/lbann/data_readers/data_reader_multi_images.hpp b/include/lbann/data_readers/data_reader_multi_images.hpp index 93a2959bd7d..31ab31c2881 100644 --- a/include/lbann/data_readers/data_reader_multi_images.hpp +++ b/include/lbann/data_readers/data_reader_multi_images.hpp @@ -31,7 +31,6 @@ #define DATA_READER_MULTI_IMAGES_HPP #include "data_reader_imagenet.hpp" -#include "cv_process.hpp" #include #include #include @@ -43,8 +42,7 @@ class data_reader_multi_images : public imagenet_reader { using img_src_t = std::vector; using sample_t = std::pair; - data_reader_multi_images(bool shuffle) = delete; - data_reader_multi_images(const std::shared_ptr& pp, bool shuffle = true); + data_reader_multi_images(bool shuffle = true); data_reader_multi_images(const data_reader_multi_images&); data_reader_multi_images& operator=(const data_reader_multi_images&); ~data_reader_multi_images() override; @@ -99,7 +97,6 @@ class data_reader_multi_images : public imagenet_reader { protected: void set_defaults() override; virtual std::vector create_datum_views(CPUMat& X, const int mb_idx) const; - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; bool read_text_stream(std::istream& text_stream, std::vector& list); diff --git a/include/lbann/data_readers/data_reader_multihead_siamese.hpp b/include/lbann/data_readers/data_reader_multihead_siamese.hpp index dc95f3cb7e8..55e79f95eca 100644 --- a/include/lbann/data_readers/data_reader_multihead_siamese.hpp +++ b/include/lbann/data_readers/data_reader_multihead_siamese.hpp @@ -31,7 +31,6 @@ #define DATA_READER_MULTIHEAD_SIAMESE_HPP #include "data_reader_multi_images.hpp" -#include "cv_process.hpp" #include "offline_patches_npz.hpp" #include #include @@ -44,8 +43,8 @@ class data_reader_multihead_siamese : public data_reader_multi_images { using label_t = offline_patches_npz::label_t; using sample_t = offline_patches_npz::sample_t; - data_reader_multihead_siamese(const std::shared_ptr& pp, unsigned int nimages, bool shuffle = true); - data_reader_multihead_siamese(const std::shared_ptr& pp, bool shuffle = true); + data_reader_multihead_siamese(unsigned int nimages, bool shuffle = true); + data_reader_multihead_siamese(bool shuffle = true); data_reader_multihead_siamese(const data_reader_multihead_siamese&); data_reader_multihead_siamese& operator=(const data_reader_multihead_siamese&); diff --git a/include/lbann/data_readers/data_reader_triplet.hpp b/include/lbann/data_readers/data_reader_triplet.hpp index a1ee9e07871..8fde7501916 100644 --- a/include/lbann/data_readers/data_reader_triplet.hpp +++ b/include/lbann/data_readers/data_reader_triplet.hpp @@ -34,7 +34,6 @@ #define DATA_READER_TRIPLET_HPP #include "data_reader_multi_images.hpp" -#include "cv_process.hpp" #include "offline_patches_npz.hpp" #include #include @@ -47,7 +46,7 @@ class data_reader_triplet : public 
data_reader_multi_images { using label_t = offline_patches_npz::label_t; using sample_t = offline_patches_npz::sample_t; - data_reader_triplet(const std::shared_ptr& pp, bool shuffle = true); + data_reader_triplet(bool shuffle = true); data_reader_triplet(const data_reader_triplet&); data_reader_triplet& operator=(const data_reader_triplet&); ~data_reader_triplet() override; diff --git a/include/lbann/data_readers/image_preprocessor.hpp b/include/lbann/data_readers/image_preprocessor.hpp deleted file mode 100644 index fb730e23bf1..00000000000 --- a/include/lbann/data_readers/image_preprocessor.hpp +++ /dev/null @@ -1,209 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// image_preprocessor.hpp - Preprocessing utilities for image inputs -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_IMAGE_PREPROCESSOR -#define LBANN_IMAGE_PREPROCESSOR - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#include "lbann/data_readers/opencv.hpp" -#else -#error OpenCV required -#endif -#include "lbann/base.hpp" - -namespace lbann { - -/** - * Support class for preprocessing image inputs. - * Supports the following transforms: - * - Random horizontal and vertical flips - * - Random rotations - * - Random horizontal and vertical shifts - * - Random shearing - * - Standardize to 0 mean - * - Standardize to unit variance - * - Scale to the range [0, 1] - * - Normalize via z-score - */ -class lbann_image_preprocessor { - public: - lbann_image_preprocessor(); - lbann_image_preprocessor(const lbann_image_preprocessor&) = default; - lbann_image_preprocessor& operator=( - const lbann_image_preprocessor&) = default; - virtual ~lbann_image_preprocessor() {} - - /** Whether to do random horizontal flips. */ - void horizontal_flip(bool b) { - m_horizontal_flip = b; - } - /** Whether to do random vertical flips. */ - void vertical_flip(bool b) { - m_vertical_flip = b; - } - /** Do random rotations up to range degrees (0-180). */ - void rotation(float range) { - m_rotation_range = range; - } - /** Do random horizontal shifts up to range (fraction of image width). */ - void horizontal_shift(float range) { - m_horizontal_shift = range; - } - /** Do random vertical shifts up to range (fraction of image height). */ - void vertical_shift(float range) { - m_vertical_shift = range; - } - /** Do random shears up to range (radians). 
*/ - void shear_range(float range) { - m_shear_range = range; - } - /** Whether to subtract the sample-wise mean. */ - void subtract_mean(bool b) { - m_mean_subtraction = b; - } - /** Whether to normalize to unit variance, sample-wise. */ - void unit_variance(bool b) { - m_unit_variance = b; - } - /** Whether to scale to [0, 1] (assumes max value is 255). */ - void scale(bool b) { - m_scale = b; - } - /** - * Whether to normalize by z-scores, sample-wise. - * This and mean subtraction/unit variance are mutually exclusive. - */ - void z_score(bool b) { - m_z_score = b; - } - /** Disable all data augmentation. */ - void disable_augmentation() { - horizontal_flip(false); - vertical_flip(false); - rotation(0.0f); - horizontal_shift(0.0f); - vertical_shift(0.0f); - shear_range(0.0f); - } - - /** - * Add noise to data (disable by default) - * noise_factor control the ammount of noise - * to be set to a value above zero but less than 1 (say 0.5) - * */ - void add_noise(float noise_factor=0.0f) { - m_noise_factor = noise_factor; - } - - /** - * Preprocess pixels according to the currently-set augmentation transforms. - * @param pixels The pixels to process as a column vector (num x 1 mat). - * @param imheight Height of the image in pixels. - * @param imwidth Width of the image in pixels. - * @param num_channels The number of channels pixels has. - */ - void augment(Mat& pixels, unsigned imheight, unsigned imwidth, - unsigned num_channels); - /** - * Normalize poxels according to the currently-set transforms. - * @param pixels The pixels to process as a column vector. - * @param num_channels The number of channels pixels has. - */ - void normalize(Mat& pixels, unsigned num_channels); - - /** - * External interface to saving an image. - * Classes that want to support this should use it to interface with - * internal_save_image. - * @param pixels The image to save (as a column vector). - * @param filename The image filename (type inferred from extension). - * @param do_scale Whether pixels has been scaled (default true). - */ - virtual void save_image(Mat& pixels, const std::string filename, - bool do_scale = true) {} - - protected: - /** Whether to do horizontal flips. */ - bool m_horizontal_flip; - /** Whether to do vertical flips. */ - bool m_vertical_flip; - /** Range in degrees for rotations (0-180). */ - float m_rotation_range; - /** Range (fraction of total width) for horizontal shifts. */ - float m_horizontal_shift; - /** Range (fraction of total height) for vertical shifts. */ - float m_vertical_shift; - /** Shear angle (radians). */ - float m_shear_range; - /** Whether to normalize to 0 mean. */ - bool m_mean_subtraction; - /** Whether to normalize to unit variance. */ - bool m_unit_variance; - /** Whether to scale to [0, 1]. */ - bool m_scale; - /** Whether to normalize via z-score. */ - bool m_z_score; - - float m_noise_factor; - - void mean_subtraction(Mat& pixels, unsigned num_channels); - void unit_variance(Mat& pixels, unsigned num_channels); - void unit_scale(Mat& pixels, unsigned num_channels); - void z_score(Mat& pixels, unsigned num_channels); - - void pixel_noise(Mat& pixels); - - /** - * Convert a column vector of pixels to an OpenCV matrix. - */ - cv::Mat cv_pixels(const Mat& pixels, unsigned imheight, unsigned imwidth, - unsigned num_channels); - /** Undo cv_pixels. */ - void col_pixels(const cv::Mat& sqpixels, Mat& pixels, unsigned num_channels); - - /** @brief Flip sqpixels. 
- * @param sqpixels The image to flip - * @param flip_flag OpenCV flip flag: 0=vertical, 1=horizontal, -1=both. - */ - void flip(cv::Mat& sqpixels, int flip_flag); - /** Apply the affine transformation in 3x3 matrix trans. */ - void affine_trans(cv::Mat& sqpixels, const Mat& trans); - - /** - * Save pixels to filename. - */ - void internal_save_image(Mat& pixels, const std::string filename, - unsigned imheight, unsigned imwidth, - unsigned num_channels, bool do_scale); -}; - -} // namespace lbann - -#endif // LBANN_IMAGE_PREPROCESSOR diff --git a/include/lbann/data_readers/image_utils.hpp b/include/lbann/data_readers/image_utils.hpp deleted file mode 100644 index b52a7f4cb78..00000000000 --- a/include/lbann/data_readers/image_utils.hpp +++ /dev/null @@ -1,86 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
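The deleted lbann_image_preprocessor bundled augmentation with per-sample normalization (mean subtraction, unit variance, [0, 1] scaling, z-score). For reference, a minimal sketch of the sample-wise z-score it describes, assuming one sample stored as a column vector; this is an illustration, not the removed implementation:

#include <cmath>

// x <- (x - mean(x)) / stdev(x), guarding against a zero stdev.
void z_score_column(CPUMat& pixels) {
  const El::Int n = pixels.Height();
  DataType mean = 0;
  for (El::Int i = 0; i < n; ++i) { mean += pixels(i, 0); }
  mean /= n;
  DataType var = 0;
  for (El::Int i = 0; i < n; ++i) {
    const DataType d = pixels(i, 0) - mean;
    var += d * d;
  }
  const DataType stdev = std::sqrt(var / n);
  const DataType denom = (stdev > DataType(0)) ? stdev : DataType(1);
  for (El::Int i = 0; i < n; ++i) {
    pixels(i, 0) = (pixels(i, 0) - mean) / denom;
  }
}
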
-// -// image_utils .cpp .hpp - Image I/O utility functions -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_IMAGE_UTILS_HPP -#define LBANN_IMAGE_UTILS_HPP - -#include "lbann/base.hpp" -#include -#include // operator typeid - -#ifdef LBANN_HAS_OPENCV -#include "lbann/data_readers/cv_utils.hpp" -#include "lbann/data_readers/cv_process_patches.hpp" -#endif - - -namespace lbann { -class image_utils { - public: - static bool loadIMG(std::vector& image_buf, int& Width, int& Height, bool Flip, unsigned char *&Pixels); - static bool loadIMG(const std::string& Imagefile, int& Width, int& Height, bool Flip, unsigned char *&Pixels, std::vector& buf); - static bool saveIMG(const std::string& Imagefile, int Width, int Height, bool Flip, unsigned char *Pixels); - -#ifdef LBANN_HAS_OPENCV - // The other load/import methods rely on these core methods - /// process an image and put it into an LBANN Mat data block - static bool process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& out); - /// process an image and put it into a serialized buffer - static bool process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process& pp, std::vector& out); - /// process an image and put it into an LBANN Mat data blocks - static bool process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& out); -#endif // LBANN_HAS_OPENCV - - // new function, to support sharded data reader and data store functionality - static bool load_image(std::vector& image_buf, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, cv::Mat* cv_buf = nullptr); - - // new function, to support sharded data reader and data store functionality - static bool load_image(std::vector& image_buf, - int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, cv::Mat* cv_buf = nullptr); - - // load/save an image into/from an LBANN data block of El::Matrix type - // Use a thread save temporary buffer for decoding the image - /// Load an image from a file and put it into an LBANN Mat data block - static bool load_image(const std::string& filename, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, std::vector& buf, cv::Mat* cv_buf = nullptr); - /// Load an image from a file, extract patches from it and put them into LBANN Mat data blocks - static bool load_image(const std::string& filename, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, std::vector& buf, cv::Mat* cv_buf = nullptr); - /// Save an image using data from an LBANN Mat data block - static bool save_image(const std::string& filename, const int Width, const int Height, const int Type, cv_process& pp, const CPUMat& data); - - // import/export via a buffer of std::vector containg the raw bytes of an image file - /// Import an image from a file buffer (inbuf) and put it into an LBANN Mat data block - static bool import_image(cv::InputArray inbuf, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, cv::Mat* cv_buf = nullptr); - /// Import an image from a file buffer (inbuf), extract patches from it and put them into LBANN Mat data blocks - static bool import_image(cv::InputArray inbuf, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, cv::Mat* cv_buf = nullptr); - /// Export an image using data from an LBANN Mat block into a file buffer (outbuf) - static bool export_image(const std::string& fileExt, std::vector& outbuf, const int Width, const 
int Height, const int Type, cv_process& pp, const CPUMat& data); -}; - -} // end of namespace lbann - -#endif // LBANN_IMAGE_UTILS_HPP diff --git a/include/lbann/data_readers/opencv.hpp b/include/lbann/data_readers/opencv.hpp deleted file mode 100644 index 9adc7efa0d7..00000000000 --- a/include/lbann/data_readers/opencv.hpp +++ /dev/null @@ -1,68 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// opencv.hpp - LBANN header for opencv -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN header for opencv - * - includes OpenCV headers according to the version - * - use newer built-in variables in place of the deprecated ones for newer OpenCV - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _LBANN_OPENCV_H_INCLUDED_ -#define _LBANN_OPENCV_H_INCLUDED_ - -#include -#if (!defined(CV_VERSION_EPOCH) && (CV_VERSION_MAJOR >= 3)) -#include -#include -#include -#define _LBANN_CV_UNCHANGED_ cv::IMREAD_UNCHANGED -#define _LBANN_CV_GRAYSCALE_ cv::IMREAD_GRAYSCALE -#define _LBANN_CV_COLOR_ cv::IMREAD_COLOR -#define _LBANN_CV_ANYDEPTH_ cv::IMREAD_ANYDEPTH -#define _LBANN_CV_ANYCOLOR_ cv::IMREAD_ANYCOLOR -#else -#include -#include -#include -#include -#define _LBANN_CV_UNCHANGED_ CV_LOAD_IMAGE_UNCHANGED -#define _LBANN_CV_GRAYSCALE_ CV_LOAD_IMAGE_GRAYSCALE -#define _LBANN_CV_COLOR_ CV_LOAD_IMAGE_COLOR -#define _LBANN_CV_ANYDEPTH_ CV_LOAD_IMAGE_ANYDEPTH -#define _LBANN_CV_ANYCOLOR_ CV_LOAD_IMAGE_ANYCOLOR -#endif - -#define _LBANN_CV_BLUE_ 0 -#define _LBANN_CV_GREEN_ 1 -#define _LBANN_CV_RED_ 2 - -#endif // _LBANN_OPENCV_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/opencv_extensions.hpp b/include/lbann/data_readers/opencv_extensions.hpp deleted file mode 100644 index b24ed360d4d..00000000000 --- a/include/lbann/data_readers/opencv_extensions.hpp +++ /dev/null @@ -1,233 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. 
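The deleted opencv.hpp existed to paper over the OpenCV 2 vs. 3 flag rename. A minimal sketch of how a caller stayed version-agnostic through its _LBANN_CV_* macros; the function name is illustrative:

#include "lbann/data_readers/opencv.hpp" // the header deleted above

// Resolves to cv::IMREAD_* on OpenCV >= 3 and CV_LOAD_IMAGE_* on OpenCV 2.
cv::Mat load_image_any_depth(const std::string& path) {
  return cv::imread(path, _LBANN_CV_ANYDEPTH_ | _LBANN_CV_ANYCOLOR_);
}
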
-// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// opencv_extensions.hpp - LBANN's cv::Mat pixel type handling mechanisms -//////////////////////////////////////////////////////////////////////////////// - -#ifdef LBANN_HAS_OPENCV -#ifndef _LBANN_OPENCV_EXTENSIONS_H_INCLUDED_ -#define _LBANN_OPENCV_EXTENSIONS_H_INCLUDED_ -#include "lbann/data_readers/opencv.hpp" - -namespace lbann { - -/// A template structure to convert an OpenCV identifier of channel depth to a standard C++ type -template class cv_depth_type {}; - -/// define a specialized mapper from a CV channel type to its c++ native type -#define _def_cv_depth_translation(_CV_TYPE_, _NATIVE_TYPE_) \ -template<> struct cv_depth_type<_CV_TYPE_> { \ - public: \ - using standard_type = _NATIVE_TYPE_; \ -} - -/// cv_depth_type maps to uint8_t -_def_cv_depth_translation(CV_8U, uint8_t); -/// cv_depth_type maps to int8_t -_def_cv_depth_translation(CV_8S, int8_t); -/// cv_depth_type maps to uint16_t -_def_cv_depth_translation(CV_16U, uint16_t); -/// cv_depth_type maps to int16_t -_def_cv_depth_translation(CV_16S, int16_t); -/// cv_depth_type maps to int32_t -_def_cv_depth_translation(CV_32S, int32_t); -/// cv_depth_type maps to float -_def_cv_depth_translation(CV_32F, float); -/// cv_depth_type maps to double -_def_cv_depth_translation(CV_64F, double); - - -/// Convert an OpenCV identifier of image depth to a standard C++ type -#define _depth_type(_cv_depth_) lbann::cv_depth_type<_cv_depth_>::standard_type - - -/** A template structure to map the type of channel into the - * corresponding OpenCV type identifier of image. - * - _T_: The channel value type as a native C++ type - */ -template -struct cv_image_type { - /** A static member function which returns the OpenCV image type based on - * the channel type and number of channels: - * - _C_: The number of channels It ranges from 1 to CV_CN_MAX which is 512 - */ - static int T(const int _C_) { - return CV_MAKETYPE(cv::DataType<_T_>::depth, _C_); - } - /** A static member function which maps a native c++ type to the corresponding - * OpenCV channel type. 
- * The depth value returned ranges from 0 to (CV_DEPTH_MAX-1) which is 7 - */ - static int T() { - return cv::DataType<_T_>::depth; - } -}; - - -template -struct depth_normalization { - static double factor() { - if (!std::is_integral::value) { - return 1.0; - } else { - return 1.0/std::numeric_limits::max(); - } - } - static double inverse_factor() { - if (!std::is_integral::value) { - return 1.0; - } else { - return std::numeric_limits::max(); - } - } -}; - -template<> -struct depth_normalization { - static double factor() { - return 1.0; - } - static double inverse_factor() { - return 1.0; - } -}; - -/// Checks if an OpenCV depth code corresponds to an integral type -inline bool is_float(const int cv_depth) { - return ((cv_depth == CV_64F) || (cv_depth == CV_32F)); -} - -inline bool check_if_cv_Mat_is_float_type(const cv::Mat& image) { - return is_float(image.depth()); -} - -inline bool check_if_cv_Mat_has_same_shape(const cv::Mat& image1, const cv::Mat& image2) { - return ((image1.cols == image2.cols) && - (image1.rows == image2.rows) && - (image1.channels() == image2.channels())); -} - -template -static double depth_norm_factor() { - return depth_normalization::factor(); -} - -template -static double depth_norm_inverse_factor() { - return depth_normalization::inverse_factor(); -} - -/// Return the factor for unit scaling with the type indicated by the OpenCV depth -double get_depth_normalizing_factor(const int cv_depth); -/// Return the factor to inverse the unit scaling -double get_depth_denormalizing_factor(const int cv_depth); - -/// returns the number of bytes that would be used for the image without compresstion and any header -inline size_t image_data_amount(const cv::Mat& img) { - return static_cast(CV_ELEM_SIZE(img.depth())* - CV_MAT_CN(img.type())* - img.cols*img.rows); -} - -} // end of namespace lbann - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_1PARAM(_SW_CH_,_T_,_FUNC_,_P1_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_); \ - case 2: return _FUNC_<_T_,2>(_P1_); \ - case 3: return _FUNC_<_T_,3>(_P1_); \ - case 4: return _FUNC_<_T_,4>(_P1_); \ - } - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_2PARAMS(_SW_CH_,_T_,_FUNC_,_P1_,_P2_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_,_P2_); \ - case 2: return _FUNC_<_T_,2>(_P1_,_P2_); \ - case 3: return _FUNC_<_T_,3>(_P1_,_P2_); \ - case 4: return _FUNC_<_T_,4>(_P1_,_P2_); \ - } - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_3PARAMS(_SW_CH_,_T_,_FUNC_,_P1_,_P2_,_P3_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_,_P2_,_P3_); \ - case 2: return _FUNC_<_T_,2>(_P1_,_P2_,_P3_); \ - case 3: return _FUNC_<_T_,3>(_P1_,_P2_,_P3_); \ - case 4: return _FUNC_<_T_,4>(_P1_,_P2_,_P3_); \ - } - -#define _SWITCH_CV_FUNC_KNOWN_TYPE_4PARAMS(_SW_CH_,_T_,_FUNC_,_P1_,_P2_,_P3_,_P4_) \ - switch (_SW_CH_) { \ - case 1: return _FUNC_<_T_,1>(_P1_,_P2_,_P3_,_P4_); \ - case 2: return _FUNC_<_T_,2>(_P1_,_P2_,_P3_,_P4_); \ - case 3: return _FUNC_<_T_,3>(_P1_,_P2_,_P3_,_P4_); \ - case 4: return _FUNC_<_T_,4>(_P1_,_P2_,_P3_,_P4_); \ - } - -#define _SWITCH_CV_FUNC_1PARAM(_SW_D_,_FUNC_,_P1_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_); \ - } - -#define 
_SWITCH_CV_FUNC_2PARAMS(_SW_D_,_FUNC_,_P1_,_P2_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_,_P2_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_,_P2_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_,_P2_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_,_P2_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_,_P2_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_,_P2_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_,_P2_); \ - } - -#define _SWITCH_CV_FUNC_3PARAMS(_SW_D_,_FUNC_,_P1_,_P2_,_P3_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_,_P2_,_P3_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_,_P2_,_P3_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_,_P2_,_P3_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_,_P2_,_P3_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_,_P2_,_P3_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_,_P2_,_P3_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_,_P2_,_P3_); \ - } - -#define _SWITCH_CV_FUNC_4PARAMS(_SW_D_,_FUNC_,_P1_,_P2_,_P3_,_P4_) \ - switch (_SW_D_) { \ - case CV_8U : return _FUNC_<_depth_type(CV_8U) >(_P1_,_P2_,_P3_,_P4_); \ - case CV_8S : return _FUNC_<_depth_type(CV_8S) >(_P1_,_P2_,_P3_,_P4_); \ - case CV_16U: return _FUNC_<_depth_type(CV_16U)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_16S: return _FUNC_<_depth_type(CV_16S)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_32S: return _FUNC_<_depth_type(CV_32S)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_32F: return _FUNC_<_depth_type(CV_32F)>(_P1_,_P2_,_P3_,_P4_); \ - case CV_64F: return _FUNC_<_depth_type(CV_64F)>(_P1_,_P2_,_P3_,_P4_); \ - } - -#endif // _LBANN_OPENCV_EXTENSIONS_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/CMakeLists.txt b/include/lbann/data_readers/patchworks/CMakeLists.txt deleted file mode 100644 index d45491f93cd..00000000000 --- a/include/lbann/data_readers/patchworks/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -# Add the headers for this directory -set_full_path(THIS_DIR_HEADERS - patchworks.hpp - patchworks_ROI.hpp - patchworks_common.hpp - patchworks_patch_descriptor.hpp - patchworks_stats.hpp - ) - -# Propagate the files up the tree -set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/data_readers/patchworks/patchworks_ROI.hpp b/include/lbann/data_readers/patchworks/patchworks_ROI.hpp deleted file mode 100644 index 3abdfed5da6..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks_ROI.hpp +++ /dev/null @@ -1,153 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
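The _SWITCH_CV_FUNC_* macros above dispatch a function template over the runtime OpenCV depth code, while cv_image_type maps the other way, from a C++ type to an image type id. A minimal sketch of the intended use; the templated function here is illustrative:

// A template does the typed work over pixels of type T...
template <typename T>
bool sum_pixels(const cv::Mat& img) {
  double sum = 0.0;
  for (int r = 0; r < img.rows; ++r) {
    const T* row = img.ptr<T>(r);
    for (int c = 0; c < img.cols * img.channels(); ++c) { sum += row[c]; }
  }
  return sum >= 0.0;
}

// ...and the macro selects the instantiation from img.depth() at runtime.
bool sum_pixels_dispatch(const cv::Mat& img) {
  _SWITCH_CV_FUNC_1PARAM(img.depth(), sum_pixels, img)
  return false; // unsupported depth
}
// For the reverse mapping: cv_image_type<uint8_t>::T(3) yields CV_8UC3.
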
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks_ROI.hpp - LBANN PATCHWORKS ROI (region-of-interest) header -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS ROI header - * - Region of interest descriptor - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_ROI_H_INCLUDED_ -#define _PATCHWORKS_ROI_H_INCLUDED_ - -#include -#include -#include "patchworks_common.hpp" - -namespace lbann { -namespace patchworks { - -/** - * Regions of interest descriptor. - * Contains a pair of coordinates that defines a rectangular region of interest - */ -class ROI { - public: - /// An internal value to represent an uninitialized coordinate value - static const int undefined_coordinate; - - int m_left; ///< The left-most pixel position of the region - int m_top; ///< The top-most pixel position of the region - int m_right; ///< The right-most pixel position of the region - int m_bottom; ///< The bottom-most pixel position of the region - - ROI() ///< The default constructor - : m_left(undefined_coordinate), m_top(undefined_coordinate), - m_right(undefined_coordinate), m_bottom(undefined_coordinate) {} - - void init(); ///< Reset the structure with undefined coordinate values - bool is_undefined() const; ///< Tell if the structure has not been initialized - bool is_valid() const; ///< Check if the region is valid - bool set_overlapping_region(const cv::Mat& img); - /// Check if the region of interest covers the whole image - bool is_whole_image(const cv::Mat& img); - - /// Set a region by the coordinates - bool set_by_corners(const int p0_x, const int p0_y, - const int p1_x, const int p1_y); - /// Set a region by the center and its size - bool set_by_center(const int px, const int py, - const unsigned int _width, const unsigned int _height); - - /// move the region horizontally by dx and vertically by dy - void move(const std::pair displacement); - - /// Returns the left position of the region - int left() const { - return m_left; - } - /// Returns the top poisition of the region - int top() const { - return m_top; - } - /// Returns the right position of the region - int right() const { - return m_right; - } - /// Returns the bottom position of the region - int bottom() const { - return m_bottom; - } - - /// Returns a cv::Rect equivalent - cv::Rect rect() const { - return cv::Rect(m_left, m_top, m_right-m_left, m_bottom-m_top); - } - /// Returns the width of the rectangular region - int width() const { - return (m_right - m_left); - } - /// Returns the height of the rectangular region - int height() const { - return (m_bottom - m_top); - } - /// Returns the area of the rectangular region - int area() const { - return width()*height(); - } - /// Returns the size of the area (width, hegiht) - - std::ostream& Print(std::ostream& os) const { ///< Print out the content - return os << '(' << m_left << ", " << m_top << ") (" - << m_right << ", " << m_bottom << ')'; - } - - /// Check if this ROI is exactly the same as the given rectangular area - bool operator==(const ROI& rarea) const; - /// Check if this ROI is not exactly the same as the 
given rectangular area - bool operator!=(const ROI& rarea) const; - /// Check if the given rectangular region contains this ROI but is not the same - bool operator<(const ROI& rarea) const; - /// Check if the given rectangular region contains this ROI - bool operator<=(const ROI& rarea) const; - /// Check if this ROI contains the given rectangular region but is not the same - bool operator>(const ROI& rarea) const; - /// Check if this ROI contains the given rectangular region - bool operator>=(const ROI& rarea) const; -}; - -inline bool ROI::operator<=(const ROI& rarea) const { - return (((rarea.m_left <= m_left) && (rarea.m_top <= m_top)) && - ((m_right <= rarea.m_right) && (m_bottom <= rarea.m_bottom)) && - is_valid()); -} - -inline bool ROI::operator>=(const ROI& rarea) const { - return (((m_left <= rarea.m_left) && (m_top <= rarea.m_top)) && - ((rarea.m_right <= m_right) && (rarea.m_bottom <= m_bottom)) && - rarea.is_valid()); -} - -std::ostream& operator<<(std::ostream& os, const ROI& roi); - -} // end of namespace patchworks -} // end of namespace lbann -#endif // _PATCHWORKS_ROI_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp b/include/lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp deleted file mode 100644 index 2891055593c..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp +++ /dev/null @@ -1,186 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
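The ROI class being removed defines rectangles with containment-style comparison operators (operator<= tests whether this ROI lies inside the other). A minimal sketch of the API as declared above; the coordinates are illustrative, and the displacement pair is assumed to be std::pair<int, int>:

#include "lbann/data_readers/patchworks/patchworks_ROI.hpp"

void roi_example() {
  using lbann::patchworks::ROI;
  ROI outer, inner;
  outer.set_by_corners(0, 0, 256, 256);     // (left, top) and (right, bottom)
  inner.set_by_center(128, 128, 64u, 64u);  // 64x64 region centered at (128,128)
  const bool contained = (inner <= outer);  // true: outer covers inner
  inner.move(std::make_pair(8, -4));        // shift right by 8, up by 4
  (void)contained;
}
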
-// -// patchworks_patch_descriptor.hpp - LBANN PATCHWORKS header for patch descriptor -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS header for patch descriptor - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_PATCH_DESCRIPTOR_H_INCLUDED_ -#define _PATCHWORKS_PATCH_DESCRIPTOR_H_INCLUDED_ - -#include -#include -#include "patchworks_common.hpp" -#include "patchworks_ROI.hpp" - -namespace lbann { -namespace patchworks { - -class patch_descriptor { - public: - // --- configuration variables --- - unsigned int m_width; ///< patch width - unsigned int m_height; ///< patch height - unsigned int m_gap; ///< gap between patches - unsigned int m_jitter; ///< for patch position randomization - - /** patch centering mode - * 0: place the center patch anywhere within the image - * 1: place the center patch anywhere as long as it allows the space for all 8 neighboring patches - * other: place the center patch at the center of the image - */ - unsigned int m_mode_center; - - /** chromatic aberration correction mode - * 0: nothing - * 1: pixel transform px*B where a=[-1 2 -1] and B=I-a'a/(aa') - * 2: randomly replace two channels with white noise - */ - unsigned int m_mode_chrom; - - /// Whether patches are self-labeled - bool m_self_label; - - /// The file extension name (i.e., image type) - std::string m_ext; - - // --- post-configuration variables --- - ROI m_sample_area; ///< The area to sample patches from - /// The list of displacements used to generate consecutive patches - std::vector m_displacements; - - // --- state variables --- - ROI m_patch_center; ///< The center patch region - /// The actual patch positions - std::vector m_positions; - /// The index of displacement used to generate the current patch - unsigned int m_cur_patch_idx; - - public: - patch_descriptor() { - init(); ///< Default constructor - } - virtual ~patch_descriptor() {} - void init(); ///< Initializer - void reset(); ///< Clear state variables other than configuration variables - - /// Get patch size - unsigned int get_patch_width() const { return m_width; } - unsigned int get_patch_height() const { return m_height; } - - /// Set patch size - void set_size(const int w, const int h); - /// Set the gap between neighboring patches - void set_gap(const unsigned int g) { - m_gap = g; - } - /// Set poisiton radomization parameter, the maximum jitter - void set_jitter(const unsigned int j) { - m_jitter = j; - } - /// Set mode to place center patch - void set_mode_centering(const unsigned int m) { - m_mode_center = m; - } - /// Set correction mode for chromatic aberration - void set_mode_chromatic_aberration(const unsigned int m) { - m_mode_chrom = m; - } - - /// Declare the size of the image to take patches from, and implicitly set the area to sample as the entire image - bool set_sample_image(const unsigned int w, const unsigned int h); - /// Explicitly set the area to sample patches - bool set_sample_area(const ROI& area); - - /// Set the file extention of patch files - void set_file_ext(const std::string e) { - m_ext = e; - } - - /// Mark self labeling for patches - void set_self_label() { m_self_label = true; } - - /// Unmark self labeling - void unset_self_label() { m_self_label = false; } - - bool is_self_labeling() const { return m_self_label; } - - unsigned int get_num_labels() const { return 8u; } - - /// A function that populates the list of displacements from the base patch to the next one - virtual void define_patch_set(); 
- - /// transform each pixel by B = I - a'*a/(a*a') where a=[-1 2 -1] to mitigate chromatic aberration - bool is_to_correct_chromatic_aberration_at_pixel() const { - return (m_mode_chrom == 1); - } - - /// randomly drop two channels to avoid chromatic aberration impact - bool is_to_drop_2channels() const { - return (m_mode_chrom == 2); - } - - /// Allow read-only access to the patch displacements - const std::vector& get_displacements() const { - return m_displacements; - } - - virtual unsigned int get_num_patches() const { return 2u; } - - /// Compute the position of the first patch - virtual bool get_first_patch(ROI& patch); - /// Compute the position of a subsequent patch - virtual bool get_next_patch(ROI& patch); - /// extract all the patches defined - virtual bool extract_patches(const cv::Mat& img, std::vector& patches); - /** - * Return the label of the last patch generated. - * For dual patch scenarios, it is one less the id of the non-center patch position. - */ - virtual unsigned int get_last_label() const { return m_cur_patch_idx - 1; } - - /// Allow read-only access to the positions of the patches generated - const std::vector& access_positions() const { - return m_positions; - } - virtual std::string get_type() const { return "patch_descriptor"; } - virtual std::string get_description() const; - /// Print out the content of patch descriptor - virtual std::ostream& print(std::ostream& os) const; -}; - -/// stream out the patch descriptor content -std::ostream& operator<<(std::ostream& os, const patch_descriptor& pd); - -} // end of namespace patchworks -} // end of namespace lbann -#endif // _PATCHWORKS_PATCH_DESCRIPTOR_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/data_readers/patchworks/patchworks_stats.hpp b/include/lbann/data_readers/patchworks/patchworks_stats.hpp deleted file mode 100644 index 12141012eef..00000000000 --- a/include/lbann/data_readers/patchworks/patchworks_stats.hpp +++ /dev/null @@ -1,93 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
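Taken together, the patch_descriptor API being removed implements a configure/define/extract flow. A minimal sketch of that calling sequence, inferred from the declarations above; the sizes and image are illustrative, and the patch vector's element type is assumed to be cv::Mat:

#include "lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp"

void extract_patches_example(const cv::Mat& img) {
  using namespace lbann::patchworks;
  patch_descriptor pd;
  pd.set_size(96, 96);   // patch width and height
  pd.set_gap(16);        // spacing between neighboring patches
  pd.set_jitter(8);      // maximum positional randomization
  pd.set_sample_image(static_cast<unsigned>(img.cols),
                      static_cast<unsigned>(img.rows));
  pd.define_patch_set(); // populate the displacement list
  std::vector<cv::Mat> patches;
  if (pd.extract_patches(img, patches)) {
    const unsigned label = pd.get_last_label(); // self-label for this patch set
    (void)label;
  }
}
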
-// -// patchworks_stats.hpp - LBANN PATCHWORKS header for pixel statistics -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS header for pixel statistics - */ - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_STATS_INCLUDED_ -#define _PATCHWORKS_STATS_INCLUDED_ - -#include -#include -#include "patchworks_common.hpp" - -namespace lbann { -namespace patchworks { - -/// Pixel statistics of an image -struct image_stats { - size_t cnt; ///< number of values (pixels) - size_t cntZeros; ///< number of zero values - pw_fp_t min; ///< minimum intensity of a pixel - pw_fp_t max; ///< maximum intensity of a pixel - pw_fp_t median; ///< median intensity of a pixel - pw_fp_t minNZ; ///< number of non-zero pixels - pw_fp_t medianNZ; ///< median among non-zero values - double avg; ///< average intensity - double avgNZ; ///< average intensity among non-zeros - double stdev; ///< standard deviation of intensity - double stdevNZ; ///< standard deviation among non-zero values - - /// Print out statistics - std::ostream& Print(std::ostream& os) const { - os << " stats:" << std::endl - << " - cnt : " << cnt << std::endl - << " - cnt0 : " << cntZeros << std::endl - << " - min : " << min << std::endl - << " - max : " << max << std::endl - << " - med : " << median << std::endl - << " - minNZ : " << minNZ << std::endl - << " - medNZ : " << medianNZ << std::endl - << " - avg : " << avg << std::endl - << " - avgNZ : " << avgNZ << std::endl - << " - std : " << stdev << std::endl - << " - stdNZ : " << stdevNZ << std::endl; - return os; - } -}; - -/// Stream out the image statistics -inline std::ostream& operator<<(std::ostream& os, const image_stats& stats) { - return stats.Print(os); -} - -/// Compute the pixel statistics for a mono channel image -bool get_single_channel_stats(const cv::Mat& img, image_stats& stats); - -/// Compute the pixel statistics of an image per channel -bool get_channel_stats(const cv::Mat& img, std::vector& stats); - - -} // end of namespace patchworks -} // end of namespace lbann -#endif // _PATCHWORKS_STATS_INCLUDED_ -#endif // LBANN_HAS_OPENCV diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 3a5bd27ef03..52018264b2b 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -105,11 +105,9 @@ /// Data readers #include "lbann/data_readers/data_reader_imagenet.hpp" -#include "lbann/data_readers/data_reader_imagenet_patches.hpp" #include "lbann/data_readers/data_reader_cifar10.hpp" #include "lbann/data_readers/data_reader_mnist.hpp" #include "lbann/data_readers/data_reader_multi_images.hpp" -#include "lbann/data_readers/data_reader_mnist_siamese.hpp" #include "lbann/data_readers/data_reader_multihead_siamese.hpp" #include "lbann/data_readers/data_reader_synthetic.hpp" #include "lbann/data_readers/data_reader_jag.hpp" diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index ca68f30975d..f9d93f07419 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -29,6 +29,8 @@ #include "lbann/proto/proto_common.hpp" #include "lbann/data_readers/data_reader.hpp" +#include "lbann/transforms/transform.hpp" +#include "lbann/transforms/transform_pipeline.hpp" namespace lbann { namespace proto { @@ -80,6 +82,13 @@ optimizer* construct_optimizer(lbann_comm* comm, /** Construct an objective function specified with prototext. 
*/ objective_function* construct_objective_function(const lbann_data::ObjectiveFunction& proto_obj); +/** Construct a transform given a prototext. */ +std::unique_ptr<transform::transform> construct_transform( + const lbann_data::Transform& trans); +/** Construct a transform pipeline given a data reader prototext. */ +transform::transform_pipeline construct_transform_pipeline( + const lbann_data::Reader& data_reader); + /** Parse a space-separated list. */ template <typename T> std::vector<T> parse_list(std::string str) { diff --git a/include/lbann/proto/init_image_data_readers.hpp b/include/lbann/proto/init_image_data_readers.hpp index f35a5797e2b..94c782531f3 100644 --- a/include/lbann/proto/init_image_data_readers.hpp +++ b/include/lbann/proto/init_image_data_readers.hpp @@ -32,7 +32,6 @@ namespace lbann { extern void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_data::DataSetMetaData& pb_metadata, const bool master, generic_data_reader* &reader); -extern void init_generic_preprocessor(const lbann_data::Reader& pb_readme, const bool master, generic_data_reader* reader); extern void init_org_image_data_reader(const lbann_data::Reader& pb_readme, const bool master, generic_data_reader* &reader); } // namespace lbann diff --git a/include/lbann/transforms/CMakeLists.txt b/include/lbann/transforms/CMakeLists.txt new file mode 100644 index 00000000000..73511e8f331 --- /dev/null +++ b/include/lbann/transforms/CMakeLists.txt @@ -0,0 +1,15 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + normalize.hpp + repack_HWC_to_CHW_layout.hpp + sample_normalize.hpp + scale.hpp + scale_and_translate.hpp + transform.hpp + transform_pipeline.hpp + ) + +add_subdirectory(vision) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/transforms/normalize.hpp b/include/lbann/transforms/normalize.hpp new file mode 100644 index 00000000000..259d3d3ec12 --- /dev/null +++ b/include/lbann/transforms/normalize.hpp @@ -0,0 +1,72 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_NORMALIZE_HPP_INCLUDED +#define LBANN_TRANSFORMS_NORMALIZE_HPP_INCLUDED + +#include <vector> +#include "lbann/utils/exception.hpp" +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** + * Normalize with mean and standard deviation. + * This is done channel-wise for images.
If the input does not have channels,
+ * (e.g. it is not an image), it is treated as having one "channel".
+ * This is only applicable after conversion to an LBANN CPUMat.
+ */
+class normalize : public transform {
+public:
+  /** Apply channel-wise means and standard deviations. */
+  normalize(std::vector means, std::vector stds) :
+    transform(), m_means(means), m_stds(stds) {
+    if (m_means.size() != m_stds.size()) {
+      LBANN_ERROR("Normalize mean and std have different numbers of channels.");
+    }
+  }
+
+  transform* copy() const override { return new normalize(*this); }
+
+  std::string get_type() const override { return "normalize"; }
+
+  bool supports_non_inplace() const { return true; }
+
+  void apply(utils::type_erased_matrix& data, std::vector& dims) override;
+  void apply(utils::type_erased_matrix& data, CPUMat& out,
+             std::vector& dims) override;
+private:
+  /** Channel-wise means. */
+  std::vector m_means;
+  /** Channel-wise standard deviations. */
+  std::vector m_stds;
+};
+
+} // namespace transform
+} // namespace lbann
+
+#endif // LBANN_TRANSFORMS_NORMALIZE_HPP_INCLUDED
diff --git a/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp b/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp
new file mode 100644
index 00000000000..ea74b6c29fa
--- /dev/null
+++ b/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp
@@ -0,0 +1,56 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_TRANSFORMS_REPACK_HWC_TO_CHW_LAYOUT_HPP_INCLUDED
+#define LBANN_TRANSFORMS_REPACK_HWC_TO_CHW_LAYOUT_HPP_INCLUDED
+
+#include "lbann/transforms/transform.hpp"
+
+namespace lbann {
+namespace transform {
+
+/**
+ * Convert data to LBANN's native data layout.
+ * Currently only supports converting from an interleaved channel format.
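+ *
+ * As a sketch of the intended index mapping (C channels, H rows, W columns,
+ * with dst and src as hypothetical raw buffers), the planar CHW output
+ * element (c, h, w) is read from interleaved HWC input index (h*W + w)*C + c:
+ * @code
+ * for (size_t c = 0; c < C; ++c)
+ *   for (size_t h = 0; h < H; ++h)
+ *     for (size_t w = 0; w < W; ++w)
+ *       dst[(c*H + h)*W + w] = src[(h*W + w)*C + c];
+ * @endcode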
+ */ +class repack_HWC_to_CHW_layout : public transform { +public: + transform* copy() const override { return new repack_HWC_to_CHW_layout(*this); } + + std::string get_type() const override { return "to_lbann_layout"; } + + bool supports_non_inplace() const { return true; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + + void apply(utils::type_erased_matrix& data, CPUMat& out, + std::vector& dims) override; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_REPACK_HWC_TO_CHW_LAYOUT_HPP_INCLUDED diff --git a/include/lbann/data_readers/patchworks/patchworks.hpp b/include/lbann/transforms/sample_normalize.hpp similarity index 60% rename from include/lbann/data_readers/patchworks/patchworks.hpp rename to include/lbann/transforms/sample_normalize.hpp index d445bb2d343..6bd3203ec69 100644 --- a/include/lbann/data_readers/patchworks/patchworks.hpp +++ b/include/lbann/transforms/sample_normalize.hpp @@ -22,38 +22,30 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or // implied. See the License for the specific language governing // permissions and limitations under the license. -// -// patchworks.hpp - LBANN PATCHWORKS main interface header //////////////////////////////////////////////////////////////////////////////// -/** - * LBANN PATCHWORKS main interface header - * - includes the main interface function declarations - */ - -#include "lbann_config.hpp" +#ifndef LBANN_TRANSFORMS_SAMPLE_NORMALIZE_HPP_INCLUDED +#define LBANN_TRANSFORMS_SAMPLE_NORMALIZE_HPP_INCLUDED -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_H_INCLUDED_ -#define _PATCHWORKS_H_INCLUDED_ -#include -#include "patchworks_common.hpp" -#include "patchworks_patch_descriptor.hpp" +#include "lbann/transforms/transform.hpp" namespace lbann { -namespace patchworks { +namespace transform { -/// Compute the min and max value of pixels -std::pair check_min_max(const cv::Mat& _img); +/** + * Normalize to have mean 0, standard deviation 1. + * This only works after conversion to an LBANN CPUMat. + */ +class sample_normalize : public transform { +public: + transform* copy() const override { return new sample_normalize(*this); } -/// Adjust for reducing chromatic aberration -cv::Mat correct_chromatic_aberration(const cv::Mat& _img); + std::string get_type() const override { return "sample_normalize"; } -/// Drop 2 channels randomly -cv::Mat drop_2channels(const cv::Mat& _img); + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +}; -} // end of namespace patchworks -} // end of namespace lbann +} // namespace transform +} // namespace lbann -#endif //_PATCHWORKS_H_INCLUDED_ -#endif // LBANN_HAS_OPENCV +#endif // LBANN_TRANSFORMS_SAMPLE_NORMALIZE_HPP_INCLUDED diff --git a/include/lbann/transforms/scale.hpp b/include/lbann/transforms/scale.hpp new file mode 100644 index 00000000000..0d8218852c4 --- /dev/null +++ b/include/lbann/transforms/scale.hpp @@ -0,0 +1,54 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_TRANSFORMS_SCALE_HPP_INCLUDED
+#define LBANN_TRANSFORMS_SCALE_HPP_INCLUDED
+
+#include "lbann/transforms/transform.hpp"
+
+namespace lbann {
+namespace transform {
+
+/** Scale data by a constant. */
+class scale : public transform {
+public:
+  /** Scale all data by scale_val. */
+  scale(float scale_val) : transform(), m_scale(scale_val) {}
+
+  transform* copy() const override { return new scale(*this); }
+
+  std::string get_type() const override { return "scale"; }
+
+  void apply(utils::type_erased_matrix& data, std::vector& dims) override;
+private:
+  /** Amount to scale data by. */
+  float m_scale;
+};
+
+} // namespace transform
+} // namespace lbann
+
+#endif // LBANN_TRANSFORMS_SCALE_HPP_INCLUDED
diff --git a/include/lbann/transforms/scale_and_translate.hpp b/include/lbann/transforms/scale_and_translate.hpp
new file mode 100644
index 00000000000..42821168b33
--- /dev/null
+++ b/include/lbann/transforms/scale_and_translate.hpp
@@ -0,0 +1,57 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_TRANSFORMS_SCALE_AND_TRANSLATE_HPP_INCLUDED
+#define LBANN_TRANSFORMS_SCALE_AND_TRANSLATE_HPP_INCLUDED
+
+#include "lbann/transforms/transform.hpp"
+
+namespace lbann {
+namespace transform {
+
+/** Scale and translate data by a pair of constants. */
+class scale_and_translate : public transform {
+public:
+  /** Scale all data by scale_val, then translate by translate_val.
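+   * Conceptually, each element x becomes x * scale_val + translate_val
+   * (scale first, then translate). With hypothetical values:
+   * @code
+   * scale_and_translate st(2.0f, 0.5f);  // x -> 2*x + 0.5
+   * @endcode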
*/
+  scale_and_translate(float scale_val, float translate_val)
+    : transform(), m_scale(scale_val), m_translate(translate_val) {}
+
+  transform* copy() const override { return new scale_and_translate(*this); }
+
+  std::string get_type() const override { return "scale_and_translate"; }
+
+  void apply(utils::type_erased_matrix& data, std::vector& dims) override;
+private:
+  /** Amount to scale data by. */
+  float m_scale;
+  /** Amount to translate data by. */
+  float m_translate;
+};
+
+} // namespace transform
+} // namespace lbann
+
+#endif // LBANN_TRANSFORMS_SCALE_AND_TRANSLATE_HPP_INCLUDED
diff --git a/include/lbann/transforms/transform.hpp b/include/lbann/transforms/transform.hpp
new file mode 100644
index 00000000000..f7a9f3e73a4
--- /dev/null
+++ b/include/lbann/transforms/transform.hpp
@@ -0,0 +1,112 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_TRANSFORMS_TRANSFORM_HPP_INCLUDED
+#define LBANN_TRANSFORMS_TRANSFORM_HPP_INCLUDED
+
+#include "lbann/base.hpp"
+#include "lbann/utils/description.hpp"
+#include "lbann/utils/random.hpp"
+#include "lbann/utils/type_erased_matrix.hpp"
+#include "lbann/utils/exception.hpp"
+
+namespace lbann {
+namespace transform {
+
+/**
+ * Abstract base class for transforms on data.
+ *
+ * A transform takes a CPUMat and modifies it in-place. Transforms should
+ * be thread-safe, as one instance of a transform may be called concurrently
+ * within multiple threads.
+ *
+ * Because transforms may switch between underlying data types throughout the
+ * pipeline, everything is done in terms of a type_erased_matrix, which can
+ * swap between underlying data types.
+ */
+class transform {
+public:
+  transform() = default;
+  transform(const transform&) = default;
+  transform& operator=(const transform&) = default;
+  virtual ~transform() = default;
+
+  /** Create a copy of the transform instance. */
+  virtual transform* copy() const = 0;
+
+  /** Human-readable type name. */
+  virtual std::string get_type() const = 0;
+  /** Human-readable description. */
+  virtual description get_description() const {
+    return description(get_type() + " transform");
+  }
+
+  /** True if the transform supports non-in-place apply. */
+  virtual bool supports_non_inplace() const {
+    return false;
+  }
+
+  /**
+   * Apply the transform to data.
+   * @param data The input data to transform, which is modified in-place. The
+   * matrix should be contiguous.
+ * @param dims The dimensions of the data tensor. For "plain data", dims + * should have one entry, giving its size. For images, dims should have three + * entries: channels, height, width. + * @note dims is a hack until we have proper tensors. + */ + virtual void apply(utils::type_erased_matrix& data, std::vector& dims) = 0; + + /** + * Apply the transform to data. + * This does not modify data in-place but places its output in out. + */ + virtual void apply(utils::type_erased_matrix& data, CPUMat& out, + std::vector& dims) { + LBANN_ERROR("Non-in-place apply not implemented."); + } +protected: + /** Return a value uniformly at random in [a, b). */ + static inline float get_uniform_random(float a, float b) { + fast_rng_gen& gen = get_fast_io_generator(); + std::uniform_real_distribution dist(a, b); + return dist(gen); + } + /** Return true with probability p. */ + static inline bool get_bool_random(float p) { + return get_uniform_random(0.0, 1.0) < p; + } + /** Return an integer uniformly at random in [a, b). */ + static inline El::Int get_uniform_random_int(El::Int a, El::Int b) { + fast_rng_gen& gen = get_fast_io_generator(); + return fast_rand_int(gen, b - a) + a; + } +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_TRANSFORM_HPP_INCLUDED diff --git a/include/lbann/transforms/transform_pipeline.hpp b/include/lbann/transforms/transform_pipeline.hpp new file mode 100644 index 00000000000..aae1b8a545b --- /dev/null +++ b/include/lbann/transforms/transform_pipeline.hpp @@ -0,0 +1,95 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_TRANSFORM_PIPELINE_HPP_INCLUDED +#define LBANN_TRANSFORMS_TRANSFORM_PIPELINE_HPP_INCLUDED + +#include "lbann/base.hpp" +#include "lbann/utils/description.hpp" +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** + * Applies a sequence of transforms to input data. + */ +class transform_pipeline { +public: + transform_pipeline() {} + transform_pipeline(const transform_pipeline&); + transform_pipeline(transform_pipeline&&) = default; + transform_pipeline& operator=(const transform_pipeline&); + transform_pipeline& operator=(transform_pipeline&&) = default; + ~transform_pipeline() {} + + transform_pipeline* copy() const { return new transform_pipeline(*this); } + + /** + * Add trans as the next transform to apply. 
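+   *
+   * A minimal usage sketch (the transform choices here are illustrative):
+   * @code
+   * transform_pipeline p;
+   * p.add_transform(std::make_unique<resize>(256, 256));
+   * p.add_transform(std::make_unique<center_crop>(224, 224));
+   * p.set_expected_out_dims({3, 224, 224});
+   * @endcode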
+ */ + void add_transform(std::unique_ptr trans) { + m_transforms.push_back(std::move(trans)); + } + + /** + * Set the expected dimensions of the data after applying the transforms. + * This is primarily meant as a debugging aid/sanity check. + */ + void set_expected_out_dims(std::vector expected_out_dims) { + m_expected_out_dims = expected_out_dims; + } + + /** + * Apply the transforms to data. + * @param data The data to transform. data will be modified in-place. + * @param dims Dimensions of data. Will be modified in-place. + */ + void apply(utils::type_erased_matrix& data, std::vector& dims); + /** Apply to CPUMat data, which will be modified in-place. */ + void apply(CPUMat& data, std::vector& dims); + /** + * Apply the transforms to data. + * @param data The data to transform. Will be modified in-place. + * @param out_data Output will be placed here. It will not be reallocated. + * @param dims Dimensions of data. Will be modified in-place. + */ + void apply(El::Matrix& data, CPUMat& out_data, + std::vector& dims); +private: + /** Ordered list of transforms to apply. */ + std::vector> m_transforms; + /** Expected dimensions after applying all transforms. */ + std::vector m_expected_out_dims; + + /** Assert dims matches expected_out_dims (if set). */ + void assert_expected_out_dims(const std::vector& dims); +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_TRANSFORM_PIPELINE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/CMakeLists.txt b/include/lbann/transforms/vision/CMakeLists.txt new file mode 100644 index 00000000000..fa6ee2aff49 --- /dev/null +++ b/include/lbann/transforms/vision/CMakeLists.txt @@ -0,0 +1,19 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + center_crop.hpp + colorize.hpp + grayscale.hpp + horizontal_flip.hpp + normalize_to_lbann_layout.hpp + random_affine.hpp + random_crop.hpp + random_resized_crop.hpp + random_resized_crop_with_fixed_aspect_ratio.hpp + resize.hpp + resized_center_crop.hpp + to_lbann_layout.hpp + vertical_flip.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/src/data_readers/cv_transform.cpp b/include/lbann/transforms/vision/center_crop.hpp similarity index 59% rename from src/data_readers/cv_transform.cpp rename to include/lbann/transforms/vision/center_crop.hpp index 1a0774b2813..e7d512c6f3e 100644 --- a/src/data_readers/cv_transform.cpp +++ b/include/lbann/transforms/vision/center_crop.hpp @@ -22,33 +22,33 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or // implied. See the License for the specific language governing // permissions and limitations under the license. -// -// cv_transform .cpp .hpp - base class for the transformation -// on image data in opencv format //////////////////////////////////////////////////////////////////////////////// -#include "lbann/data_readers/cv_transform.hpp" +#ifndef LBANN_TRANSFORMS_CENTER_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_CENTER_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" -#ifdef LBANN_HAS_OPENCV namespace lbann { +namespace transform { -const constexpr char* const cv_transform::cv_flip_desc[]; +/** Crop an image at the center. */ +class center_crop : public transform { +public: + /** Crop to an h x w image. */ + center_crop(size_t h, size_t w) : transform(), m_h(h), m_w(w) {} -/** The mathematical constant (this is the way to get it in C++). 
*/ -const float cv_transform::pi = static_cast(std::acos(-1)); + transform* copy() const override { return new center_crop(*this); } -double get_depth_normalizing_factor(const int cv_depth) { - _SWITCH_CV_FUNC_1PARAM(cv_depth, depth_norm_factor, ); - // The caller must check the exception by detecting 0.0 - return 0.0; -} + std::string get_type() const override { return "center_crop"; } -double get_depth_denormalizing_factor(const int cv_depth) { - _SWITCH_CV_FUNC_1PARAM(cv_depth, depth_norm_inverse_factor, ); - // The caller must check the exception by detecting 0.0 - return 0.0; -} + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the crop. */ + size_t m_h, m_w; +}; -} // end of namespace lbann +} // namespace transform +} // namespace lbann -#endif // LBANN_HAS_OPENCV +#endif // LBANN_TRANSFORMS_CENTER_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/colorize.hpp b/include/lbann/transforms/vision/colorize.hpp new file mode 100644 index 00000000000..f5f444eacec --- /dev/null +++ b/include/lbann/transforms/vision/colorize.hpp @@ -0,0 +1,48 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_COLORIZE_HPP_INCLUDED +#define LBANN_TRANSFORMS_COLORIZE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** Convert an image from grayscale to color. */ +class colorize : public transform { +public: + transform* copy() const override { return new colorize(*this); } + + std::string get_type() const override { return "colorize"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_COLORIZE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/grayscale.hpp b/include/lbann/transforms/vision/grayscale.hpp new file mode 100644 index 00000000000..b185344eff6 --- /dev/null +++ b/include/lbann/transforms/vision/grayscale.hpp @@ -0,0 +1,48 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_GRAYSCALE_HPP_INCLUDED +#define LBANN_TRANSFORMS_GRAYSCALE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** Convert an image to grayscale. */ +class grayscale : public transform { +public: + transform* copy() const override { return new grayscale(*this); } + + std::string get_type() const override { return "grayscale"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_GRAYSCALE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/horizontal_flip.hpp b/include/lbann/transforms/vision/horizontal_flip.hpp new file mode 100644 index 00000000000..63ea133d25d --- /dev/null +++ b/include/lbann/transforms/vision/horizontal_flip.hpp @@ -0,0 +1,55 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_HORIZONTAL_FLIP_HPP_INCLUDED +#define LBANN_TRANSFORMS_HORIZONTAL_FLIP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** Horizontally flip image data with given probability. */ +class horizontal_flip : public transform { +public: + /** Flip image with probability p. 
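+   * For example, p = 0.5f flips roughly half of the samples on average,
+   * while p = 0.0f never flips and p = 1.0f always flips.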
*/
+  horizontal_flip(float p) : transform(), m_p(p) {}
+
+  transform* copy() const override { return new horizontal_flip(*this); }
+
+  std::string get_type() const override { return "horizontal_flip"; }
+
+  void apply(utils::type_erased_matrix& data, std::vector& dims) override;
+
+private:
+  /** Probability that the image is flipped. */
+  float m_p;
+};
+
+} // namespace transform
+} // namespace lbann
+
+#endif // LBANN_TRANSFORMS_HORIZONTAL_FLIP_HPP_INCLUDED
diff --git a/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp b/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp
new file mode 100644
index 00000000000..2f01d08fbe3
--- /dev/null
+++ b/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp
@@ -0,0 +1,73 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_TRANSFORMS_NORMALIZE_TO_LBANN_LAYOUT_HPP_INCLUDED
+#define LBANN_TRANSFORMS_NORMALIZE_TO_LBANN_LAYOUT_HPP_INCLUDED
+
+#include "lbann/transforms/transform.hpp"
+
+namespace lbann {
+namespace transform {
+
+/**
+ * Normalize and convert data to LBANN's native data layout.
+ * Currently only supports converting from OpenCV layouts.
+ * This normalizes with provided channel-wise means and standard deviations,
+ * scales from [0, 255] to [0, 1], and converts to LBANN's data layout.
+ * Normalization is applied after the scaling to [0, 1].
+ * This essentially fuses the to_lbann_layout and normalize transforms.
+ */
+class normalize_to_lbann_layout : public transform {
+public:
+  /** Apply channel-wise means and standard deviations. */
+  normalize_to_lbann_layout(std::vector means, std::vector stds) :
+    transform(), m_means(means), m_stds(stds) {
+    if (m_means.size() != m_stds.size()) {
+      LBANN_ERROR("Normalize mean and std have different numbers of channels.");
+    }
+  }
+
+  transform* copy() const override { return new normalize_to_lbann_layout(*this); }
+
+  std::string get_type() const override { return "normalize_to_lbann_layout"; }
+
+  bool supports_non_inplace() const { return true; }
+
+  void apply(utils::type_erased_matrix& data, std::vector& dims) override;
+
+  void apply(utils::type_erased_matrix& data, CPUMat& out,
+             std::vector& dims) override;
+private:
+  /** Channel-wise means. */
+  std::vector m_means;
+  /** Channel-wise standard deviations.
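+   * Per the class-level description, each value is mapped roughly as
+   * out = (in / 255 - mean[c]) / std[c] for its channel c.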
*/ + std::vector m_stds; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_NORMALIZE_TO_LBANN_LAYOUT_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_affine.hpp b/include/lbann/transforms/vision/random_affine.hpp new file mode 100644 index 00000000000..a54a392cd4c --- /dev/null +++ b/include/lbann/transforms/vision/random_affine.hpp @@ -0,0 +1,77 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_AFFINE_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_AFFINE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** Apply a random affine transform to an image. */ +class random_affine : public transform { +public: + /** + * Set up the affine transform. + * Rotate a random number of degrees selected in [rotate_min, rotate_max]. + * Translate the vertical dimension in a random amount in [-h*translate_h, + * h*translate_h], and the horizontal dimension in [-w*translate_w, + * w*translate_w]. + * Scale by a random amount in [scale_min, scale_max]. + * Shear by a random number of degrees in [shear_min, shear_max]. + * Set arguments to 0 to disable that transform. + */ + random_affine(float rotate_min, float rotate_max, + float translate_h, float translate_w, + float scale_min, float scale_max, + float shear_min, float shear_max) : + transform(), + m_rotate_min(rotate_min), m_rotate_max(rotate_max), + m_translate_h(translate_h), m_translate_w(translate_w), + m_scale_min(scale_min), m_scale_max(scale_max), + m_shear_min(shear_min), m_shear_max(shear_max) {} + + transform* copy() const override { return new random_affine(*this); } + + std::string get_type() const override { return "random_affine"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Range in degrees to rotate. */ + float m_rotate_min, m_rotate_max; + /** Fraction of height/width to translate. */ + float m_translate_h, m_translate_w; + /** Range for fraction to scale by. */ + float m_scale_min, m_scale_max; + /** Range for degrees to shear. 
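+   * As an illustration of the enclosing class (hypothetical parameter
+   * choices; a 0 disables that component, per the constructor notes):
+   * @code
+   * random_affine aff(-10.0f, 10.0f,  // rotate in [-10, 10] degrees
+   *                   0.1f, 0.1f,     // translate up to 10% of h and w
+   *                   0.9f, 1.1f,     // scale in [0.9, 1.1]
+   *                   0.0f, 0.0f);    // shear disabled
+   * @endcode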
*/ + float m_shear_min, m_shear_max; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_AFFINED_CENTER_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_crop.hpp b/include/lbann/transforms/vision/random_crop.hpp new file mode 100644 index 00000000000..43f3c003a91 --- /dev/null +++ b/include/lbann/transforms/vision/random_crop.hpp @@ -0,0 +1,54 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** Crop an image at a random location. */ +class random_crop : public transform { +public: + /** Crop to an h x w image. */ + random_crop(size_t h, size_t w) : transform(), m_h(h), m_w(w) {} + + transform* copy() const override { return new random_crop(*this); } + + std::string get_type() const override { return "random_crop"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the crop. */ + size_t m_h, m_w; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_resized_crop.hpp b/include/lbann/transforms/vision/random_resized_crop.hpp new file mode 100644 index 00000000000..261adda9a31 --- /dev/null +++ b/include/lbann/transforms/vision/random_resized_crop.hpp @@ -0,0 +1,75 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** + * Extract a crop of random size and aspect ratio, then crop to a size. + * This is commonly used for Inception-style networks and some other + * image classification networks. + */ +class random_resized_crop : public transform { +public: + /** + * Crop to a random size and aspect ratio, then resize to h x w. + * The random crop has area in [scale_min, scale_max] of the original image + * area, and aspect ratio in [ar_min, ar_max] of the original. This random + * crop is then resized to be h x w. + * These default to (0.08, 1.0) and (3/4, 4/3), respectively, which are the + * standard. + */ + random_resized_crop(size_t h, size_t w, + float scale_min=0.08, float scale_max=1.0, + float ar_min=0.75, float ar_max=4.0f/3.0f) : + transform(), + m_h(h), m_w(w), + m_scale_min(scale_min), m_scale_max(scale_max), + m_ar_min(ar_min), m_ar_max(ar_max) {} + + transform* copy() const override { return new random_resized_crop(*this); } + + std::string get_type() const override { return "random_resized_crop"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the final crop. */ + size_t m_h, m_w; + /** Range for the area of the random crop. */ + float m_scale_min, m_scale_max; + /** Range for the aspect ratio of the random crop. */ + float m_ar_min, m_ar_max; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp b/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp new file mode 100644 index 00000000000..95de08165e9 --- /dev/null +++ b/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp @@ -0,0 +1,62 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. 
See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_WITH_FIXED_ASPECT_RATIO_HPP_INCLUDED +#define LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_WITH_FIXED_ASPECT_RATIO_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** Resize an image then extract a random crop. */ +class random_resized_crop_with_fixed_aspect_ratio : public transform { +public: + /** Resize to h x w, then extract a random crop_h x crop_w crop. */ + random_resized_crop_with_fixed_aspect_ratio( + size_t h, size_t w, size_t crop_h, size_t crop_w) : + transform(), m_h(h), m_w(w), m_crop_h(crop_h), m_crop_w(crop_w) {} + + transform* copy() const override { + return new random_resized_crop_with_fixed_aspect_ratio(*this); + } + + std::string get_type() const override { + return "random_resized_crop_with_fixed_aspect_ratio"; + } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the resized image. */ + size_t m_h, m_w; + /** Height and width of the crop. */ + size_t m_crop_h, m_crop_w; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RANDOM_RESIZED_CROP_WITH_FIXED_ASPECT_RATIO_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/resize.hpp b/include/lbann/transforms/vision/resize.hpp new file mode 100644 index 00000000000..244c3547df5 --- /dev/null +++ b/include/lbann/transforms/vision/resize.hpp @@ -0,0 +1,54 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RESIZE_HPP_INCLUDED +#define LBANN_TRANSFORMS_RESIZE_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** Resize an image. */ +class resize : public transform { +public: + /** Resize to h x w. */ + resize(size_t h, size_t w) : transform(), m_h(h), m_w(w) {} + + transform* copy() const override { return new resize(*this); } + + std::string get_type() const override { return "resize"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the resized image. 
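+   * Note that h and w are applied independently, so resizing does not in
+   * general preserve the input's aspect ratio.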
*/ + size_t m_h, m_w; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RESIZE_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/resized_center_crop.hpp b/include/lbann/transforms/vision/resized_center_crop.hpp new file mode 100644 index 00000000000..81eead713b0 --- /dev/null +++ b/include/lbann/transforms/vision/resized_center_crop.hpp @@ -0,0 +1,57 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_RESIZED_CENTER_CROP_HPP_INCLUDED +#define LBANN_TRANSFORMS_RESIZED_CENTER_CROP_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** Resize an image and then crop its center. */ +class resized_center_crop : public transform { +public: + /** Resize to h x w, then extract a crop_h x crop_w crop from the center. */ + resized_center_crop(size_t h, size_t w, size_t crop_h, size_t crop_w) : + transform(), m_h(h), m_w(w), m_crop_h(crop_h), m_crop_w(crop_w) {} + + transform* copy() const override { return new resized_center_crop(*this); } + + std::string get_type() const override { return "resized_center_crop"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; +private: + /** Height and width of the resized image. */ + size_t m_h, m_w; + /** Height and width of the crop. */ + size_t m_crop_h, m_crop_w; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_RESIZED_CENTER_CROP_HPP_INCLUDED diff --git a/include/lbann/data_readers/patchworks/patchworks_common.hpp b/include/lbann/transforms/vision/to_lbann_layout.hpp similarity index 54% rename from include/lbann/data_readers/patchworks/patchworks_common.hpp rename to include/lbann/transforms/vision/to_lbann_layout.hpp index 5c3b9ceb7d1..d342e39be0b 100644 --- a/include/lbann/data_readers/patchworks/patchworks_common.hpp +++ b/include/lbann/transforms/vision/to_lbann_layout.hpp @@ -22,49 +22,36 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or // implied. See the License for the specific language governing // permissions and limitations under the license. 
-// -// patchworks_common.hpp - LBANN PATCHWORKS header for common definitions //////////////////////////////////////////////////////////////////////////////// -/** - * LBANN PATCHWORKS common header - * - includes commonly used macros, definitions and declarations - */ +#ifndef LBANN_TRANSFORMS_TO_LBANN_LAYOUT_HPP_INCLUDED +#define LBANN_TRANSFORMS_TO_LBANN_LAYOUT_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" -#include "lbann_config.hpp" +namespace lbann { +namespace transform { -#ifdef LBANN_HAS_OPENCV -#ifndef _PATCHWORKS_COMMON_H_ -#define _PATCHWORKS_COMMON_H_ +/** + * Convert data to LBANN's native data layout. + * Currently only supports converting from OpenCV layouts. + * This will also rescale data from [0, 255] to [0, 1]. + */ +class to_lbann_layout : public transform { +public: + transform* copy() const override { return new to_lbann_layout(*this); } -#include // std::pair -#include -#include -#include -#include "lbann/data_readers/opencv_extensions.hpp" + std::string get_type() const override { return "to_lbann_layout"; } -namespace lbann { -namespace patchworks { + bool supports_non_inplace() const { return true; } -/// Patch displacement type -using displacement_type = std::pair; + void apply(utils::type_erased_matrix& data, std::vector& dims) override; -#if 0 -// using 32-bit floating point for intermediate image data processing -using pw_fp_t = float; -using pw_cv_vec3 = cv::Vec3f; -#define _PATCHWORKS_STAT_FLOAT_ 32 -#define _PW_CV_FP_ CV_32FC3 -#else -// using 64-bit floating point for intermediate image data processing -using pw_fp_t = double; -using pw_cv_vec3 = cv::Vec3d; -#define _PATCHWORKS_STAT_FLOAT_ 64 -#define _PW_CV_FP_ CV_64FC3 -#endif + void apply(utils::type_erased_matrix& data, CPUMat& out, + std::vector& dims) override; +}; -} // end of namespace patchworks -} // end of namespace lbann +} // namespace transform +} // namespace lbann -#endif // _PATCHWORKS_COMMON_H_ -#endif // LBANN_HAS_OPENCV +#endif // LBANN_TRANSFORMS_TO_LBANN_LAYOUT_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/vertical_flip.hpp b/include/lbann/transforms/vision/vertical_flip.hpp new file mode 100644 index 00000000000..9e02a7ea14b --- /dev/null +++ b/include/lbann/transforms/vision/vertical_flip.hpp @@ -0,0 +1,55 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_TRANSFORMS_VERTICAL_FLIP_HPP_INCLUDED
+#define LBANN_TRANSFORMS_VERTICAL_FLIP_HPP_INCLUDED
+
+#include "lbann/transforms/transform.hpp"
+
+namespace lbann {
+namespace transform {
+
+/** Vertically flip image data with given probability. */
+class vertical_flip : public transform {
+public:
+  /** Flip image with probability p. */
+  vertical_flip(float p) : transform(), m_p(p) {}
+
+  transform* copy() const override { return new vertical_flip(*this); }
+
+  std::string get_type() const override { return "vertical_flip"; }
+
+  void apply(utils::type_erased_matrix& data, std::vector& dims) override;
+
+private:
+  /** Probability that the image is flipped. */
+  float m_p;
+};
+
+} // namespace transform
+} // namespace lbann
+
+#endif // LBANN_TRANSFORMS_VERTICAL_FLIP_HPP_INCLUDED
diff --git a/include/lbann/utils/CMakeLists.txt b/include/lbann/utils/CMakeLists.txt
index a07932b662f..6dbd75433f1 100644
--- a/include/lbann/utils/CMakeLists.txt
+++ b/include/lbann/utils/CMakeLists.txt
@@ -14,11 +14,13 @@ set_full_path(THIS_DIR_HEADERS
   file_utils.hpp
   glob.hpp
   im2col.hpp
+  image.hpp
   jag_utils.hpp
   lbann_library.hpp
   mild_exception.hpp
   number_theory.hpp
   omp_diagnostics.hpp
+  opencv.hpp
   options.hpp
   profiling.hpp
   prototext.hpp
diff --git a/include/lbann/utils/image.hpp b/include/lbann/utils/image.hpp
new file mode 100644
index 00000000000..73fda36d395
--- /dev/null
+++ b/include/lbann/utils/image.hpp
@@ -0,0 +1,75 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_UTILS_IMAGE_HPP
+#define LBANN_UTILS_IMAGE_HPP
+
+#include "lbann/base.hpp"
+
+namespace lbann {
+
+/**
+ * Load an image from filename.
+ * @param filename The path to the image to load.
+ * @param dst Image will be loaded into this matrix, in OpenCV format.
+ * @param dims Will contain the dimensions of the image as {channels, height,
+ * width}.
+ */
+void load_image(const std::string& filename, El::Matrix& dst,
+                std::vector& dims);
+
+/**
+ * Decode an image from a buffer.
+ * @param src A buffer containing image data to be decoded.
+ * @param dst Image will be loaded into this matrix, in OpenCV format.
+ * @param dims Will contain the dimensions of the image as {channels, height,
+ * width}.
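+ *
+ * A file-based round trip using load_image and save_image from this header
+ * is sketched below (the paths are hypothetical):
+ * @code
+ * El::Matrix<uint8_t> img;
+ * std::vector<size_t> dims;
+ * lbann::load_image("input.jpg", img, dims);  // dims = {channels, height, width}
+ * lbann::save_image("copy.png", img, dims);
+ * @endcode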
+ */ +void decode_image(El::Matrix& src, El::Matrix& dst, + std::vector& dims); + +/** + * Save an image to filename. + * @param filename The path to the image to write. + * @param src The image to save. This is in OpenCV format. + * @param dims The dimensions of the image. + */ +void save_image(const std::string& filename, El::Matrix& src, + const std::vector& dims); +/** + * Save an image to filename. + * @param filename The path to the image to write. + * @param src The image to save. This is in standard LBANN format, and will be + * converted to a uint8_t matrix, interpolating between the min and max values + * in it. + * @param dims The dimensions of the image. + */ +void save_image(const std::string& filename, const CPUMat& src, + const std::vector& dims); + +} // namespace lbann + +#endif // LBANN_UTILS_IMAGE_HPP diff --git a/include/lbann/utils/opencv.hpp b/include/lbann/utils/opencv.hpp new file mode 100644 index 00000000000..f14208ca7b4 --- /dev/null +++ b/include/lbann/utils/opencv.hpp @@ -0,0 +1,118 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_UTILS_OPENCV_HPP_INCLUDED +#define LBANN_UTILS_OPENCV_HPP_INCLUDED + +#include "lbann/utils/exception.hpp" +#include "lbann/utils/type_erased_matrix.hpp" +#include + +namespace lbann { +namespace utils { + +/** + * Check whether data is an image. + * Currently requires data to be a uint8_t CPUMat, with 3 dimensions, the first + * (channel) being 1 or 3. + * + * @param data The data to check. + * @param dims The dimensions associated with data. + */ +inline bool check_is_image(const utils::type_erased_matrix& data, + const std::vector& dims) { + try { + // Check if we can do the conversion. + const auto& unused = data.template get(); + (void) unused; + } catch (utils::bad_any_cast) { + return false; + } + if (dims.size() != 3 || (dims[0] != 1 && dims[0] != 3)) { + return false; + } + return true; +} + +/** + * Throw an error if data is not an image. + * Currently requires data to be a uint8_t CPUMat, with 3 dimensions, the first + * (channel) being 1 or 3. + * Also throws an error if OpenCV is not supported. + * + * @param data The data to check. + * @param dims The dimensions associated with data. + */ +inline void assert_is_image(const utils::type_erased_matrix& data, + const std::vector& dims) { + try { + // Check if we can do the conversion. 
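+    // (As in check_is_image above: type_erased_matrix::get<uint8_t>() throws
+    // bad_any_cast when the stored matrix holds a different element type, so
+    // the probe below rejects non-uint8_t data.)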
+ const auto& unused = data.template get(); + (void) unused; + } catch (utils::bad_any_cast) { + LBANN_ERROR("Data is not an image: not uint8_t."); + } + if (dims.size() != 3 || (dims[0] != 1 && dims[0] != 3)) { + LBANN_ERROR("Data is not an image: bad dims."); + } +} + +/** + * Construct an OpenCV Mat that refers to data. + * No data is copied, this just sets up a cv::Mat header. + * @param data The matrix with data to use. + * @param dims Dimensions of the data. + */ +inline cv::Mat get_opencv_mat(utils::type_erased_matrix& data, const std::vector& dims) { + assert_is_image(data, dims); + auto& mat = data.template get(); + return cv::Mat(dims[1], dims[2], dims[0] == 1 ? CV_8UC1 : CV_8UC3, + mat.Buffer()); +} + +/** + * Construct an OpenCV Mat that refers to data. + * No data is copied, this just sets up a cv::Mat header. + * @param data The matrix with data to use. + * @param dims Dimensions of the data. + */ +inline cv::Mat get_opencv_mat(El::Matrix& data, const std::vector& dims) { + if (dims.size() != 3 || (dims[0] != 1 && dims[0] != 3)) { + LBANN_ERROR("Data is not an image: bad dims."); + } + return cv::Mat(dims[1], dims[2], dims[0] == 1 ? CV_8UC1 : CV_8UC3, + data.Buffer()); +} + +/** Get the linearized size of dims. */ +inline size_t get_linearized_size(const std::vector& dims) { + return std::accumulate(dims.begin(), dims.end(), 1, std::multiplies()); +} + +} // namespace utils +} // namespace lbann + +#endif // LBANN_UTILS_OPENCV_HPP_INCLUDED diff --git a/include/lbann/utils/random.hpp b/include/lbann/utils/random.hpp index dd48d1ee787..640fc40ae22 100644 --- a/include/lbann/utils/random.hpp +++ b/include/lbann/utils/random.hpp @@ -30,6 +30,7 @@ #include "lbann/base.hpp" #include "lbann/comm.hpp" #include "lbann/io/persist.hpp" +#include "lbann/utils/exception.hpp" #include namespace lbann { @@ -84,6 +85,11 @@ fast_rng_gen& get_fast_io_generator(); */ template inline T fast_rand_int(Generator& g, T max) { +#ifdef LBANN_DEBUG + if (max == 0) { + LBANN_ERROR("fast_rand_int called with max=0"); + } +#endif typename Generator::result_type x; do { x = g(); diff --git a/model_zoo/data_readers/data_reader_cifar10.prototext b/model_zoo/data_readers/data_reader_cifar10.prototext index 17825df6524..0984212cd2a 100644 --- a/model_zoo/data_readers/data_reader_cifar10.prototext +++ b/model_zoo/data_readers/data_reader_cifar10.prototext @@ -8,25 +8,9 @@ data_reader { validation_percent: 0.1 absolute_sample_count: 0 percent_of_data_to_use: 1.0 - image_preprocessor { - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - disable: true - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 } } } @@ -38,25 +22,9 @@ data_reader { label_filename: "/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin" absolute_sample_count: 0 percent_of_data_to_use: 1.0 - image_preprocessor { - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - disable: true - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 } } } diff --git a/model_zoo/data_readers/data_reader_imagenet.prototext b/model_zoo/data_readers/data_reader_imagenet.prototext index 
b96333dd35b..f021a7bc70d 100644 --- a/model_zoo/data_readers/data_reader_imagenet.prototext +++ b/model_zoo/data_readers/data_reader_imagenet.prototext @@ -11,40 +11,24 @@ data_reader { percent_of_data_to_use: 1.0 num_labels: 1000 - image_preprocessor { - subtractor { - disable: true - image_to_sub: "mean-256x256x3-6.bin" + transforms { + random_resized_crop { + height: 224 + width: 224 } - - cropper { - disable: false - crop_width: 224 - crop_height: 224 - crop_randomly: true - resized_width: 256 - resized_height: 256 - } - - colorizer { - disable: false - } - - augmenter { - disable: false - horizontal_flip: true - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 + } + transforms { + horizontal_flip { + p: 0.5 } - - normalizer { - scale: false - subtract_mean: false - unit_variance: false - z_score: true + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" } } } @@ -60,40 +44,21 @@ data_reader { percent_of_data_to_use: 1.0 num_labels: 1000 - image_preprocessor { - subtractor { - disable: true - image_to_sub: "mean-256x256x3-6.bin" - } - - cropper { - disable: false - crop_width: 224 + transforms { + resized_center_crop { + height: 256 + width: 256 crop_height: 224 - crop_randomly: false - resized_width: 256 - resized_height: 256 - } - - colorizer { - disable: false - } - - augmenter { - disable: true - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 + crop_width: 224 } - - normalizer { - scale: false - subtract_mean: false - unit_variance: false - z_score: true + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" } } } diff --git a/model_zoo/data_readers/data_reader_mnist.prototext b/model_zoo/data_readers/data_reader_mnist.prototext index 7ac51189a10..f9911977b92 100644 --- a/model_zoo/data_readers/data_reader_mnist.prototext +++ b/model_zoo/data_readers/data_reader_mnist.prototext @@ -9,24 +9,9 @@ data_reader { validation_percent: 0.1 absolute_sample_count: 0 percent_of_data_to_use: 1.0 - image_preprocessor { - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 } } } @@ -39,24 +24,9 @@ data_reader { label_filename: "t10k-labels-idx1-ubyte" absolute_sample_count: 0 percent_of_data_to_use: 1.0 - image_preprocessor { - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 } } } diff --git a/model_zoo/data_readers/data_reader_mnist_numpy_npz_int16.prototext b/model_zoo/data_readers/data_reader_mnist_numpy_npz_int16.prototext index 55b4f5c8068..7a235b99c01 100644 --- a/model_zoo/data_readers/data_reader_mnist_numpy_npz_int16.prototext +++ b/model_zoo/data_readers/data_reader_mnist_numpy_npz_int16.prototext @@ -11,24 +11,9 @@ data_reader { num_labels: 10 scaling_factor_int16: 0.000030518509476 # 1 / 0x7FFF - image_preprocessor { - normalizer { - scale: true 
- subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 } } } @@ -43,24 +28,9 @@ data_reader { num_labels: 10 scaling_factor_int16: 0.000030518509476 # 1 / 0x7FFF - image_preprocessor { - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 } } } diff --git a/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lassen.prototext b/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lassen.prototext index b0376077b5e..00d8238c1fe 100644 --- a/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lassen.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lassen.prototext @@ -27,37 +27,6 @@ data_reader { disable_labels: true num_labels: 5 - - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 64 - raw_height: 64 - raw_num_channels: 4 - - normalizer { - disable: true - scale: false - subtract_mean: false - unit_variance: false - z_score: true - } - - subtractor { - disable: true - } - - cropper { - disable: true - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - } } reader { @@ -77,36 +46,5 @@ data_reader { disable_labels: true num_labels: 5 - - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 64 - raw_height: 64 - raw_num_channels: 4 - - normalizer { - disable: true - scale: false - subtract_mean: false - unit_variance: false - z_score: true - } - - subtractor { - disable: true - } - - cropper { - disable: true - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - } } } diff --git a/model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext b/model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext index 1643b6db51a..d76f3155959 100644 --- a/model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext @@ -20,6 +20,10 @@ data_set_metadata { image_prefix: "/outputs/images/" + image_width: 64 + image_height: 64 + image_num_channels: 4 + jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] scalar_prefix: "/outputs/scalars/" diff --git a/model_zoo/models/siamese/siamese_alexnet/data_reader_imagenet_patches.prototext b/model_zoo/models/siamese/siamese_alexnet/data_reader_imagenet_patches.prototext deleted file mode 100644 index 782c1029026..00000000000 --- a/model_zoo/models/siamese/siamese_alexnet/data_reader_imagenet_patches.prototext +++ /dev/null @@ -1,130 +0,0 @@ -data_reader { - reader { - name: "imagenet_patches" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/" - data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels/train.txt" - label_filename: "" - validation_percent: 0.1 - absolute_sample_count: 0 - percent_of_data_to_use: 0.001 - num_labels: 8 - - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 440 - raw_height: 440 - - # crop_size must be at least 
3*patch_size+2*patch_gap+2*patch_jitter - # In addition, it might be better to leave some margin such that patches are - # taken from the central area where there are actual interesting objects/patterns. - - cropper { - disable: false - crop_width: 440 - crop_height: 440 - crop_randomly: false - resized_width: 440 - resized_height: 440 - } - - decolorizer { - disable: false - pick_1ch: true - } - -# colorizer { -# disable: false -# } - - augmenter { - disable: true - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - - normalizer { - scale: true - subtract_mean: true - unit_variance: true - z_score: false - } - - patch_extractor { - patch_width: 96 - patch_height: 96 - patch_gap: 48 - patch_jitter: 7 - centering_mode: 1 - ca_correction_mode: 0 - } - } - } - - reader { - name: "imagenet_patches" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/" - data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels/val.txt" - label_filename: "" - absolute_sample_count: 0 - percent_of_data_to_use: 0.1 - num_labels: 8 - - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 440 - raw_height: 440 - - cropper { - disable: false - crop_width: 440 - crop_height: 440 - crop_randomly: false - resized_width: 440 - resized_height: 440 - } - - decolorizer { - disable: false - pick_1ch: true - } - -# colorizer { -# disable: false -# } - - augmenter { - disable: true - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - - normalizer { - scale: true - subtract_mean: true - unit_variance: true - z_score: false - } - - patch_extractor { - patch_width: 96 - patch_height: 96 - patch_gap: 48 - patch_jitter: 7 - centering_mode: 1 - ca_correction_mode: 0 - } - } - } -} diff --git a/model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext b/model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext new file mode 100644 index 00000000000..54006968dae --- /dev/null +++ b/model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext @@ -0,0 +1,115 @@ +model { + name: "ae_model" + shareable_training_data_reader:false + serialize_io: true + data_layout: "data_parallel" + mini_batch_size: 128 + block_size: 256 + num_epochs: 4 + num_parallel_readers: 0 + procs_per_trainer: 0 + + ################################################### + # Objective function + ################################################### + + objective_function { + layer_term { layer: "img_loss" } + l2_weight_regularization { + scale_factor: 1e-4 + } + } + + ################################################### + # Metrics + ################################################### + + metric { + layer_metric { + name: "reconstr_loss" + layer: "img_loss" + } + } + ################################################### + # Callbacks + ################################################### + callback { + print { + interval: 1 + } + } + callback { timer {} } + + ################################################### + # start of layers + ################################################### + + # Data + layer { + input { + io_buffer: "partitioned" + target_mode: "N/A" + } + name: "data" + data_layout: "data_parallel" + parents: " " + } + layer { + name: "slice_data" + data_layout: "data_parallel" + parents: "data" + children: "image_data_dummy param_data_id" + slice { + get_slice_points_from_reader: "independent" + } 
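+ # slice boundaries are supplied by the data reader's "independent" schema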
+ } + #Y (images + scalar) + layer { + identity { + } + name: "image_data_dummy" + data_layout: "data_parallel" + parents: "slice_data" + } + # X (params not used) + layer { + identity { + } + name: "param_data_id" + data_layout: "data_parallel" + parents: "slice_data" + } + ## Hidden layer + layer { + fully_connected { + num_neurons: 1024 + has_bias: true + } + name: "encodefc1" + data_layout: "data_parallel" + parents: "image_data_dummy" + } + + #Y'(reconstructed images and scalar) + layer { + parents: "encodefc1" + name: "decode0" + data_layout: "data_parallel" + fully_connected { + get_slice_points_from_reader: "independent" + get_num_neurons_of_slice_from_reader: [ 1 ] + has_bias: true + } + } + # Loss/Metric layer + layer { + parents: "decode0 image_data_dummy" + name: "img_loss" + data_layout: "data_parallel" + mean_squared_error {} + } + + ################################################### + # end of layers + ################################################### +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c11b281a01b..133a095b0ca 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,6 +16,7 @@ add_subdirectory(models) add_subdirectory(objective_functions) add_subdirectory(optimizers) add_subdirectory(proto) +add_subdirectory(transforms) add_subdirectory(utils) add_subdirectory(weights) diff --git a/src/data_readers/CMakeLists.txt b/src/data_readers/CMakeLists.txt index ebc80896808..64d7b339477 100644 --- a/src/data_readers/CMakeLists.txt +++ b/src/data_readers/CMakeLists.txt @@ -1,24 +1,11 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES - cv_augmenter.cpp - cv_colorizer.cpp - cv_cropper.cpp - cv_decolorizer.cpp - cv_mean_extractor.cpp - cv_normalizer.cpp - cv_process.cpp - cv_process_patches.cpp - cv_resizer.cpp - cv_subtractor.cpp - cv_transform.cpp - cv_utils.cpp data_reader.cpp data_reader_ascii.cpp data_reader_cifar10.cpp data_reader_csv.cpp data_reader_image.cpp data_reader_imagenet.cpp - data_reader_imagenet_patches.cpp data_reader_jag.cpp data_reader_jag_conduit.cpp data_reader_merge_features.cpp @@ -32,18 +19,12 @@ set_full_path(THIS_DIR_SOURCES data_reader_pilot2_molecular.cpp data_reader_synthetic.cpp data_reader_multi_images.cpp - data_reader_mnist_siamese.cpp data_reader_multihead_siamese.cpp data_reader_python.cpp offline_patches_npz.cpp - image_preprocessor.cpp - image_utils.cpp numpy_conduit_converter.cpp data_reader_numpy_npz_conduit.cpp ) -# Add the subdirectories -add_subdirectory(patchworks) - # Propagate the files up the tree set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) diff --git a/src/data_readers/cv_augmenter.cpp b/src/data_readers/cv_augmenter.cpp deleted file mode 100644 index 2418d592399..00000000000 --- a/src/data_readers/cv_augmenter.cpp +++ /dev/null @@ -1,253 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_augmenter .cpp .hpp - Augmenting functions for images in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/cv_augmenter.hpp" -#include "lbann/utils/mild_exception.hpp" -#include "lbann/utils/random.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -cv_augmenter::cv_augmenter() - : cv_transform(), - m_do_horizontal_flip(false), - m_do_vertical_flip(false), - m_rotation_range(0.0f), - m_horizontal_shift_range(0.0f), - m_vertical_shift_range(0.0f), - m_shear_range(0.0f), - m_flip(_no_flip_), - m_trans(cv::Mat_::eye(3,3)) { - //check_enabled(); // enable if default parameter changes -} - - -cv_augmenter::cv_augmenter(const cv_augmenter& rhs) - : cv_transform(rhs), - m_do_horizontal_flip(rhs.m_do_horizontal_flip), - m_do_vertical_flip(rhs.m_do_vertical_flip), - m_rotation_range(rhs.m_rotation_range), - m_horizontal_shift_range(rhs.m_horizontal_shift_range), - m_vertical_shift_range(rhs.m_vertical_shift_range), - m_shear_range(rhs.m_shear_range), - m_flip(rhs.m_flip), - m_trans(rhs.m_trans) { -} - -cv_augmenter *cv_augmenter::clone() const { - return new cv_augmenter(*this); -} - -cv_augmenter& cv_augmenter::operator=(const cv_augmenter& rhs) { - if (this == &rhs) { - return (*this); - } - - cv_transform::operator=(rhs); - m_do_horizontal_flip = rhs.m_do_horizontal_flip; - m_do_vertical_flip = rhs.m_do_vertical_flip; - m_rotation_range = rhs.m_rotation_range; - m_horizontal_shift_range = rhs.m_horizontal_shift_range; - m_vertical_shift_range = rhs.m_vertical_shift_range; - m_shear_range = rhs.m_shear_range; - m_flip = rhs.m_flip; - m_trans = rhs.m_trans; - - return (*this); -} - - -bool cv_augmenter::check_to_enable() const { - return ( m_do_horizontal_flip || - m_do_vertical_flip || - (m_horizontal_shift_range != 0.0f) || - (m_vertical_shift_range != 0.0f) || - (m_shear_range != 0.0f) || - (m_rotation_range != 0.0f)); -} - - -void cv_augmenter::set(const bool hflip, const bool vflip, const float rot, - const float hshift, const float vshift, const float shear) { - reset(); - m_do_horizontal_flip = hflip; - m_do_vertical_flip = vflip; - m_rotation_range = rot; - m_horizontal_shift_range = hshift; - m_vertical_shift_range = vshift; - m_shear_range = shear; -} - - -void cv_augmenter::reset() { - m_enabled = false; // will turns on when the transform is determined - m_flip = _no_flip_; - m_trans = cv::Mat_::eye(3,3); -} - - -bool cv_augmenter::determine_transform(const cv::Mat& image) { - reset(); - - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - if (!check_to_enable()) { - return false; - } - - rng_gen& gen = get_io_generator(); - - std::uniform_int_distribution bool_dist(0, 1); - - // Flips -#ifdef _COMPAT_WITH_EL_AUGMENT_ - const bool horiz_flip = bool_dist(gen) && m_do_horizontal_flip; - const bool vert_flip = bool_dist(gen) && m_do_vertical_flip; -#else - const bool horiz_flip = m_do_horizontal_flip && bool_dist(gen); - const bool vert_flip = m_do_vertical_flip && bool_dist(gen); -#endif - - if (horiz_flip && vert_flip) { - m_flip = _both_axes_; - } else if (horiz_flip) { - 
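- // mirror left-right only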
m_flip = _horizontal_; - } else if (vert_flip) { - m_flip = _vertical_; - } else { - m_flip = _no_flip_; - } - - // Shift (Translate) - float x_shift = 0.0f; - float y_shift = 0.0f; - if (m_horizontal_shift_range != 0.0f) { - std::uniform_real_distribution dist(-m_horizontal_shift_range, - m_horizontal_shift_range); - x_shift = dist(gen) * image.cols; - } - if (m_vertical_shift_range != 0.0f) { - std::uniform_real_distribution dist(-m_vertical_shift_range, - m_vertical_shift_range); - y_shift = dist(gen) * image.rows; - } - cv::Mat_ shift_mat = cv::Mat_::eye(3,3); - shift_mat(0, 2) = x_shift; - shift_mat(1, 2) = y_shift; - //std::cout << "x_shift " << x_shift << ", y_shift " << y_shift << std::endl; - - // Shearing - float shear = 0.0f; - if (m_shear_range != 0.0f) { - std::uniform_real_distribution dist(-m_shear_range, - m_shear_range); - shear = dist(gen); - } - cv::Mat_ shear_mat = cv::Mat_::zeros(3,3); - shear_mat(0, 0) = 1.0f; - shear_mat(2, 2) = 1.0f; - shear_mat(0, 1) = -std::sin(shear); - shear_mat(1, 1) = std::cos(shear); - //std::cout << "shear " << shear << std::endl; - - // Rotation - float rotate = 0.0f; - if (m_rotation_range != 0.0f) { - std::uniform_real_distribution dist(-m_rotation_range, - m_rotation_range); - rotate = pi / 180.0f * dist(gen); - } - cv::Mat_ rot_mat = cv::Mat_::zeros(3,3); - rot_mat(2, 2) = 1.0f; - rot_mat(0, 0) = std::cos(rotate); - rot_mat(0, 1) = -std::sin(rotate); - rot_mat(1, 0) = std::sin(rotate); - rot_mat(1, 1) = std::cos(rotate); - //std::cout << "rotate " << rotate << std::endl; - - // Compute the final transformation. -#if 0 - cv::Mat_ tmp_mat = cv::Mat_::zeros(3, 3); - cv::gemm(shift_mat, shear_mat, 1.0f, tmp_mat, 0.0f, tmp_mat, 0); - cv::gemm(tmp_mat, rot_mat, 1.0f, m_trans, 0.0f, m_trans, 0); -#else - //m_trans = (shift_mat * shear_mat) * rot_mat; - m_trans = shear_mat * rot_mat; - m_trans(0,2) = x_shift; - m_trans(1,2) = y_shift; -#endif - - return (m_enabled = true); -} - - -bool cv_augmenter::apply(cv::Mat& image) { - m_enabled = false; // turn off as it is applied - - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - cv::Mat image_copy; - - if (m_flip == _no_flip_) { - image_copy = image.clone(); - } else { - cv::flip(image, image_copy, static_cast(m_flip)); - } - - cv::Mat_ _trans(m_trans, cv::Rect_(0,0,3,2)); - - cv::warpAffine(image_copy, image, _trans, image.size(), - cv::INTER_LINEAR, cv::BORDER_REPLICATE); - - return true; -} - -std::string cv_augmenter::get_description() const { - std::stringstream os; - os << get_type() + ":" << std::endl - << " - horizontal flip: " << (m_do_horizontal_flip? "true" : "false") << std::endl - << " - vertical flip: " << (m_do_vertical_flip? 
"true" : "false") << std::endl - << " - rotation range: " << m_rotation_range << std::endl - << " - horizontal shift range: " << m_horizontal_shift_range << std::endl - << " - vertical shift range: " << m_vertical_shift_range << std::endl - << " - shear range: " << m_shear_range << std::endl; - return os.str(); -} - -std::ostream& cv_augmenter::print(std::ostream& os) const { - os << get_description() - << " - flipping: " << cv_transform::flip_desc(m_flip) << std::endl << std::fixed - << " - transfrom: " << m_trans(0,0) << '\t' << m_trans(0,1) << '\t' << m_trans(0,2) << std::endl - << " " << m_trans(1,0) << '\t' << m_trans(1,1) << '\t' << m_trans(1,2) << std::endl - << " " << m_trans(2,0) << '\t' << m_trans(2,1) << '\t' << m_trans(2,2) << std::endl; //<< std::defaultfloat; - - return os; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/cv_colorizer.cpp b/src/data_readers/cv_colorizer.cpp deleted file mode 100644 index 4606623cf70..00000000000 --- a/src/data_readers/cv_colorizer.cpp +++ /dev/null @@ -1,94 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// cv_colorizer .cpp .hpp - transform a non-color (grayscale) image into a -// 3-channel color image -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/cv_colorizer.hpp" -#include "lbann/utils/mild_exception.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -cv_colorizer::cv_colorizer(const cv_colorizer& rhs) - : cv_transform(rhs), m_gray(rhs.m_gray) {} - -cv_colorizer& cv_colorizer::operator=(const cv_colorizer& rhs) { - cv_transform::operator=(rhs); - m_gray = rhs.m_gray; - return *this; -} - -cv_colorizer *cv_colorizer::clone() const { - return (new cv_colorizer(*this)); -} - -bool cv_colorizer::determine_transform(const cv::Mat& image) { - //reset(); // redundant here - // enable colorizing transform if the given image is in grayscale - m_enabled = m_gray = (!image.empty() && (image.channels() == 1)); - //_LBANN_SILENT_EXCEPTION(image.empty(), "", false); // redundant - return m_enabled; -} - -bool cv_colorizer::determine_inverse_transform() { - // Enable inverse transform only if grayscale to color transform has been applied - m_enabled = m_gray; - // indicate that the current image is a color image - m_gray = false; - return m_enabled; -} - -bool cv_colorizer::apply(cv::Mat& image) { - m_enabled = false; // turn off as the transform is applied once - - if (!m_gray) { // apply the inverse transform from color to gray - cv::Mat image_dst; - cv::cvtColor(image, image_dst, cv::COLOR_BGR2GRAY); - image = image_dst; - } else { // apply the transform from gray to color - cv::Mat image_dst; - cv::cvtColor(image, image_dst, cv::COLOR_GRAY2BGR); - image = image_dst; - } - - return true; -} - -std::string cv_colorizer::get_description() const { - std::stringstream os; - os << get_type() + ":" << std::endl; - return os.str(); -} - -std::ostream& cv_colorizer::print(std::ostream& os) const { - os << get_description() - << " - " << (m_gray? "grayscale" : "color") << std::endl; - return os; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/cv_cropper.cpp b/src/data_readers/cv_cropper.cpp deleted file mode 100644 index 6e16b09466a..00000000000 --- a/src/data_readers/cv_cropper.cpp +++ /dev/null @@ -1,196 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// cv_cropper .cpp .hpp - functions to crop images -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/cv_cropper.hpp" -#include "lbann/utils/mild_exception.hpp" -#include "lbann/utils/random.hpp" -#include "lbann/utils/exception.hpp" -#include -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -const int cv_cropper::m_interpolation_choices[3] = {cv::INTER_LINEAR, cv::INTER_AREA, cv::INTER_LINEAR}; - -cv_cropper::cv_cropper() - : cv_transform(), m_width(0u), m_height(0u), - m_rand_crop(false), m_is_roi_set(false), - m_roi_size(std::pair(0,0)), - m_zoom(1.0), m_interpolation(m_interpolation_choices[0]), - m_adaptive_interpolation(false) {} - - -cv_cropper *cv_cropper::clone() const { - return new cv_cropper(*this); -} - -/// Make sure to clear the roi flag as well when clearing roi size -void cv_cropper::unset_roi() { - m_is_roi_set = false; - m_roi_size = std::pair(0, 0); -} - -void cv_cropper::set(const unsigned int width, const unsigned int height, - const bool random_crop, - const std::pair& roi_sz, - const bool adaptive_interpolation) { - reset(); - m_width = width; - m_height = height; - m_rand_crop = random_crop; - m_adaptive_interpolation = adaptive_interpolation; - - if ((roi_sz.first > 0) && (roi_sz.second > 0)) { - if (((unsigned) roi_sz.first < width) || ((unsigned) roi_sz.second < height)) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: cv_cropper: ROI size is smaller than that of a patch"; - throw lbann_exception(err.str()); - } else { - m_is_roi_set = true; - m_roi_size = roi_sz; - } - } else if (!((roi_sz.first == 0) && (roi_sz.second == 0))) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: cv_cropper: invalid ROI size"; - throw lbann_exception(err.str()); - } else { - unset_roi(); - } -} - -void cv_cropper::reset() { - m_enabled = false; - m_zoom = 1.0; - m_interpolation = m_interpolation_choices[0]; -} - -bool cv_cropper::determine_transform(const cv::Mat& image) { - m_enabled = false; //sufficient for now in place of reset(); - - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - double zoom_h = 1.0; - double zoom_v = 1.0; - if (m_is_roi_set) { - zoom_h = image.cols / static_cast(m_roi_size.first); - zoom_v = image.rows / static_cast(m_roi_size.second); - } - - m_zoom = std::min(zoom_h, zoom_v); - - if (m_zoom > 1.0) { // rescales the image by the factor of 1/m_zoom (shrink) - m_interpolation = m_interpolation_choices[static_cast(m_adaptive_interpolation)]; - } else { - m_interpolation = m_interpolation_choices[static_cast(m_adaptive_interpolation) << 1]; - } - - return (m_enabled = true); -} - -/** - * Method 1: - * a. Rescale the raw image, I, such that one dimension matches the corresponding - * dimension of the specified rectangular area, R, while trying to maintain the - * size as closely as possible to that of the raw image without altering the - * aspect ratio. - * b. Crop off the excess area of the resized image, which goes beyond the - * specified R aligned at the center of the image. - * c. Crop out an area of the specified size, C, at the center of R or at a random - * position within R. - * - * Method 2: - * Instead of rescaling-crop-crop as in method 1, - * a. Compute the projection of the final crop area, C', on the raw image I without - * actually rescaling the image. This still requires to compute the scaling factor - * for image resizing. 
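- * (In this class, m_zoom = min(image.cols / roi_width, image.rows / roi_height):
- * e.g. a 512x512 image with a 256x256 ROI gives m_zoom = 2, so a requested
- * 224x224 crop is projected to 448x448 on the raw image, cropped, and then
- * resized down to 224x224.)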
- * However, instead of applying it to the raw image, apply the inverse to project - * the crop C onto the raw image I. This does not change any actual pixel. - * b. Crop the projected area C' - * c. Rescale C' to C. This deals with a smaller number of pixels than method 1 for - * resizing, only those that remain. - * - * We rely on Method 2 here. - */ -bool cv_cropper::apply(cv::Mat& image) { - m_enabled = false; // turn off as it is applied - - //_LBANN_SILENT_EXCEPTION(image.empty(), "", false); // redundant - - const double zoomed_roi_width = m_roi_size.first * m_zoom; - const double zoomed_roi_height = m_roi_size.second * m_zoom; - const double zoomed_width = m_width * m_zoom; - const double zoomed_height = m_height * m_zoom; - - int crop_x_start = 0; - int crop_y_start = 0; - - // Get random crop of image - if(m_rand_crop) { - const int rnd_dw = fast_rand_int(get_fast_io_generator(), static_cast(2*(zoomed_roi_width - zoomed_width)) + 1); - const int rnd_dh = fast_rand_int(get_fast_io_generator(), static_cast(2*(zoomed_roi_height - zoomed_height)) + 1); - crop_x_start = static_cast(image.cols - zoomed_roi_width + rnd_dw + 1) / 2; - crop_y_start = static_cast(image.rows - zoomed_roi_height + rnd_dh + 1) / 2; - } else { - crop_x_start = static_cast(image.cols - zoomed_width + 1) / 2; - crop_y_start = static_cast(image.rows - zoomed_height + 1) / 2; - } - - cv::Mat zoomed_crop = image(cv::Rect(crop_x_start, crop_y_start, zoomed_width, zoomed_height)); - cv::Mat crop; - cv::resize(zoomed_crop, crop, cv::Size(m_width,m_height), 0, 0, m_interpolation); - image = crop; - - return true; -} - -std::string cv_cropper::get_description() const { - std::stringstream os; - os << get_type() + ":" << std::endl - << " - crop size: " << m_width << "x" << m_height << std::endl - << " - resized size: " << m_roi_size.first << "x" << m_roi_size.second << std::endl - << " - random crop: " << m_rand_crop << std::endl - << " - adaptive interpolation: " << m_adaptive_interpolation << std::endl; - return os.str(); -} - -std::ostream& cv_cropper::print(std::ostream& os) const { - os << get_description() - << " - zoom: 1/" << m_zoom << std::endl - << " - interpolation: "; - switch(m_interpolation) { - case cv::INTER_LINEAR: os << "INTER_LINEAR" << std::endl; break; - case cv::INTER_CUBIC: os << "INTER_CUBIC" << std::endl; break; - case cv::INTER_AREA: os << "INTER_AREA" << std::endl; break; - default: os << "unrecognized" << std::endl; break; - } - return os; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/cv_decolorizer.cpp b/src/data_readers/cv_decolorizer.cpp deleted file mode 100644 index 9d7f7ae3a14..00000000000 --- a/src/data_readers/cv_decolorizer.cpp +++ /dev/null @@ -1,97 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_decolorizer .cpp .hpp - transform a color image into a single-channel -// monochrome image -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/cv_decolorizer.hpp" -#include "lbann/utils/mild_exception.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -cv_decolorizer::cv_decolorizer(const cv_decolorizer& rhs) - : cv_transform(rhs), m_color(rhs.m_color), m_pick_1ch(rhs.m_pick_1ch) {} - -cv_decolorizer& cv_decolorizer::operator=(const cv_decolorizer& rhs) { - cv_transform::operator=(rhs); - m_color = rhs.m_color; - m_pick_1ch = rhs.m_pick_1ch; - return *this; -} - -cv_decolorizer *cv_decolorizer::clone() const { - return (new cv_decolorizer(*this)); -} - -void cv_decolorizer::set(const bool pick_1ch) { - m_pick_1ch = pick_1ch; - reset(); -} - -bool cv_decolorizer::determine_transform(const cv::Mat& image) { - //reset(); // redundant here - // enable decolorizing transform if the given image is a color image - m_enabled = m_color = (!image.empty() && (image.channels() > 1)); - //_LBANN_SILENT_EXCEPTION(image.empty(), "", false); // redundant - return m_enabled; -} - -bool cv_decolorizer::apply(cv::Mat& image) { - m_enabled = false; // turn off as the transform is applied once - - if (m_color) { - if (m_pick_1ch) { - // Drop all the channels but one. - const int Nch = image.channels(); - std::vector channels(Nch); - cv::split(image, channels); - image = channels[1 % Nch]; - } else { - // Compute a new channel by the linear combination of all channels - cv::Mat image_dst; - cv::cvtColor(image, image_dst, cv::COLOR_BGR2GRAY); - image = image_dst; - } - } - - return true; -} - -std::string cv_decolorizer::get_description() const { - std::stringstream os; - os << get_type() + ":" << std::endl; - return os.str(); -} - -std::ostream& cv_decolorizer::print(std::ostream& os) const { - os << get_description() - << " - " << (m_color? "color" : "grayscale") << std::endl; - return os; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/cv_mean_extractor.cpp b/src/data_readers/cv_mean_extractor.cpp deleted file mode 100644 index 4026e71bcb3..00000000000 --- a/src/data_readers/cv_mean_extractor.cpp +++ /dev/null @@ -1,168 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_mean_extractor .cpp .hpp - accumulate mean over the image set -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/cv_mean_extractor.hpp" -#include "lbann/utils/mild_exception.hpp" -#include "lbann/utils/exception.hpp" - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -cv_mean_extractor::cv_mean_extractor() -: cv_transform(), m_batch_size(m_default_batch_size), m_batch_cnt(0u), m_partial_cnt(0u), m_type_code(0) -{} - -cv_mean_extractor::cv_mean_extractor(const cv_mean_extractor& rhs) - : cv_transform(rhs), m_batch_size(rhs.m_batch_size), - m_batch_cnt(rhs.m_batch_cnt), m_partial_cnt(rhs.m_partial_cnt), - m_type_code(rhs.m_type_code), m_sum(rhs.m_sum.clone()), m_avg(rhs.m_avg.clone()) -{} - -cv_mean_extractor& cv_mean_extractor::operator=(const cv_mean_extractor& rhs) { - cv_transform::operator=(rhs); - m_batch_size = rhs.m_batch_size; - m_batch_cnt = rhs.m_batch_cnt; - m_partial_cnt = rhs.m_partial_cnt; - m_type_code = rhs.m_type_code; - m_sum = rhs.m_sum.clone(); - m_avg = rhs.m_avg.clone(); - return *this; -} - -cv_mean_extractor *cv_mean_extractor::clone() const { - return (new cv_mean_extractor(*this)); -} - -/** Set up the internal matrices used to accumulate image statistics, - * and initialize the batch size. - */ -void cv_mean_extractor::set(const unsigned int width, const unsigned int height, - const unsigned int n_ch, const unsigned int batch_sz) { - if (!m_sum.empty() || (width == 0u) || (height == 0u) || (n_ch == 0u) || (batch_sz == 0u)) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: cv_mean_extractor: either using an invalid " - << "parameter or attempting to reconfigure"; - throw lbann_exception(err.str()); - } - - m_batch_size = batch_sz; - - create_matrices(width, height, n_ch); - reset(); -} - -/** - * This can be used to set the batch size only, and defer the creation of - * matrices for accumulating statistics until the first image is seen. - */ -void cv_mean_extractor::set(const unsigned int batch_sz) { - if (!m_sum.empty() || (batch_sz == 0u)) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: cv_mean_extractor: " << - "cannot reset the batch size once started and it must be greater than 0"; - throw lbann_exception(err.str()); - } - m_batch_size = batch_sz; -} - -void cv_mean_extractor::create_matrices(const unsigned int width, const unsigned int height, const unsigned int n_ch) { - // OpenCV image type code - m_type_code = cv_image_type::T(n_ch); - m_sum = cv::Mat(height, width, m_type_code); - m_avg = cv::Mat(height, width, m_type_code); -} - -void cv_mean_extractor::reset() { - // convert to a single change image before resetting the values as the - // dimension of Scalar is limited to 4 (4 channels) - cv::Mat m_sum_1ch = m_sum.reshape(1); - m_sum_1ch.setTo(static_cast(0)); - cv::Mat m_avg_1ch = m_avg.reshape(1); - m_avg_1ch.setTo(static_cast(0)); - - m_batch_cnt = 0u; - m_partial_cnt = 0u; - m_enabled = false; -} - -/** - * If the size or the number of channels of the given image is different - * from what is expected, fails. 
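- * (If the accumulator matrices have not been created yet, they are sized here
- * from the first image seen, so only the batch size needs to be set up front.)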
- */ -bool cv_mean_extractor::determine_transform(const cv::Mat& image) { - m_enabled = false; - _LBANN_SILENT_EXCEPTION(image.empty(), "", false); - // If it has not been configured (other than batch size), do it here - if (m_sum.empty()) { - create_matrices(image.cols, image.rows, image.channels()); - reset(); - - m_enabled = true; - } else { - m_enabled = check_if_cv_Mat_has_same_shape(image, m_avg); - } - return m_enabled; -} - -bool cv_mean_extractor::determine_inverse_transform() { - // inversing is irrelevant - return (m_enabled = false); -} - -bool cv_mean_extractor::apply(cv::Mat& image) { - m_enabled = false; // turn off as the transform is applied once - const double f = get_depth_normalizing_factor(image.depth()); - cv::addWeighted(m_sum, 1.0, image, f, 0.0, m_sum, m_type_code); - if (++m_partial_cnt == m_batch_size) { - m_partial_cnt = 0u; - ++m_batch_cnt; - cv::addWeighted(m_avg, static_cast(m_batch_cnt-1)/m_batch_cnt, - m_sum, 1/static_cast(m_batch_cnt*m_batch_size), - 0.0, m_avg, m_type_code); - cv::Mat m_sum_1ch = m_sum.reshape(1); - m_sum_1ch.setTo(static_cast(0)); - } - return true; -} - -std::string cv_mean_extractor::get_description() const { - std::stringstream os; - os << get_type() + ":" << std::endl - << " - batch size " << m_batch_size << std::endl; - return os.str(); -} - -std::ostream& cv_mean_extractor::print(std::ostream& os) const { - os << get_description() - << " - partial cnt " << m_partial_cnt << std::endl - << " - batch cnt " << m_batch_cnt << std::endl; - return os; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/cv_normalizer.cpp b/src/data_readers/cv_normalizer.cpp deleted file mode 100644 index 88099544bfc..00000000000 --- a/src/data_readers/cv_normalizer.cpp +++ /dev/null @@ -1,342 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// lbann_cv_normalizer .cpp .hpp - Normalizing functions for images -// in opencv format -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/cv_normalizer.hpp" -#include "lbann/utils/mild_exception.hpp" -#include //fabs - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -cv_normalizer::cv_normalizer() - : cv_transform(), m_mean_subtraction(false), m_unit_variance(false), - m_unit_scale(true), m_z_score(false) -{} - - -cv_normalizer::cv_normalizer(const cv_normalizer& rhs) - : cv_transform(rhs), m_mean_subtraction(rhs.m_mean_subtraction), m_unit_variance(rhs.m_unit_variance), - m_unit_scale(rhs.m_unit_scale), m_z_score(rhs.m_z_score), m_trans(rhs.m_trans) { -} - - -cv_normalizer& cv_normalizer::operator=(const cv_normalizer& rhs) { - if (this == &rhs) { - return (*this); - } - cv_transform::operator=(rhs); - m_mean_subtraction = rhs.m_mean_subtraction; - m_unit_variance = rhs.m_unit_variance; - m_unit_scale = rhs.m_unit_scale; - m_z_score = rhs.m_z_score; - m_trans = rhs.m_trans; - - return (*this); -} - - -cv_normalizer *cv_normalizer::clone() const { - return new cv_normalizer(*this); -} - - -cv_normalizer::normalization_type& cv_normalizer::set_normalization_type( - normalization_type& ntype, const normalization_type flag) const { - return (ntype = set_normalization_bits(ntype, flag)); -} - - -bool cv_normalizer::check_to_enable() const { - return (m_mean_subtraction || m_unit_variance || m_unit_scale || m_z_score); -} - - -void cv_normalizer::set(const bool meansub, const bool unitvar, const bool unitscale, const bool zscore) { - reset(); - m_mean_subtraction = meansub; - m_unit_variance = unitvar; - m_unit_scale = unitscale; - m_z_score = zscore; -} - - -void cv_normalizer::reset() { - m_enabled = false; - m_trans.clear(); -} - - -bool cv_normalizer::determine_transform(const cv::Mat& image) { - reset(); - - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - if (!check_to_enable()) { - return false; - } - - normalization_type ntype = _none; - if (m_unit_scale) { - set_normalization_type(ntype, _u_scale); - } - if (m_mean_subtraction) { - set_normalization_type(ntype, _mean_sub); - } - if (m_unit_variance) { - set_normalization_type(ntype, _unit_var); - } - if (m_z_score) { - set_normalization_type(ntype, _z_score); - } - - ComputeType u_scale = 1.0; - ComputeType largest = 1.0; - - //if (!m_z_score && m_unit_scale) { - if (ntype < _z_score) { // !(m_z_score || (m_mean_subtraction && m_unit_variance)) - switch(image.depth()) { - case CV_8U: - largest = std::numeric_limits::max(); - break; - case CV_8S: - largest = std::numeric_limits::max(); - break; - case CV_16U: - largest = std::numeric_limits::max(); - break; - case CV_16S: - largest = std::numeric_limits::max(); - break; - case CV_32S: - largest = std::numeric_limits::max(); - break; - default: - return false; - // Currently, do nothing for non-integral types. However, a set of scaling - // paramters can be added to the argument list of this function. 
- } - u_scale = static_cast(1.0)/largest; - } - - std::vector mean; - std::vector stddev; - const normalization_type code_wo_uscale = mask_normalization_bits(ntype, _z_score); - const auto NCh = static_cast(image.channels()); - - if (code_wo_uscale != _none) { - if (!compute_mean_stddev(image, mean, stddev) || (NCh != mean.size())) { - return false; - } - #if 0 - for (int ch = 0; ch < image.channels(); ++ch) { - std::cout << "channel " << ch << "\tmean " << mean[ch] << "\tstddev " << stddev[ch] << std::endl; - } - #endif - } - - m_trans.resize(NCh); - - switch (code_wo_uscale) { - case _none: // Note that mean.size() is zero in this case - for (size_t ch=0u; ch < NCh; ++ch) { - m_trans[ch] = channel_trans_t(u_scale, 0.0); - } - break; - case _mean_sub: - for (size_t ch=0u; ch < NCh; ++ch) { - m_trans[ch] = channel_trans_t(u_scale, - - u_scale * mean[ch]); - } - break; - case _unit_var: - for (size_t ch=0u; ch < NCh; ++ch) { - if (stddev[ch] > fabs(mean[ch])*(1e-7)) { - m_trans[ch] = - channel_trans_t(static_cast(1.0)/stddev[ch], - u_scale * mean[ch] - mean[ch]/stddev[ch]); - } else { - m_trans[ch] = channel_trans_t(u_scale, 0.0); - } - } - break; - case _z_score: - for (size_t ch=0u; ch < NCh; ++ch) { - if (stddev[ch] > fabs(mean[ch])*(1e-7)) { - m_trans[ch] = channel_trans_t(static_cast(1.0)/stddev[ch], - - mean[ch]/stddev[ch]); - } else { - m_trans[ch] = channel_trans_t(0.0, 0.0); - } - } - break; - default: - return false; - } - - m_enabled = true; - return true; -} - - -/** - * Manually invoke normalization before copying image from cv::Mat into - * El::Matrix format. Then, the transform must be disabled to - * prevent it from being automatically applied again during copying. - * After the copying is complete, either of the following two is required - * depending on whether the inverse transform is needed afterwards or not. - * If no inverse transform is needed , disabling or resetting is ok. - * As the normalization could have been implicitly applied during copying - * via scaling, the transform must be disabled after copying. - * On the other hand, resetting the structure is ok if no inverse transform - * is needed. Alternatively, the inverse transform can be set. - */ -bool cv_normalizer::apply(cv::Mat& image) { - m_enabled = false; // turn off as it is applied - return scale(image, m_trans); -} - - -/** - * The actual transform can either be manually invoked, or automatically during - * copying from a cv::Mat image to El::Matrix data to avoid reading - * the image twice. - * @param _trans The channel-wise parameters for linear transform - */ -void cv_normalizer::set_transform(const std::vector& _trans) { - m_trans = _trans; - m_enabled = true; -} - - -/** - * In case that undoing normalization is required, this call arranges it to - * occur during copying from El::Matrix data to a cv::Mat image - * while avoiding reading the image twice. 
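- * (Each per-channel affine map (a, b) is inverted to (1/a, -b/a); a channel
- * with a == 0, e.g. a z-score with zero variance, has no inverse, so the
- * transform stays disabled in that case.)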
- */ -bool cv_normalizer::determine_inverse_transform() { - m_enabled = false; // unless this method is successful, stays disabled - const size_t NCh = m_trans.size(); - if (NCh == 0u) { - m_trans.clear(); - return false; - } - - std::vector trans_reverse(NCh, channel_trans_t(1.0, 0.0)); - - for (size_t ch=0u; ch < NCh; ++ch) { - if (m_trans[ch].first == 0.0) { - m_trans.clear(); - return false; - } - trans_reverse[ch] = - channel_trans_t(static_cast(1.0)/m_trans[ch].first, - - m_trans[ch].second/m_trans[ch].first); - } - trans_reverse.swap(m_trans); - - return (m_enabled = true); -} - - - -bool cv_normalizer::scale(cv::Mat& image, const std::vector& trans) { - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - switch(image.depth()) { - case CV_8U: - return scale_with_known_type<_depth_type(CV_8U), DataType>(image, trans); - case CV_8S: - return scale_with_known_type<_depth_type(CV_8S), DataType>(image, trans); - case CV_16U: - return scale_with_known_type<_depth_type(CV_16U), DataType>(image, trans); - case CV_16S: - return scale_with_known_type<_depth_type(CV_16S), DataType>(image, trans); - case CV_32S: - return scale_with_known_type<_depth_type(CV_32S), DataType>(image, trans); - case CV_32F: - return scale_with_known_type<_depth_type(CV_32F), DataType>(image, trans); - case CV_64F: - return scale_with_known_type<_depth_type(CV_64F), DataType>(image, trans); - } - return false; -} - - -bool cv_normalizer::compute_mean_stddev(const cv::Mat& image, - std::vector& mean, std::vector& stddev, - cv::InputArray mask) { - if (image.empty()) { - return false; - } - if (image.channels() > 4) { - _SWITCH_CV_FUNC_4PARAMS(image.depth(), \ - compute_mean_stddev_with_known_type, image, mean, stddev, mask) - } else { - // cv::meanStdDev() currently only works with double type for mean and stddev and images of 1-4 channels - using Ch_T = double; - //using Ch_T = ComputeType; - using Output_T = cv_image_type; - cv::Mat _mean(1, 4, Output_T::T()); - cv::Mat _stddev(1, 4, Output_T::T()); - cv::meanStdDev(image, _mean, _stddev, mask); - mean.resize(image.channels()); - stddev.resize(image.channels()); - for (int c=0; c < image.channels(); ++c) { - mean[c] = static_cast(_mean.at(0,c)); - stddev[c] = static_cast(_stddev.at(0,c)); - } - return true; - } - return false; -} - -std::string cv_normalizer::get_description() const { - std::stringstream os; - os << get_type() + ":" << std::endl - << " - mean subtraction: " << (m_mean_subtraction? "true" : "false") << std::endl - << " - unit variance: " << (m_unit_variance? "true" : "false") << std::endl - << " - unit scale: " << (m_unit_scale? "true" : "false") << std::endl - << " - z-score: " << (m_z_score? "true" : "false") << std::endl; - return os.str(); -} - -std::ostream& cv_normalizer::print(std::ostream& os) const { - os << get_description() - << " - transform:"; - for (const channel_trans_t& tr: m_trans) { - os << " [" << tr.first << ' ' << tr.second << "]\n "; - } - os << std::endl; - - return os; -} - - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/cv_process.cpp b/src/data_readers/cv_process.cpp deleted file mode 100644 index 2864aeeca61..00000000000 --- a/src/data_readers/cv_process.cpp +++ /dev/null @@ -1,312 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) 
listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_process .cpp .hpp - structure that defines the operations -// on image data in opencv format -//////////////////////////////////////////////////////////////////////////////// - - -#include "lbann/data_readers/cv_process.hpp" -#include "lbann/utils/exception.hpp" -#include // std::min - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -/** - * Copy constructor. - * Rather than transferring the ownership of the managed cv_transform objects - * pointed by the pointers, or sharing them by simply copying the pointers, - * copy-constructs the objects and owns the pointers to those newly created - * objects. - */ -cv_process::cv_process(const cv_process& rhs) - : m_flip(rhs.m_flip), m_split(rhs.m_split), - m_is_normalizer_set(rhs.m_is_normalizer_set), - m_normalizer_idx(rhs.m_normalizer_idx) -{ - for (size_t i = 0u; i < rhs.m_transforms.size(); ++i) { - std::unique_ptr p(rhs.m_transforms[i]->clone()); - if (!p) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: cv_process: undefined transform " << i; - throw lbann_exception(err.str()); - } - m_transforms.push_back(std::move(p)); // avoid using emplace - } -} - -/** - * Assignment operator. - * Rather than transferring the ownership of the managed cv_transform objects - * pointed by the pointers, or sharing them by simply copying the pointers, - * copy-constructs the objects and owns the pointers to those newly created - * objects. 
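- * (That is, a deep copy: each managed cv_transform is clone()d, never shared.)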
- */ -cv_process& cv_process::operator=(const cv_process& rhs) { - if (this == &rhs) { - return (*this); - } - - m_flip = rhs.m_flip; - m_split = rhs.m_split; - m_is_normalizer_set = rhs.m_is_normalizer_set; - m_normalizer_idx = rhs.m_normalizer_idx; - - m_transforms.clear(); - - for (size_t i = 0u; i < rhs.m_transforms.size(); ++i) { - std::unique_ptr p(rhs.m_transforms[i]->clone()); - if (!p) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: cv_process: undefined transform " << i; - throw lbann_exception(err.str()); - } - m_transforms.push_back(std::move(p)); - } - - return (*this); -} - - -void cv_process::reset() { - for (auto & m_transform : m_transforms) - m_transform->reset(); -} - -void cv_process::disable_lazy_normalizer() { - if (to_fuse_normalizer_with_copy()) { - m_transforms[m_normalizer_idx]->disable(); - } -} - -void cv_process::disable_transforms() { - for (auto & m_transform : m_transforms) { - m_transform->disable(); - } -} - -bool cv_process::add_transform(std::unique_ptr tr) { - if (!tr) return false; - m_transforms.push_back(std::move(tr)); - return true; -} - -bool cv_process::to_fuse_normalizer_with_copy() const { - return (m_is_normalizer_set && - ((m_normalizer_idx+1) == m_transforms.size()) && - (dynamic_cast(m_transforms[m_normalizer_idx].get()) != nullptr)); -} - -void cv_process::set_normalizer_info() { - m_is_normalizer_set = true; - m_normalizer_idx = m_transforms.size(); -} - -bool cv_process::add_normalizer(std::unique_ptr tr) { - if (!tr || m_is_normalizer_set) return false; - set_normalizer_info(); - m_transforms.push_back(std::move(tr)); - return true; -} - -bool cv_process::add_normalizer(std::unique_ptr tr) { - if (!tr || m_is_normalizer_set) return false; - set_normalizer_info(); - m_transforms.push_back(std::move(tr)); - return true; -} - -/// Allow read-only access to a particular transform indexed by idx -const cv_transform* cv_process::get_transform(const unsigned int idx) const { - if (idx >= m_transforms.size()) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: cv_process: invalid index " << idx << " >= " << m_transforms.size(); - throw lbann_exception(err.str()); - } - return m_transforms[idx].get(); -} - -/// Allow read-write access to a particular transform indexed by idx -cv_transform* cv_process::get_transform(const unsigned int idx) { - if (idx >= m_transforms.size()) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: cv_process: invalid index " << idx << " >= " << m_transforms.size(); - throw lbann_exception(err.str()); - } - return m_transforms[idx].get(); -} - -std::vector cv_process::get_data_dims() const { - for(const std::unique_ptr& tr: m_transforms) { - const auto* const c = dynamic_cast(&(*tr)); - if (c != nullptr) { - return {c->get_crop_width(), c->get_crop_height()}; - } - } - return {0u, 0u}; -} - -/** - * Call this before image saving/exporting in postprocessing if inverse normalization - * is needed to save image. Unless normalization is followed by a transform, inverse - * normalization is done while copying data from El::Matrix to cv::Mat format. - * Otherwise, it will be done during postprocessing as the rest of transforms in order. - */ -void cv_process::determine_inverse_lazy_normalization() { - if (!m_is_normalizer_set || !to_fuse_normalizer_with_copy()) { - return; - } - - m_transforms[m_normalizer_idx]->determine_inverse_transform(); -} - -/** - * Preprocess an image. - * It executes a range of transforms specified as [tr_strart, tr_end). 
If tr_end - * is unspecified, it is considered as the total number of transforms. If it is 0, - * no transform will perform. - * By default, it executes all of them. Selective execution is useful whe - * generating multiple patches (small images) out of an image. - * We first run transforms until generating patches, and stop. Then, generate - * patches, and run the rest of the transforms on each patches generated. - * @return true if successful - */ -bool cv_process::preprocess(cv::Mat& image, unsigned int tr_start, unsigned int tr_end) { - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - bool ok = true; - - if (tr_end == 0u) return true; - if (tr_start == 0u) { - if (to_flip()) - cv::flip(image, image, how_to_flip()); - } else if ((tr_start >= m_transforms.size()) || (tr_start >= tr_end)) { - return true; - } - - // While many transforms can update pixel values in place, some require new - // memory locations to write new values. In addition, at the end of a pre- - // processing pipeline, the values in an OpenCV matrix is copied into an - // Elemental matrix. Normalization typically is the last transform in a - // preprocessing pipeline. It is also simple enough (e.g., applying a linear - // function to existing values) that we can merge it with copying from one memory - // to another. Therefore, unless there is another preprocessing operation to be - // done after normalization, in which case we prefer in-place updating, - // we implicitly apply it during copying between memory locations to avoid - // redundant memory access overheads. For this reason, we treat normalization - // differently from other transforms. However, if a subtractor is used as a - // normalizer, it is treated as an ordinary transform. - - const unsigned int num_trs = static_cast(m_transforms.size()); - const bool lazy_normalization = (tr_end == num_trs) && to_fuse_normalizer_with_copy(); - const unsigned int n_immediate_transforms - = std::min((lazy_normalization? m_normalizer_idx : num_trs), tr_end); - - for (size_t i = tr_start; i < n_immediate_transforms; ++i) { - if (m_transforms[i]->determine_transform(image)) { - ok = m_transforms[i]->apply(image); - } - } - - if (lazy_normalization) { - m_transforms[m_normalizer_idx]->determine_transform(image); - } - - return ok; -} - -/** - * Postprocess an image. - * @return true if successful - */ -bool cv_process::postprocess(cv::Mat& image) { - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - bool ok = true; - - const bool lazy_normalization = to_fuse_normalizer_with_copy(); - const unsigned int n_immediate_transforms - = (lazy_normalization? m_normalizer_idx : m_transforms.size()); - - // If normalizer is the last transform in the preprocessing pipeline, it will - // be the first in the postprocessing. In addition, it has implicitly been - // inversed during copying from El::Mat to cv::Mat before calling postprocess(image) - - for (size_t i = n_immediate_transforms; i > 0; --i) { - if (m_transforms[i-1]->determine_inverse_transform()) { - ok = m_transforms[i-1]->apply(image); - _LBANN_MILD_EXCEPTION(!ok, "inverse transform " << i-1 << " has failed!", false); - } - } - - if (to_flip()) { - cv::flip(image, image, how_to_flip()); - } - - return ok; -} - -std::vector cv_process::get_transform_normalize() const { - return (to_fuse_normalizer_with_copy()? 
- dynamic_cast(m_transforms[m_normalizer_idx].get())->transform() : - std::vector()); -} - -std::vector cv_process::get_transform_normalize(const unsigned int ch) const { - std::vector trans; - if (to_fuse_normalizer_with_copy()) { - trans = dynamic_cast(m_transforms[m_normalizer_idx].get())->transform(); - } - - return ((trans.size() > ch) ? - std::vector(1, trans[ch]) : - std::vector(1, cv_normalizer::channel_trans_t(1.0, 0.0))); -} - -std::string cv_process::get_description() const { - std::stringstream os; - os << get_type() + ":" << std::endl - << " - flip: " << cv_transform::flip_desc(m_flip) << std::endl - << " - split channels: " << m_split << std::endl - << " - is normalizer set: " << m_is_normalizer_set << std::endl; - - if (m_is_normalizer_set) - os << " - normalizer index: " << m_normalizer_idx << std::endl; - - os << " - number of transforms: " << m_transforms.size() << std::endl; - for(size_t i = 0u; i< m_transforms.size(); ++i) { - if(!m_transforms[i]) - os << " transform [" << i << "]: not set" << std::endl; - else - os << " transform [" << i << "]: " << m_transforms[i]->get_name() - << " of " << m_transforms[i]->get_type() << " type" << std::endl; - } - - return os.str(); -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/cv_process_patches.cpp b/src/data_readers/cv_process_patches.cpp deleted file mode 100644 index 08a227741a9..00000000000 --- a/src/data_readers/cv_process_patches.cpp +++ /dev/null @@ -1,109 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
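
Note the traversal order in postprocess() above: inverses run from the back of the pipeline, so the last forward transform is undone first, and a transform only participates if determine_inverse_transform() opts in. The same control flow in isolation (the interface below is illustrative, not LBANN's):

    #include <memory>
    #include <vector>

    struct invertible {
      virtual ~invertible() = default;
      virtual bool determine_inverse_transform() = 0;  // false means skip
      virtual bool apply(std::vector<float>& pixels) = 0;
    };

    // Undo a forward pipeline t0, t1, ..., tn-1 by applying inverses
    // in the order tn-1, ..., t1, t0.
    bool undo_pipeline(std::vector<std::unique_ptr<invertible>>& trs,
                       std::vector<float>& pixels) {
      for (size_t i = trs.size(); i > 0; --i) {
        if (trs[i - 1]->determine_inverse_transform() &&
            !trs[i - 1]->apply(pixels)) {
          return false;  // inverse of transform i-1 failed
        }
      }
      return true;
    }
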
-// -// cv_process_patches .cpp .hpp - structure that defines the operations -// on patches extracted from an image in the opencv format -//////////////////////////////////////////////////////////////////////////////// - - -#include "lbann/data_readers/cv_process_patches.hpp" -#include // std::numeric_limits - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -cv_process_patches::cv_process_patches() - : cv_process(), m_self_label(false), - m_when_to_extract(std::numeric_limits::max()) { -} - -cv_process_patches::cv_process_patches(const bool self_label) - : cv_process(), m_self_label(self_label), - m_when_to_extract(std::numeric_limits::max()) { -} - -cv_process_patches::cv_process_patches(const cv_process_patches& rhs) - : cv_process(rhs), m_pd(rhs.m_pd), m_self_label(rhs.m_self_label), - m_when_to_extract(rhs.m_when_to_extract) { -} - -cv_process_patches::cv_process_patches(const cv_transform::cv_flipping flip_code, const bool tosplit) - : cv_process(flip_code, tosplit), m_self_label(false), - m_when_to_extract(std::numeric_limits::max()) { -} - -cv_process_patches& cv_process_patches::operator=(const cv_process_patches& rhs) { - if (this == &rhs) { - return (*this); - } - cv_process::operator=(rhs); - m_pd = rhs.m_pd; - m_self_label = rhs.m_self_label; - m_when_to_extract = rhs.m_when_to_extract; - - return (*this); -} - -void cv_process_patches::set_patch_descriptor(const patchworks::patch_descriptor& pd, - const unsigned int when_to_extract) { - m_pd = pd; - m_self_label = m_pd.is_self_labeling(); - m_when_to_extract = when_to_extract; -} - -/** - * Preprocess patches extracted from an image. - * @return true if successful - */ -bool cv_process_patches::preprocess(cv::Mat& image, std::vector& patches) { - bool ok = true; - patches.clear(); - - ok = cv_process::preprocess(image, 0u, m_when_to_extract); - ok = ok && m_pd.extract_patches(image, patches); - - for (size_t i=0u; ok && (i < patches.size()); ++i) { - ok = cv_process::preprocess(patches[i], m_when_to_extract); - } - - return ok; -} - -std::string cv_process_patches::get_description() const { - std::stringstream os; - const unsigned int when_to_extract = ((m_when_to_extract > m_transforms.size())? - m_transforms.size() : m_when_to_extract); - const std::string when_exactly = ((when_to_extract == 0u)? - "at the beginning" : ("after " + m_transforms[when_to_extract-1]->get_name())); - os << cv_process::get_description(); - os << " - self-labeling: " << m_self_label << std::endl - << " - extract patches " << when_exactly << std::endl - << m_pd << std::endl; - - return os.str(); -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/cv_resizer.cpp b/src/data_readers/cv_resizer.cpp deleted file mode 100644 index e3c12c0844a..00000000000 --- a/src/data_readers/cv_resizer.cpp +++ /dev/null @@ -1,117 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_resizer .cpp .hpp - Functions to resize images -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/cv_resizer.hpp" -#include "lbann/utils/exception.hpp" -#include -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -const int cv_resizer::m_interpolation_choices[3] = {cv::INTER_LINEAR, cv::INTER_AREA, cv::INTER_LINEAR}; - -cv_resizer::cv_resizer() - : cv_transform(), m_width(0u), m_height(0u), - m_interpolation(m_interpolation_choices[0]), - m_adaptive_interpolation(false) {} - - -cv_resizer *cv_resizer::clone() const { - return new cv_resizer(*this); -} - -void cv_resizer::set(const unsigned int width, const unsigned int height, - const bool adaptive_interpolation) { - reset(); - m_width = width; - m_height = height; - m_adaptive_interpolation = adaptive_interpolation; - - if ((m_width == 0u) || (m_height == 0u)) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: cv_resizer: invalid size of the resized image"; - throw lbann_exception(err.str()); - } -} - -void cv_resizer::reset() { - m_enabled = false; - m_interpolation = m_interpolation_choices[0]; -} - -bool cv_resizer::determine_transform(const cv::Mat& image) { - m_enabled = false; //sufficient for now in place of reset(); - - if (image.empty()) { - throw lbann_exception("cv_resizer::determine_transform : empty image."); - } - - const double zoom = image.cols * image.rows / static_cast(m_width * m_height); - - if (zoom <= 1.0) { // shirinking - m_interpolation = m_interpolation_choices[static_cast(m_adaptive_interpolation)]; - } else { // enlarging - m_interpolation = m_interpolation_choices[static_cast(m_adaptive_interpolation) << 1]; - } - - return (m_enabled = true); -} - -bool cv_resizer::apply(cv::Mat& image) { - m_enabled = false; // turn off as it is applied - - cv::Mat image_new; - cv::resize(image, image_new, cv::Size(m_width, m_height), 0, 0, m_interpolation); - image = image_new; - - return true; -} - -std::string cv_resizer::get_description() const { - std::stringstream os; - os << get_type() + ":" << std::endl - << " - desired size: " << m_width << "x" << m_height << std::endl - << " - adaptive interpolation: " << m_adaptive_interpolation << std::endl; - return os.str(); -} - -std::ostream& cv_resizer::print(std::ostream& os) const { - os << get_description() - << " - interpolation: "; - switch(m_interpolation) { - case cv::INTER_LINEAR: os << "INTER_LINEAR" << std::endl; break; - case cv::INTER_CUBIC: os << "INTER_CUBIC" << std::endl; break; - case cv::INTER_AREA: os << "INTER_AREA" << std::endl; break; - default: os << "unrecognized" << std::endl; break; - } - return os; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/cv_subtractor.cpp b/src/data_readers/cv_subtractor.cpp deleted file mode 100644 index b52e1c1391c..00000000000 --- a/src/data_readers/cv_subtractor.cpp +++ /dev/null @@ -1,393 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. 
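
cv_resizer above selects the interpolation mode from a small table once it knows whether the resize shrinks or enlarges the image. The underlying heuristic, sketched generically (OpenCV's documentation recommends INTER_AREA for decimation; the exact table in cv_resizer may differ):

    #include <opencv2/imgproc.hpp>

    int choose_interpolation(const cv::Mat& src, int dst_w, int dst_h,
                             bool adaptive) {
      if (!adaptive) {
        return cv::INTER_LINEAR;  // one mode for both directions
      }
      const double src_area = static_cast<double>(src.cols) * src.rows;
      const double dst_area = static_cast<double>(dst_w) * dst_h;
      // Decimation averages source pixels; enlargement interpolates.
      return (dst_area < src_area) ? cv::INTER_AREA : cv::INTER_LINEAR;
    }
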
-// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_subtractor .cpp .hpp - subtract channel values of an image (possibly the -// pixel-wise mean of dataset) from the corresponding values of another (input) -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/cv_subtractor.hpp" -#include "lbann/utils/exception.hpp" -#include "lbann/utils/mild_exception.hpp" -#include "lbann/utils/file_utils.hpp" -#include -#include -#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -cv_subtractor::cv_subtractor(const cv_subtractor& rhs) - : cv_transform(rhs), - m_img_to_sub(rhs.m_img_to_sub), - m_img_to_div(rhs.m_img_to_div), - m_channel_mean(rhs.m_channel_mean), - m_channel_stddev(rhs.m_channel_stddev), - m_applied(rhs.m_applied) -{} - -cv_subtractor& cv_subtractor::operator=(const cv_subtractor& rhs) { - cv_transform::operator=(rhs); - m_img_to_sub = rhs.m_img_to_sub; - m_img_to_div = rhs.m_img_to_div; - m_channel_mean = rhs.m_channel_mean; - m_channel_stddev = rhs.m_channel_stddev; - m_applied = rhs.m_applied; - return *this; -} - -cv_subtractor *cv_subtractor::clone() const { - return (new cv_subtractor(*this)); -} - -/** - * Load an image in the file of the proprietary format. - * The file name describes the image configuration as: - * *-(width)x(height)x(num_channels)-(opencv_depth_code).bin - * There is no header in the file. The file is a binary dump of an OpenCV cv::Mat data. - * For the better portability, an existing format can be used to carry image data. 
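
Since the file name encodes the full matrix geometry, loading reduces to parsing the name and checking that the file holds exactly width x height x channels samples of the stated depth. A hedged sketch of that bookkeeping (sscanf-based, and assuming the leading name part contains no extra '-'):

    #include <opencv2/core.hpp>
    #include <cstdio>
    #include <string>

    bool parse_raw_image_name(const std::string& base, int& w, int& h,
                              int& c, int& depth, size_t& expected_bytes) {
      // "%*[^-]" consumes the free-form name up to the first '-'.
      if (std::sscanf(base.c_str(), "%*[^-]-%dx%dx%d-%d.bin",
                      &w, &h, &c, &depth) != 4) {
        return false;
      }
      // Size of a raw dump: samples times bytes per sample at this depth.
      expected_bytes = static_cast<size_t>(w) * h * c * CV_ELEM_SIZE1(depth);
      return true;
    }
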
- */ -cv::Mat cv_subtractor::read_binary_image_file(const std::string filename) { - std::vector tokens; - { // Extract the information on the image from the file name - const std::vector delims = {'-', 'x','x','-','.'}; - std::string dir; - std::string basename; - - parse_path(filename, dir, basename); - tokens = get_tokens(basename, delims); - if (tokens.size() != delims.size()) { - return cv::Mat(); - } - } - - std::ifstream file(filename, std::ios::binary); - if (!file.good()) { - return cv::Mat(); - } - file.unsetf(std::ios::skipws); - - { // Check file size - const size_t image_byte_size - = tokens[1] * tokens[2] * tokens[3] * CV_ELEM_SIZE(tokens[4]); - - file.seekg(0, std::ios::end); - const size_t file_size = static_cast(file.tellg()); - if (image_byte_size != file_size) { - return cv::Mat(); - } - } - - // Construct an image data structure - cv::Mat image(tokens[1], tokens[2], CV_MAKETYPE(tokens[4], tokens[3])); - - // Reset the file pointer - file.seekg(0, std::ios::beg); - - // Load the image from the file - std::copy(std::istream_iterator(file), - std::istream_iterator(), - reinterpret_cast(image.data)); - - return image; -} - -void cv_subtractor::set_mean(const std::string name_of_img_to_sub, const int depth_code) { - cv::Mat img_to_sub; - std::string ext = get_ext_name(name_of_img_to_sub); - std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); - if (ext == "bin") { - img_to_sub = read_binary_image_file(name_of_img_to_sub); - } else { // let OpenCV handle - img_to_sub = cv::imread(name_of_img_to_sub); - } - if (img_to_sub.empty()) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: cv_subtractor: cannot load the image " - << name_of_img_to_sub << " to subtract."; - throw lbann_exception(err.str()); - } - set_mean(img_to_sub, depth_code); - m_channel_mean.clear(); - - if (m_channel_stddev.empty() && !m_img_to_div.empty() && - !check_if_cv_Mat_has_same_shape(m_img_to_div, m_img_to_sub)) { - throw lbann_exception("cv_subtractor::set_mean() : mean and variance images have different shapes"); - } -} - -void cv_subtractor::set_mean(const std::vector ch_mean) { - if (ch_mean.size() > cv::Scalar::channels) { - throw lbann_exception(std::string("cv_subtractor::set_mean() : ") + - "provide the mean image if the number of channels are larger than " + - std::to_string(cv::Scalar::channels) + '.'); - } - m_channel_mean = ch_mean; -} - -bool cv_subtractor::create_img_to_sub(int width, int height, int n_channels) { - if ((n_channels == 0) || (static_cast(n_channels) != m_channel_mean.size()) || - (width == 0) || (height == 0)) { - return false; - } - const std::vector& ch_mean = m_channel_mean; - cv::Scalar px = cv::Scalar::all(0.0); - for (size_t i = 0u; i < ch_mean.size(); ++i) { - px[static_cast(i)] = ch_mean[i]; - } - cv::Mat img_to_sub(height, width, cv_image_type::T(n_channels), px); - set_mean(img_to_sub); - return true; -} - -void cv_subtractor::set_mean(const cv::Mat& image, const int depth_code) { - reset(); - - const double f = get_depth_normalizing_factor(image.depth()); - - // Make sure that the image is set as a floating point type image - // Note that this is the only way to set m_img_to_sub. This means that - // m_img_to_sub will be of a floating point type unless it is empty. - - if ((depth_code != CV_32F) && (depth_code != CV_64F)) { - // If the depth_code does not indicate a floating point type, see if the - // image is already of a floating point type. If so, use the same type. - // Otherwise, use the type of LBANN's DataType. 
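
// A concrete instance of the depth-normalizing conversion described in
// the comment above (illustrative, not the LBANN helper itself): an
// 8-bit image maps onto [0, 1] with a factor of 1/255,
//
//   cv::Mat as_float;
//   const double f = 1.0 / 255.0;             // CV_8U -> unit range
//   image_8u.convertTo(as_float, CV_32F, f);  // values now in [0, 1]
//
// and get_depth_normalizing_factor() generalizes that factor to the
// other OpenCV depth codes.
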
- if (check_if_cv_Mat_is_float_type(image)) { - image.convertTo(m_img_to_sub, image.depth(), f, 0.0); - } else { - image.convertTo(m_img_to_sub, cv_image_type::T(), f, 0.0); - } - } else { - image.convertTo(m_img_to_sub, depth_code, f, 0.0); - } -} - -void cv_subtractor::set_stddev(const std::string name_of_img_to_div, const int depth_code) { - cv::Mat img_to_div; - std::string ext = get_ext_name(name_of_img_to_div); - std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); - if (ext == "bin") { - img_to_div = read_binary_image_file(name_of_img_to_div); - } else { // let OpenCV handle - img_to_div = cv::imread(name_of_img_to_div); - } - if (img_to_div.empty()) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: cv_subtractor: cannot load the image " - << name_of_img_to_div << " to normalize."; - throw lbann_exception(err.str()); - } - set_stddev(img_to_div, depth_code); - m_channel_stddev.clear(); - - if (m_channel_mean.empty() && !m_img_to_sub.empty() && - !check_if_cv_Mat_has_same_shape(m_img_to_sub, m_img_to_div)) { - throw lbann_exception("cv_subtractor::set_stddev() : mean and variance images have different shapes."); - } -} - -void cv_subtractor::set_stddev(const std::vector ch_stddev) { - if (ch_stddev.size() > cv::Scalar::channels) { - throw lbann_exception(std::string("cv_subtractor::set_stddev() : ") + - "provide the stddev image if the number of channels are larger than " + - std::to_string(cv::Scalar::channels) + '.'); - } - m_channel_stddev = ch_stddev; -} - -bool cv_subtractor::create_img_to_div(int width, int height, int n_channels) { - if ((n_channels == 0) || (static_cast(n_channels) != m_channel_stddev.size()) || - (width == 0) || (height == 0)) { - return false; - } - const std::vector& ch_stddev = m_channel_stddev; - cv::Scalar px = cv::Scalar::all(0.0); - for (size_t i = 0u; i < ch_stddev.size(); ++i) { - px[static_cast(i)] = ch_stddev[i]; - } - cv::Mat img_to_div(height, width, cv_image_type::T(n_channels), px); - set_stddev(img_to_div); - return true; -} - -void cv_subtractor::set_stddev(const cv::Mat& image, const int depth_code) { - reset(); - - const double f = get_depth_normalizing_factor(image.depth()); - - if ((depth_code != CV_32F) && (depth_code != CV_64F)) { - if (check_if_cv_Mat_is_float_type(image)) { - image.convertTo(m_img_to_div, image.depth(), f, 0.0); - } else { - image.convertTo(m_img_to_div, cv_image_type::T(), f, 0.0); - } - } else { - image.convertTo(m_img_to_div, depth_code, f, 0.0); - } -} - -bool cv_subtractor::determine_transform(const cv::Mat& image) { - reset(); - if (m_channel_mean.empty()) { - if (!m_img_to_sub.empty()) { // pixel-wise - if (!check_if_cv_Mat_has_same_shape(image, m_img_to_sub)) { - throw lbann_exception(std::string("cv_subtactor::determine_transform(): ") + - "input and mean images have different sizes."); - } - m_enabled = true; - } - } else { // channel-wise - if (!check_if_cv_Mat_has_same_shape(image, m_img_to_sub) && - !create_img_to_sub(image.cols, image.rows, image.channels())) { - throw lbann_exception(std::string("cv_subtactor::determine_transform(): ") + - "failed to create mean image."); - } - m_enabled = true; - } - if (m_channel_stddev.empty()) { - if (!m_img_to_div.empty()) { // pixel-wise - if (!check_if_cv_Mat_has_same_shape(image, m_img_to_div)) { - throw lbann_exception(std::string("cv_subtactor::determine_transform(): ") + - "input and stddev images have different sizes."); - } - m_enabled = true; - } - } else { // channel-wise - if (!check_if_cv_Mat_has_same_shape(image, 
m_img_to_div) && - !create_img_to_div(image.cols, image.rows, image.channels())) { - throw lbann_exception(std::string("cv_subtactor::determine_transform(): ") + - "failed to create stddev image."); - } - m_enabled = true; - } - return m_enabled; -} - -bool cv_subtractor::determine_inverse_transform() { - return (m_enabled = m_applied); -} - -/** - * Currently only supports mean-subtraction and z-score. - * TODO: Unit variance is not supported. It can be implemented by adding - * 'm_img_to_sub' to the result of z-score. Both z-score and unit variance - * requires both mean and stddev. Thus, we would need an additional flag to - * distinguish which method is being set up. - */ -bool cv_subtractor::apply(cv::Mat& image) { - m_enabled = false; // turn off as the transform is applied once - if (m_applied) { // inverse if applied already - double f = get_depth_denormalizing_factor(CV_8U); - - cv::Mat image_new; - - if (!m_img_to_div.empty()) { - double ff = 1.0; - if (m_img_to_sub.empty()) { - ff = f; - f = 1.0; - } - cv::multiply(image, m_img_to_div, image_new, ff, m_img_to_div.depth()); - image = image_new; - } - - if (!m_img_to_sub.empty()) { - cv::addWeighted(m_img_to_sub, f, image, f, 0.0, image_new, CV_8U); - image = image_new; - } - - m_applied = false; - } else { - double f = get_depth_normalizing_factor(image.depth()); - - cv::Mat image_new; - if (!m_img_to_sub.empty()) { - cv::addWeighted(m_img_to_sub, -1.0, image, f, 0.0, image_new, m_img_to_sub.depth()); - f = 1.0; // to avoid redundant depth normalization - image = image_new; - } - - if (!m_img_to_div.empty()) { - cv::divide(image, m_img_to_div, image_new, f, m_img_to_div.depth()); - image = image_new; - } - - m_applied = true; - } - - return true; -} - -bool cv_subtractor::check_if_channel_wise() const { - return !(m_channel_mean.empty() || m_channel_stddev.empty()); -} - -std::string cv_subtractor::get_description() const { - std::stringstream os; - os << get_type() + ":" << std::endl; - return os.str(); -} - -std::ostream& cv_subtractor::print(std::ostream& os) const { - os << get_description() - << " - image shape to subtract: " - << m_img_to_sub.cols << 'x' << m_img_to_sub.rows - << 'x' << m_img_to_sub.channels() - << '-' << m_img_to_sub.depth() << std::endl - << " - image shape to divide: " - << m_img_to_div.cols << 'x' << m_img_to_div.rows - << 'x' << m_img_to_div.channels() - << '-' << m_img_to_div.depth() << std::endl; - - os << " - mean per channel to subtract:"; - for (const auto v: m_channel_mean) { - os << ' ' << v; - } - os << std::endl; - - os << " - stddev per channel to divide:"; - for (const auto v: m_channel_stddev) { - os << ' ' << v; - } - os << std::endl; - -#if 0 - double f = get_depth_denormalizing_factor(CV_8U); - if (!m_img_to_sub.empty()) { - cv::Mat img_sub; - m_img_to_sub.convertTo(img_sub, CV_8U, f, 0.0); - cv::imwrite("img_sub.png", img_sub); - } - if (!m_img_to_div.empty()) { - cv::Mat img_div; - m_img_to_div.convertTo(img_div, CV_8U, f, 0.0); - cv::imwrite("img_div.png", img_div); - } -#endif - return os; -} - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/cv_utils.cpp b/src/data_readers/cv_utils.cpp deleted file mode 100644 index 9a730775344..00000000000 --- a/src/data_readers/cv_utils.cpp +++ /dev/null @@ -1,112 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. 
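
cv_subtractor::apply() above computes y = (x - mean) / stddev on the way in and, once m_applied is set, the exact inverse x = y * stddev + mean on the way out. With the depth (de)normalizing factors stripped away, the two directions reduce to plain OpenCV arithmetic (a sketch, not the member function itself):

    #include <opencv2/core.hpp>

    void normalize(const cv::Mat& x, const cv::Mat& mean,
                   const cv::Mat& stddev, cv::Mat& y) {
      cv::Mat centered;
      cv::subtract(x, mean, centered, cv::noArray(), CV_32F);  // x - mean
      cv::divide(centered, stddev, y);                         // / stddev
    }

    void denormalize(const cv::Mat& y, const cv::Mat& mean,
                     const cv::Mat& stddev, cv::Mat& x) {
      cv::Mat scaled;
      cv::multiply(y, stddev, scaled);                  // y * stddev
      cv::add(scaled, mean, x, cv::noArray(), CV_32F);  // + mean
    }
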
-// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// cv_utils .cpp .hpp - operations related to opencv images -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/cv_utils.hpp" -#include "lbann/utils/exception.hpp" -#include "lbann/utils/timer.hpp" -#include "lbann/utils/file_utils.hpp" -//#include - -#ifdef LBANN_HAS_OPENCV -namespace lbann { - -bool cv_utils::copy_cvMat_to_buf(const cv::Mat& image, std::vector& buf, const cv_process& pp) { - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - _SWITCH_CV_FUNC_3PARAMS(image.depth(), \ - copy_cvMat_to_buf_with_known_type, \ - image, buf, pp) - return false; -} - - -cv::Mat cv_utils::copy_buf_to_cvMat(const std::vector& buf, - const int Width, const int Height, const int Type, const cv_process& pp) { - _LBANN_MILD_EXCEPTION(buf.size() != \ - static_cast(Width * Height * CV_MAT_CN(Type) * CV_ELEM_SIZE(CV_MAT_DEPTH(Type))), \ - "Size mismatch: Buffer has " << buf.size() << " items when " \ - << static_cast(Width * Height * CV_MAT_CN(Type) * CV_ELEM_SIZE(CV_MAT_DEPTH(Type))) \ - << " are expected.", \ - cv::Mat()) - - _SWITCH_CV_FUNC_4PARAMS(CV_MAT_DEPTH(Type), \ - copy_buf_to_cvMat_with_known_type, \ - buf, Width, Height, pp) - - _LBANN_DEBUG_MSG("Unknown image depth: " << CV_MAT_DEPTH(Type)); - return cv::Mat(); -} - - -bool cv_utils::copy_cvMat_to_buf(const cv::Mat& image, CPUMat& buf, const cv_process& pp) { - _LBANN_SILENT_EXCEPTION(image.empty(), "", false) - - _SWITCH_CV_FUNC_3PARAMS(image.depth(), \ - copy_cvMat_to_buf_with_known_type, \ - image, buf, pp) - return false; -} - - -cv::Mat cv_utils::copy_buf_to_cvMat(const CPUMat& buf, - const int Width, const int Height, const int Type, const cv_process& pp) { - _SWITCH_CV_FUNC_4PARAMS(CV_MAT_DEPTH(Type), \ - copy_buf_to_cvMat_with_known_type, \ - buf, Width, Height, pp) - - _LBANN_DEBUG_MSG("Unknown image depth: " << CV_MAT_DEPTH(Type)); - return cv::Mat(); -} - - -std::ostream& operator<<(std::ostream& os, const cv_transform& tr) { - tr.print(os); - return os; -} - - -cv::Mat cv_utils::lbann_imread(const std::string& img_file_path, int flags, std::vector& buf, cv::Mat* cv_buf) { - // Load an image bytestream into memory - bool ok = lbann::load_file(img_file_path, buf); - if (!ok) { - throw lbann_exception("lbann_imread() : failed to load " + img_file_path); - } - - // create a zero-copying view on a block of bytes - using InputBuf_T = lbann::cv_image_type; - const cv::Mat inbuf(1, buf.size(), InputBuf_T::T(1), buf.data()); - - // decode the image data in the memory buffer - // Note that if cv_buf is not NULL, then the return value is *cv_buf - cv::Mat image = cv::imdecode(inbuf, 
cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, cv_buf); - return image; -} - - -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/data_reader_cifar10.cpp b/src/data_readers/data_reader_cifar10.cpp index 5b69c79d7c3..a9f8127b5d6 100644 --- a/src/data_readers/data_reader_cifar10.cpp +++ b/src/data_readers/data_reader_cifar10.cpp @@ -97,9 +97,11 @@ bool cifar10_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) { } auto pixel_col = X(El::IR(0, X.Height()), El::IR(mb_idx, mb_idx + 1)); - augment(pixel_col, m_image_height, m_image_width, m_image_num_channels); - normalize(pixel_col, m_image_num_channels); - pixel_noise(pixel_col); //add noise to image, disable by default + std::vector dims = { + static_cast(m_image_num_channels), + static_cast(m_image_height), + static_cast(m_image_width)}; + m_transform_pipeline.apply(pixel_col, dims); return true; } diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index ea2a904625b..337ffe896b0 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -27,6 +27,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/data_readers/data_reader_image.hpp" +#include "lbann/utils/image.hpp" #include "lbann/utils/timer.hpp" #include "lbann/data_store/data_store_conduit.hpp" #include "lbann/utils/file_utils.hpp" @@ -230,13 +231,10 @@ void image_data_reader::preload_data_store() { void image_data_reader::setup(int num_io_threads, std::shared_ptr io_thread_pool) { generic_data_reader::setup(num_io_threads, io_thread_pool); - - using InputBuf_T = lbann::cv_image_type; - auto cvMat = cv::Mat(1, get_linearized_data_size(), InputBuf_T::T(1)); - m_thread_cv_buffer.resize(num_io_threads); - for(int tid = 0; tid < num_io_threads; ++tid) { - m_thread_cv_buffer[tid] = cvMat.clone(); - } + m_transform_pipeline.set_expected_out_dims( + {static_cast(m_image_num_channels), + static_cast(m_image_height), + static_cast(m_image_width)}); } std::vector image_data_reader::get_image_list_of_current_mb() const { diff --git a/src/data_readers/data_reader_imagenet.cpp b/src/data_readers/data_reader_imagenet.cpp index ee3a437d82d..426da295d1d 100644 --- a/src/data_readers/data_reader_imagenet.cpp +++ b/src/data_readers/data_reader_imagenet.cpp @@ -27,66 +27,23 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/data_readers/data_reader_imagenet.hpp" -#include "lbann/data_readers/image_utils.hpp" +#include "lbann/utils/image.hpp" #include "lbann/utils/file_utils.hpp" -#include namespace lbann { -imagenet_reader::imagenet_reader(const std::shared_ptr& pp, bool shuffle) +imagenet_reader::imagenet_reader(bool shuffle) : image_data_reader(shuffle) { set_defaults(); - - if (!pp) { - LBANN_ERROR("construction error: no image processor"); - } - - m_master_pps = lbann::make_unique(*pp); } -imagenet_reader::imagenet_reader(const imagenet_reader& rhs) - : image_data_reader(rhs) { - if (!rhs.m_master_pps) { - LBANN_ERROR("construction error: no image processor"); - } - m_master_pps = lbann::make_unique(*rhs.m_master_pps); -} - - imagenet_reader::imagenet_reader(const imagenet_reader& rhs, const std::vector& ds_sample_move_list, std::string role) - : image_data_reader(rhs, ds_sample_move_list) { - if (!rhs.m_master_pps) { - LBANN_ERROR("construction error: no image processor"); - } - m_master_pps = lbann::make_unique(*rhs.m_master_pps); - set_role(role); -} + : image_data_reader(rhs, 
ds_sample_move_list, role) {} imagenet_reader::imagenet_reader(const imagenet_reader& rhs, const std::vector& ds_sample_move_list) - : image_data_reader(rhs, ds_sample_move_list) { - if (!rhs.m_master_pps) { - LBANN_ERROR("construction error: no image processor"); - } - m_master_pps = lbann::make_unique(*rhs.m_master_pps); -} - -imagenet_reader& imagenet_reader::operator=(const imagenet_reader& rhs) { - // check for self-assignment - if (this == &rhs) { - return (*this); - } - - image_data_reader::operator=(rhs); + : image_data_reader(rhs, ds_sample_move_list) {} - if (!rhs.m_master_pps) { - LBANN_ERROR("construction error: no image processor"); - } - m_master_pps = lbann::make_unique(*rhs.m_master_pps); - return (*this); -} - -imagenet_reader::~imagenet_reader() { -} +imagenet_reader::~imagenet_reader() {} void imagenet_reader::set_defaults() { m_image_width = 256; @@ -96,52 +53,17 @@ void imagenet_reader::set_defaults() { m_num_labels = 1000; } -void imagenet_reader::setup(int num_io_threads, std::shared_ptr io_thread_pool) { - image_data_reader::setup(num_io_threads, io_thread_pool); - replicate_processor(*m_master_pps, num_io_threads); -} - -/// Replicate image processor for each I/O thread -bool imagenet_reader::replicate_processor(const cv_process& pp, const int nthreads) { - m_pps.resize(nthreads); - - // Construct thread private preprocessing objects out of a shared pointer - for (int i = 0; i < nthreads; ++i) { - m_pps[i] = lbann::make_unique(pp); - } - - bool ok = true; - for (int i = 0; ok && (i < nthreads); ++i) { - if (!m_pps[i]) ok = false; - } - - if (!ok || (nthreads <= 0)) { - LBANN_ERROR("cannot replicate image processor"); - } - - const std::vector dims = pp.get_data_dims(); - if ((dims.size() == 2u) && (dims[0] != 0u) && (dims[1] != 0u)) { - m_image_width = static_cast(dims[0]); - m_image_height = static_cast(dims[1]); - set_linearized_image_size(); - } - - return true; -} - CPUMat imagenet_reader::create_datum_view(CPUMat& X, const int mb_idx) const { return El::View(X, El::IR(0, X.Height()), El::IR(mb_idx, mb_idx + 1)); } bool imagenet_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) { - int width=0, height=0, img_type=0; - int tid = m_io_thread_pool->get_local_thread_id(); - CPUMat X_v = create_datum_view(X, mb_idx); - bool ret; - const std::string imagepath = get_file_dir() + m_image_list[data_id].first; + El::Matrix image; + std::vector dims; + const std::string image_path = get_file_dir() + m_image_list[data_id].first; - bool have_node = true; if (m_data_store != nullptr) { + bool have_node = true; conduit::Node node; if (m_data_store->is_local_cache()) { if (m_data_store->has_conduit_node(data_id)) { @@ -164,36 +86,26 @@ bool imagenet_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) { if (m_issue_warning) { if (is_master()) { LBANN_WARNING("m_data_store != nullptr, but we are not retrivieving a node from the store; role: " + get_role() + "; this is probably OK for test mode, but may be an error for train or validate modes"); - } + } m_issue_warning = false; } - ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid], &m_thread_cv_buffer[tid]); + load_image(image_path, image, dims); have_node = false; } if (have_node) { char *buf = node[LBANN_DATA_ID_STR(data_id) + "/buffer"].value(); size_t size = node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"].value(); - std::vector v2(size); - for (size_t j=0; j encoded_image(size, 1, reinterpret_cast(buf), size); + decode_image(encoded_image, image, 
dims); } - } - - // not using data store - else { - ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid], &m_thread_cv_buffer[tid]); + } else { + // Data store is not being used. + load_image(image_path, image, dims); } - if(!ret) { - LBANN_ERROR(get_type() + ": image_utils::load_image failed to load - " + imagepath); - } - if((width * height * CV_MAT_CN(img_type)) != m_image_linearized_size) { - LBANN_ERROR( get_type() + ": mismatch data size -- either width, height or channel - " + imagepath + "[w,h,c]=[" + std::to_string(width) + "x" + std::to_string(height) + "x" + std::to_string(CV_MAT_CN(img_type)) + "]"); - } + auto X_v = create_datum_view(X, mb_idx); + m_transform_pipeline.apply(image, X_v, dims); return true; } diff --git a/src/data_readers/data_reader_imagenet_patches.cpp b/src/data_readers/data_reader_imagenet_patches.cpp deleted file mode 100644 index f577474ba04..00000000000 --- a/src/data_readers/data_reader_imagenet_patches.cpp +++ /dev/null @@ -1,175 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
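
The data-store branch of imagenet_reader::fetch_datum() above never copies the encoded bytes: the El::Matrix constructor merely wraps the buffer owned by the conduit node, and decode_image() reads from that view. Condensed (assuming node and data_id as in the function above):

    char *buf = node[LBANN_DATA_ID_STR(data_id) + "/buffer"].value();
    size_t size = node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"].value();

    // Zero-copy view: the matrix borrows buf, it does not own it.
    El::Matrix<uint8_t> encoded(size, 1,
                                reinterpret_cast<uint8_t*>(buf), size);

    El::Matrix<uint8_t> image;
    std::vector<size_t> dims;  // filled by the decoder
    decode_image(encoded, image, dims);

The earlier revision of this function copied the buffer into a std::vector first; the view avoids that per-sample allocation.
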
-// -// data_reader_imagenet_patches .hpp .cpp - extract patches from ImageNet dataset -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/data_reader_imagenet_patches.hpp" -#include "lbann/data_readers/image_utils.hpp" - -#include - -namespace lbann { - -imagenet_reader_patches::imagenet_reader_patches(const std::shared_ptr& pp, bool shuffle) - : image_data_reader(shuffle) { - set_defaults(); - - if (!pp) { - std::stringstream err; - err << __FILE__<<" "<<__LINE__<< " :: " << get_type() << " construction error: no image processor"; - throw lbann_exception(err.str()); - } - - m_master_pps = lbann::make_unique(*pp); -} - -imagenet_reader_patches::imagenet_reader_patches(const imagenet_reader_patches& rhs) - : image_data_reader(rhs) -{ - if (!rhs.m_master_pps) { - std::stringstream err; - err << __FILE__<<" "<<__LINE__<< " :: " << get_type() << " construction error: no image processor"; - throw lbann_exception(err.str()); - } - m_num_patches = rhs.m_num_patches; - m_master_pps = lbann::make_unique(*rhs.m_master_pps); -} - -imagenet_reader_patches& imagenet_reader_patches::operator=(const imagenet_reader_patches& rhs) { - // check for self-assignment - if (this == &rhs) { - return (*this); - } - - image_data_reader::operator=(rhs); - - if (!rhs.m_master_pps) { - std::stringstream err; - err << __FILE__<<" "<<__LINE__<< " :: " << get_type() << " construction error: no image processor"; - throw lbann_exception(err.str()); - } - m_num_patches = rhs.m_num_patches; - m_master_pps = lbann::make_unique(*rhs.m_master_pps); - return (*this); -} - -imagenet_reader_patches::~imagenet_reader_patches() { -} - -void imagenet_reader_patches::set_defaults() { - m_image_width = 256; - m_image_height = 256; - m_image_num_channels = 3; - set_linearized_image_size(); - m_num_labels = 1000; - m_num_patches = 1; -} - -void imagenet_reader_patches::setup(int num_io_threads, std::shared_ptr io_thread_pool) { - image_data_reader::setup(num_io_threads, io_thread_pool); - replicate_processor(*m_master_pps, num_io_threads); -} - - -/// Replicate image processor for each OpenMP thread -bool imagenet_reader_patches::replicate_processor(const cv_process_patches& pp, const int nthreads) { - m_pps.resize(nthreads); - - // Construct thread private preprocessing objects out of a shared pointer - for (int i = 0; i < nthreads; ++i) { - m_pps[i] = lbann::make_unique(pp); - } - - bool ok = true; - for (int i = 0; ok && (i < nthreads); ++i) { - if (!m_pps[i]) ok = false; - } - - if (!ok || (nthreads <= 0)) { - std::stringstream err; - err << __FILE__<<" "<<__LINE__<< " :: " << get_type() << " construction error: cannot replicate image processor"; - throw lbann_exception(err.str()); - return false; - } - const std::vector dims = pp.get_data_dims(); - if ((dims.size() == 3u) && (dims[0] != 0u) && (dims[1] != 0u) && (dims[2] != 0u)) { - m_num_patches = static_cast(dims[0]); - m_image_width = static_cast(dims[1]); - m_image_height = static_cast(dims[2]); - set_linearized_image_size(); - } - if (pp.is_self_labeling()) { - m_num_labels = pp.get_num_labels(); - } - - return true; -} - -std::vector imagenet_reader_patches::create_datum_views(CPUMat& X, const int mb_idx) const { -/* - if (X.Height() != get_linearized_data_size()) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": inconsistent number of patches"); - } -*/ - std::vector X_v(m_num_patches); - El::Int h = 0; - for(int i=0; i < m_num_patches; ++i) { - 
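
// Each pass of this loop carves a zero-copy sub-view out of column
// mb_idx of the minibatch matrix: patch i occupies rows
// [i*L, (i+1)*L), where L = m_image_linearized_size, i.e. in effect
//
//   El::View(X_v[i], X, El::IR(i * L, (i + 1) * L),
//            El::IR(mb_idx, mb_idx + 1));  // view writes through to X
//
// so filling X_v[i] later lands the patch in the right slice of X.
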
El::View(X_v[i], X, El::IR(h, h + m_image_linearized_size), El::IR(mb_idx, mb_idx + 1)); - h = h + m_image_linearized_size; - } - return X_v; -} - -bool imagenet_reader_patches::fetch_datum(CPUMat& X, int data_id, int mb_idx) { - int tid = m_io_thread_pool->get_local_thread_id(); - const std::string imagepath = get_file_dir() + m_image_list[data_id].first; - - int width=0, height=0, img_type=0; - std::vector X_v = create_datum_views(X, mb_idx); - bool ret; - ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v, m_thread_buffer[tid], &m_thread_cv_buffer[tid]); - //ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v); - - if (m_pps[tid]->is_self_labeling()) { - m_image_list[data_id].second = m_pps[tid]->get_patch_label(); - } - - if(!ret) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": image_utils::load_image failed to load - " - + imagepath); - } - if((width * height * CV_MAT_CN(img_type)) != m_image_linearized_size) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": mismatch data size -- either width, height or channel - " - + imagepath + " [w,h,c]=[" + std::to_string(width) + "x" + std::to_string(height) - + "x" + std::to_string(CV_MAT_CN(img_type)) + "] != " + std::to_string(m_image_linearized_size)); - } - return true; -} - -} // namespace lbann diff --git a/src/data_readers/data_reader_jag.cpp b/src/data_readers/data_reader_jag.cpp index 1003d4eb90e..d1d24e48578 100644 --- a/src/data_readers/data_reader_jag.cpp +++ b/src/data_readers/data_reader_jag.cpp @@ -27,13 +27,14 @@ #include "lbann/utils/file_utils.hpp" #include "lbann/utils/cnpy_utils.hpp" -#include "lbann/data_readers/opencv_extensions.hpp" +#include "lbann/utils/image.hpp" #include "lbann/data_readers/data_reader_jag.hpp" #include // numeric_limits #include // max_element #include // accumulate #include // multiplies #include // is_same +#include namespace lbann { @@ -492,8 +493,7 @@ void data_reader_jag::normalize_image() { if (!m_image_loaded) { return; } - using depth_t = cv_image_type; - const int type_code = depth_t::T(1u); + const int type_code = CV_MAKETYPE(cv::DataType::depth, 1u); if (m_image_normalization == 1) { data_t* const ptr = get_image_ptr(0); @@ -520,26 +520,6 @@ data_reader_jag::data_t* data_reader_jag::get_image_ptr(const size_t i) const { return (m_image_loaded? 
cnpy_utils::data_ptr(m_images, {i}) : nullptr); } -cv::Mat data_reader_jag::get_image(const size_t i) const { - using InputBuf_T = cv_image_type; - - data_t* const ptr = get_image_ptr(i); - if (ptr == nullptr) { - return cv::Mat(); - } - // Construct a zero copying view to data - const cv::Mat img_org(m_linearized_image_size, 1, InputBuf_T::T(1u), - reinterpret_cast(ptr)); - - cv::Mat img; - if (std::is_same::value) { - img = img_org.clone(); - } else { - img_org.convertTo(img, cv_image_type::T(1u)); - } - return img.reshape(0, m_image_height); -} - data_reader_jag::data_t data_reader_jag::get_image_max() const { if (!m_image_loaded) { return std::numeric_limits::min(); @@ -660,8 +640,4 @@ bool data_reader_jag::fetch_label(CPUMat& Y, int data_id, int mb_idx) { return true; } -void data_reader_jag::save_image(Mat& pixels, const std::string filename, bool do_scale) { - internal_save_image(pixels, filename, m_image_height, m_image_width, 1, do_scale); -} - } // end of namespace lbann diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index a58256ee250..a1942a138b5 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -30,9 +30,12 @@ #include "lbann/data_store/data_store_conduit.hpp" #include "lbann/models/model.hpp" #include "lbann/utils/lbann_library.hpp" +#include "lbann/utils/image.hpp" +#include "lbann/utils/opencv.hpp" +#include "lbann/transforms/repack_HWC_to_CHW_layout.hpp" +#include "lbann/transforms/scale_and_translate.hpp" #include "lbann/utils/file_utils.hpp" // for add_delimiter() in load() -#include "lbann/data_readers/opencv_extensions.hpp" #include // numeric_limits #include // max_element #include // accumulate @@ -40,7 +43,6 @@ #include // is_same #include #include -#include "lbann/data_readers/image_utils.hpp" #include #include "lbann/utils/timer.hpp" #include "lbann/utils/glob.hpp" @@ -145,15 +147,9 @@ bool data_reader_jag_conduit::check_num_parallel_readers(long data_set_size) { return true; } -data_reader_jag_conduit::data_reader_jag_conduit(const std::shared_ptr& pp, bool shuffle) +data_reader_jag_conduit::data_reader_jag_conduit(bool shuffle) : generic_data_reader(shuffle) { set_defaults(); - - if (!pp) { - _THROW_LBANN_EXCEPTION_(get_type(), " construction error: no image processor"); - } - - m_master_pps = lbann::make_unique(*pp); } void data_reader_jag_conduit::copy_members(const data_reader_jag_conduit& rhs, const std::vector& ds_sample_move_list) { @@ -172,12 +168,6 @@ void data_reader_jag_conduit::copy_members(const data_reader_jag_conduit& rhs, c m_scalar_keys = rhs.m_scalar_keys; m_input_keys = rhs.m_input_keys; - if (!rhs.m_master_pps) { - _THROW_LBANN_EXCEPTION_(get_type(), " construction error: no image processor"); - } - - m_master_pps = lbann::make_unique(*rhs.m_master_pps); - m_uniform_input_type = rhs.m_uniform_input_type; m_output_scalar_prefix = rhs.m_output_scalar_prefix; @@ -290,35 +280,6 @@ void data_reader_jag_conduit::set_defaults() { void data_reader_jag_conduit::setup(int num_io_threads, std::shared_ptr io_thread_pool) { generic_data_reader::setup(num_io_threads, io_thread_pool); - replicate_processor(*m_master_pps, num_io_threads); -} - -/// Replicate image processor for each I/O thread -bool data_reader_jag_conduit::replicate_processor(const cv_process& pp, const int nthreads) { - m_pps.resize(nthreads); - - // Construct thread private preprocessing objects out of a shared pointer - for (int i = 0; i < nthreads; ++i) { - m_pps[i] = 
lbann::make_unique(pp); - } - - bool ok = true; - for (int i = 0; ok && (i < nthreads); ++i) { - if (!m_pps[i]) ok = false; - } - - if (!ok || (nthreads <= 0)) { - _THROW_LBANN_EXCEPTION_(get_type(), " cannot replicate image processor"); - return false; - } - - const std::vector dims = pp.get_data_dims(); - if ((dims.size() == 2u) && (dims[0] != 0u) && (dims[1] != 0u)) { - m_image_width = static_cast(dims[0]); - m_image_height = static_cast(dims[1]); - } - - return true; } const conduit::Node& data_reader_jag_conduit::get_conduit_node(const conduit::Node& n_base, const std::string key) { @@ -1253,102 +1214,6 @@ data_reader_jag_conduit::get_image_data(const size_t sample_id, conduit::Node& s return image_ptrs; } -cv::Mat data_reader_jag_conduit::cast_to_cvMat( - const std::pair img, const int height, const int num_ch) { - const int num_pixels = static_cast(img.first); - const ch_t* ptr = img.second; - - // add a zero copying view to data - using InputBuf_T = cv_image_type; - const cv::Mat image(num_pixels, 1, InputBuf_T::T(1u), - reinterpret_cast(const_cast(ptr))); - // reshape the image. Furter need to clone (deep-copy) the image - // to preserve the constness of the original data - return (image.reshape(num_ch, height)); -} - -/// Assumes the same parameters for the same channel from different views -void data_reader_jag_conduit::image_normalization(cv::Mat& img, size_t i, size_t ch) const { - const auto& tr = m_image_normalization_params.at(ch); - img.convertTo(img, -1, tr.first, tr.second); -} - -std::vector data_reader_jag_conduit::get_cv_images(const size_t sample_id, conduit::Node& sample) const { - const std::vector< std::vector > img_data(get_image_data(sample_id, sample)); - std::vector images; - - if (m_split_channels) { - images.reserve(img_data.size()*m_image_num_channels); - for (size_t i = 0u; i < img_data.size(); ++i) { - const auto& img = img_data[i]; - cv::Mat ch[m_image_num_channels]; - cv::split(cast_to_cvMat(std::make_pair(img.size(), img.data()), m_image_height, m_image_num_channels), ch); - for(int c = 0; c < m_image_num_channels; ++c) { - #if 1 // with normalization - image_normalization(ch[c], i, static_cast(c)); - #endif - images.emplace_back(ch[c].clone()); - } - } - } else { - images.reserve(img_data.size()); - for (size_t i = 0u; i < img_data.size(); ++i) { - const auto& img = img_data[i]; - #if 1 // with normalization - cv::Mat ch[m_image_num_channels]; - cv::split(cast_to_cvMat(std::make_pair(img.size(), img.data()), m_image_height, m_image_num_channels), ch); - for(int c = 0; c < m_image_num_channels; ++c) { - image_normalization(ch[c], i, static_cast(c)); - } - cv::Mat img_normalized; - cv::merge(ch, m_image_num_channels, img_normalized); - images.emplace_back(img_normalized); - #else - images.emplace_back(cast_to_cvMat(std::make_pair(img.size(), img.data()), m_image_height, m_image_num_channels).clone()); - #endif - } - } - return images; -} - -std::vector data_reader_jag_conduit::get_images(const size_t sample_id, conduit::Node& sample) const { - std::vector< std::vector > img_data(get_image_data(sample_id, sample)); - std::vector images; - - if (m_split_channels) { - images.resize(get_linearized_size(JAG_Image)); - size_t i = 0u; - size_t j = 0u; - for (const auto& img: img_data) { - const ch_t * const ptr_end = img.data() + img.size(); - for (int c=0; c < m_image_num_channels; ++c) { - const auto& tr = m_image_normalization_params.at(c); - for (const ch_t* ptr = img.data() + c; ptr < ptr_end; ptr += m_image_num_channels) { - #if 1 // with normalization - 
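
// The statement below does two jobs at once. Because ptr advances by
// m_image_num_channels, it de-interleaves the HWC pixel stream into
// planar (channel-major) order, and it simultaneously applies the
// per-channel affine normalization y = a*x + b taken from
// m_image_normalization_params. In isolation (types illustrative):
//
//   for (const ch_t* p = in + c; p < in_end; p += num_channels) {
//     out[k++] = cv::saturate_cast<ch_t>(*p * a + b);
//   }
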
images[i++] = cv::saturate_cast(*ptr * tr.first + tr.second); - #else - images[i++] = *ptr; - #endif - } - } - j ++; - } - } else { - images.reserve(get_linearized_size(JAG_Image)); - for (const auto& img: img_data) { - #if 1 // with normalization - // TODO: normalization needed - _THROW_LBANN_EXCEPTION_(_CN_, "get_images() : normalization not implemented yet"); - (void) img; - #else - images.insert(images.end(), img.cbegin(), ptr + img.cend()); - #endif - } - } - - return images; -} - std::vector data_reader_jag_conduit::get_scalars(const size_t sample_id, conduit::Node& sample) const { std::vector scalars; scalars.reserve(m_scalar_keys.size()); @@ -1438,7 +1303,6 @@ std::vector data_reader_jag_conduit::get_input return inputs; } - std::vector data_reader_jag_conduit::create_datum_views(CPUMat& X, const std::vector& sizes, const int mb_idx) const { std::vector X_v(sizes.size()); @@ -1449,28 +1313,46 @@ data_reader_jag_conduit::create_datum_views(CPUMat& X, const std::vector El::View(X_v[i], X, El::IR(h, h_end), El::IR(mb_idx, mb_idx + 1)); h = h_end; } - return X_v; + return std::move(X_v); } bool data_reader_jag_conduit::fetch(CPUMat& X, int data_id, conduit::Node& sample, int mb_idx, int tid, const data_reader_jag_conduit::variable_t vt, const std::string tag) { switch (vt) { case JAG_Image: { - const size_t num_images = get_num_img_srcs() - * static_cast(m_split_channels? m_image_num_channels : 1u); - const size_t image_size = m_split_channels? get_linearized_1ch_image_size() : get_linearized_image_size(); + const size_t num_images = get_num_img_srcs(); + const size_t num_channels = m_image_num_channels; + const size_t image_size = get_linearized_image_size(); const std::vector sizes(num_images, image_size); std::vector X_v = create_datum_views(X, sizes, mb_idx); - std::vector images = get_cv_images(data_id, sample); + std::vector< std::vector > img_data(get_image_data(data_id, sample)); - if (images.size() != num_images) { - _THROW_LBANN_EXCEPTION2_(_CN_, "fetch() : the number of images is not as expected", \ - std::to_string(images.size()) + "!=" + std::to_string(num_images)); + if (img_data.size() != num_images) { + _THROW_LBANN_EXCEPTION2_(_CN_, "fetch() : the number of images is not as expected ", \ + std::to_string(img_data.size()) + "!=" + std::to_string(num_images)); } + if (!m_split_channels && m_image_num_channels != 1) { + _THROW_LBANN_EXCEPTION2_(_CN_, "fetch() : transform pipeline now requires single channel images: num_channels=", \ + std::to_string(m_image_num_channels) + " split_channel=" + std::to_string(m_split_channels)); + } + + std::vector dims = {num_channels, static_cast(m_image_height), static_cast(m_image_width)}; + std::vector ch_dims = {static_cast(m_image_height), static_cast(m_image_width)}; + auto tll = lbann::transform::repack_HWC_to_CHW_layout(); for(size_t i=0u; i < num_images; ++i) { - int width, height, img_type; - image_utils::process_image(images[i], width, height, img_type, *(m_pps[tid]), X_v[i]); + CPUMat img_mat = CPUMat(utils::get_linearized_size(dims), 1, img_data[i].data(), utils::get_linearized_size(dims)); + utils::type_erased_matrix te_img(std::move(img_mat)); + CPUMat tgt_mat = CPUMat(utils::get_linearized_size(dims), 1); + tll.apply(te_img, X_v[i], dims); + const std::vector ch_sizes(num_channels, m_image_height * m_image_width); + std::vector X_ch_v = create_datum_views(X_v[i], ch_sizes, mb_idx); + for(size_t ch = 0; ch < num_channels; ch++) { + const auto& tr = m_image_normalization_params.at(ch); + auto s = 
lbann::transform::scale_and_translate(tr.first, tr.second); + utils::type_erased_matrix te_img_plane(std::move(X_ch_v[ch])); + s.apply(te_img_plane, ch_dims); + } } break; } @@ -1605,10 +1487,6 @@ void data_reader_jag_conduit::setup_data_store(int mini_batch_size) { } } -void data_reader_jag_conduit::save_image(Mat& pixels, const std::string filename, bool do_scale) { - internal_save_image(pixels, filename, m_image_height, m_image_width, 1, do_scale); -} - void data_reader_jag_conduit::print_schema(const size_t sample_id) const { //@TODO revisit later -- don't know how to handle this yet if (m_data_store != nullptr) { diff --git a/src/data_readers/data_reader_mnist.cpp b/src/data_readers/data_reader_mnist.cpp index b7b0612efac..81decd0ba88 100644 --- a/src/data_readers/data_reader_mnist.cpp +++ b/src/data_readers/data_reader_mnist.cpp @@ -57,9 +57,11 @@ bool mnist_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) { } auto pixel_col = X(El::IR(0, X.Height()), El::IR(mb_idx, mb_idx + 1)); - augment(pixel_col, m_image_height, m_image_width, 1); - normalize(pixel_col, 1); - pixel_noise(pixel_col); //add noise to image, disable by default + std::vector dims = { + 1ull, + static_cast(m_image_height), + static_cast(m_image_width)}; + m_transform_pipeline.apply(pixel_col, dims); return true; } diff --git a/src/data_readers/data_reader_mnist_siamese.cpp b/src/data_readers/data_reader_mnist_siamese.cpp deleted file mode 100644 index 9f7f2fce6a4..00000000000 --- a/src/data_readers/data_reader_mnist_siamese.cpp +++ /dev/null @@ -1,296 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// data_reader_mnist_siamese .hpp .cpp - data reader class for mnist dataset -// employing two images per sample to feed siamese model -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/data_reader_mnist_siamese.hpp" -#include "lbann/data_readers/image_utils.hpp" -#include "lbann/utils/file_utils.hpp" -#include -#include -#include -#include // shuffle() -#include -#include - -namespace lbann { - -data_reader_mnist_siamese::data_reader_mnist_siamese(const std::shared_ptr& pp, bool shuffle) - : data_reader_multi_images(pp, shuffle) { - set_defaults(); -} - -data_reader_mnist_siamese::data_reader_mnist_siamese(const data_reader_mnist_siamese& rhs) - : data_reader_multi_images(rhs), - m_shuffled_indices2(rhs.m_shuffled_indices2), - m_image_data(rhs.m_image_data) -{} - -data_reader_mnist_siamese& data_reader_mnist_siamese::operator=(const data_reader_mnist_siamese& rhs) { - // check for self-assignment - if (this == &rhs) { - return (*this); - } - - data_reader_multi_images::operator=(rhs); - m_shuffled_indices2 = rhs.m_shuffled_indices2; - m_image_data = rhs.m_image_data; - - return (*this); -} - -data_reader_mnist_siamese::~data_reader_mnist_siamese() { -} - -void data_reader_mnist_siamese::set_defaults() { - m_image_width = 28; - m_image_height = 28; - m_image_num_channels = 1; - set_linearized_image_size(); - m_num_labels = 2; - m_num_img_srcs = 2; -} - - -void data_reader_mnist_siamese::set_input_params( - const int, const int, const int, const int) { - set_defaults(); -} - - -/** - * Fill the input minibatch matrix with the samples of image pairs by using - * the overloaded fetch_datum() - */ -int data_reader_mnist_siamese::fetch_data(CPUMat& X, El::Matrix& indices_fetched) { - int nthreads = m_io_thread_pool->get_num_threads(); - if(!position_valid()) { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: " + get_type() + " load error: !position_valid" - + " -- current pos = " + std::to_string(m_current_pos) - + " and there are " + std::to_string(m_shuffled_indices.size()) + " indices"); - } - - /// Allow each thread to perform any preprocessing necessary on the - /// data source prior to fetching data - for (int t = 0; t < nthreads; t++) { - preprocess_data_source(t); - } - - int loaded_batch_size = get_loaded_mini_batch_size(); - const int end_pos = std::min(static_cast(m_current_pos+loaded_batch_size), - m_shuffled_indices.size()); - const int mb_size = std::min( - El::Int{((end_pos - m_current_pos) + m_sample_stride - 1) / m_sample_stride}, - X.Width()); - - El::Zeros_seq(X, X.Height(), X.Width()); - El::Zeros_seq(indices_fetched, mb_size, 1); - - std::string error_message; - for (int s = 0; s < mb_size; s++) { - int n = m_current_pos + (s * m_sample_stride); - sample_t index = std::make_pair(m_shuffled_indices[n], m_shuffled_indices2[n]); - bool valid = fetch_datum(X, index, s); - if (valid) { - El::Int index_coded = m_shuffled_indices[n] + m_shuffled_indices2[n]*(std::numeric_limits::max()+1); - indices_fetched.Set(s, 0, index_coded); - } else{ - error_message = "invalid datum"; - } - } - if (!error_message.empty()) { LBANN_ERROR(error_message); } - - /// Allow each thread to perform any postprocessing necessary on the - /// data source prior to fetching data - for (int t = 0; t < nthreads; t++) { - postprocess_data_source(t); - } - - return mb_size; -} - - -/** - * Fill the ground truth table by using the overloaded fetch_label() - */ -int 
data_reader_mnist_siamese::fetch_labels(CPUMat& Y) { - if(!position_valid()) { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: generic data reader load error: !position_valid"); - } - - int loaded_batch_size = get_loaded_mini_batch_size(); - const int end_pos = std::min(static_cast(m_current_pos+loaded_batch_size), - m_shuffled_indices.size()); - const int mb_size = std::min( - El::Int{((end_pos - m_current_pos) + m_sample_stride - 1) / m_sample_stride}, - Y.Width()); - - El::Zeros(Y, Y.Height(), Y.Width()); - - std::string error_message; - for (int s = 0; s < mb_size; s++) { - int n = m_current_pos + (s * m_sample_stride); - sample_t index = std::make_pair(m_shuffled_indices[n], m_shuffled_indices2[n]); - bool valid = fetch_label(Y, index, s); - if (!valid) { - error_message = "invalid label"; - } - } - if (!error_message.empty()) { LBANN_ERROR(error_message); } - - return mb_size; -} - - -bool data_reader_mnist_siamese::fetch_datum(CPUMat& X, std::pair data_id, int mb_idx) { - int tid = m_io_thread_pool->get_local_thread_id(); - std::vector X_v = create_datum_views(X, mb_idx); - - using raw_data_t = std::vector; - using local_sample_t = std::array; - local_sample_t sample; - sample[0] = &m_image_data[data_id.first]; - sample[1] = &m_image_data[data_id.second]; - - for(size_t i=0u; i < sample.size(); ++i) { - int width=0, height=0, img_type=0; - bool ret = true; - -#if 1 - // Construct a zero copying view to a portion of a preloaded data buffer - // This has nothing to do with the image type but only to create view on a block of bytes - using InputBuf_T = lbann::cv_image_type; - const cv::Mat image_buf(1, sample[i]->size()-1, InputBuf_T::T(1), &((*sample[i])[1])); -#else - raw_data_t image_buf(sample[i]->begin()+1, sample[i]->end()); // make copy of the raw data -#endif - ret = lbann::image_utils::import_image(image_buf, width, height, img_type, *(m_pps[tid]), X_v[i]); - - if(!ret) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": image_utils::import_image failed to load"); - } - if((width * height * CV_MAT_CN(img_type)) != m_image_linearized_size) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": mismatch data size -- either width, height or channel - " - + " [w,h,c]=[" + std::to_string(width) + "x" + std::to_string(height) - + "x" + std::to_string(CV_MAT_CN(img_type)) + "] != " + std::to_string(m_image_linearized_size)); - } - } - return true; -} - - -bool data_reader_mnist_siamese::fetch_label(CPUMat& Y, std::pair data_id, int mb_idx) { - const label_t label_1 = m_image_data[data_id.first][0]; - const label_t label_2 = m_image_data[data_id.second][0]; - const label_t label = static_cast(label_1 == label_2); - Y.Set(label, mb_idx, 1); - return true; -} - - -bool data_reader_mnist_siamese::fetch_datum(CPUMat& X, int data_id, int mb_idx) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": unused interface is called"); - return false; -} - - -bool data_reader_mnist_siamese::fetch_label(CPUMat& Y, int data_id, int mb_idx) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": unused interface is called"); - return false; -} - - -// The function is defined in data_readers/data_reader_mnist.cpp -extern void load_mnist_data(const std::string imagepath, const std::string labelpath, - const int first_n, std::vector >& 
m_image_data); - - -void data_reader_mnist_siamese::load() { - if (is_master()) { - std::cerr << "starting lbann::" << get_type() << "::load\n"; - } - m_image_data.clear(); - - const std::string FileDir = get_file_dir(); - const std::string ImageFile = get_data_filename(); - const std::string LabelFile = get_label_filename(); - - // set filepath - const std::string imagepath = FileDir + "/" + ImageFile; - const std::string labelpath = FileDir + "/" + LabelFile; - - if (is_master()) { - std::cerr << "read labels!\n"; - } - - load_mnist_data(imagepath, labelpath, m_first_n, m_image_data); - - if (m_first_n > 0) { - set_use_percent(1.0); - set_absolute_sample_count(0u); - } - - // reset indices - m_shuffled_indices.clear(); - m_shuffled_indices.resize(m_image_data.size()); - for (size_t n = 0; n < m_shuffled_indices.size(); n++) { - m_shuffled_indices[n] = n; - } - if (is_master()) { - std::cerr << "calling select_subset_of_data; m_shuffled_indices.size: " << - m_shuffled_indices.size() << std::endl; - } - select_subset_of_data(); -} - - -void data_reader_mnist_siamese::shuffle_indices() { - if (m_shuffled_indices2.size() != m_shuffled_indices.size()) { - m_shuffled_indices2 = m_shuffled_indices; - if (!m_shuffle) { - std::shuffle(m_shuffled_indices2.begin(), m_shuffled_indices2.end(), - get_data_seq_generator()); - } - } - if (m_shuffle) { - std::shuffle(m_shuffled_indices.begin(), m_shuffled_indices.end(), - get_data_seq_generator()); - std::shuffle(m_shuffled_indices2.begin(), m_shuffled_indices2.end(), - get_data_seq_generator()); - } -} - - -} // namespace lbann diff --git a/src/data_readers/data_reader_multi_images.cpp b/src/data_readers/data_reader_multi_images.cpp index f1f5208758f..b8027ecc0dc 100644 --- a/src/data_readers/data_reader_multi_images.cpp +++ b/src/data_readers/data_reader_multi_images.cpp @@ -28,7 +28,6 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/data_readers/data_reader_multi_images.hpp" -#include "lbann/data_readers/image_utils.hpp" #include "lbann/utils/file_utils.hpp" #include #include @@ -36,8 +35,8 @@ namespace lbann { -data_reader_multi_images::data_reader_multi_images(const std::shared_ptr& pp, bool shuffle) - : imagenet_reader(pp, shuffle) { +data_reader_multi_images::data_reader_multi_images(bool shuffle) + : imagenet_reader(shuffle) { set_defaults(); } @@ -97,32 +96,6 @@ std::vector data_reader_multi_images::create_datum_views(CPUMat& X, cons return X_v; } -bool data_reader_multi_images::fetch_datum(CPUMat& X, int data_id, int mb_idx) { - int tid = m_io_thread_pool->get_local_thread_id(); - std::vector X_v = create_datum_views(X, mb_idx); - - const img_src_t& img_src = m_image_list[data_id].first; - for(size_t i=0u; i < m_num_img_srcs; ++i) { - int width=0, height=0, img_type=0; - const std::string imagepath = get_file_dir() + img_src[i]; - bool ret = true; - ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v[i], m_thread_buffer[tid], &m_thread_cv_buffer[tid]); - - if(!ret) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": image_utils::load_image failed to load - " - + imagepath); - } - if((width * height * CV_MAT_CN(img_type)) != m_image_linearized_size) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": mismatch data size -- either width, height or channel - " - + imagepath + " [w,h,c]=[" + std::to_string(width) + "x" + std::to_string(height) - 
+ "x" + std::to_string(CV_MAT_CN(img_type)) + "] != " + std::to_string(m_image_linearized_size)); - } - } - return true; -} - bool data_reader_multi_images::fetch_label(CPUMat& Y, int data_id, int mb_idx) { const label_t label = m_image_list[data_id].second; Y.Set(label, mb_idx, 1); diff --git a/src/data_readers/data_reader_multihead_siamese.cpp b/src/data_readers/data_reader_multihead_siamese.cpp index cdb4c582042..28005407b8c 100644 --- a/src/data_readers/data_reader_multihead_siamese.cpp +++ b/src/data_readers/data_reader_multihead_siamese.cpp @@ -28,8 +28,8 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/data_readers/data_reader_multihead_siamese.hpp" -#include "lbann/data_readers/image_utils.hpp" #include "lbann/utils/file_utils.hpp" +#include "lbann/utils/image.hpp" #include #include #include @@ -38,14 +38,14 @@ namespace lbann { -data_reader_multihead_siamese::data_reader_multihead_siamese(const std::shared_ptr& pp, unsigned int nimages, bool shuffle) : data_reader_multi_images(pp, shuffle) { +data_reader_multihead_siamese::data_reader_multihead_siamese(unsigned int nimages, bool shuffle) : data_reader_multi_images(shuffle) { set_defaults(); m_num_img_srcs = nimages; m_samples = offline_patches_npz (m_num_img_srcs); } -data_reader_multihead_siamese::data_reader_multihead_siamese(const std::shared_ptr& pp, bool shuffle) - : data_reader_multi_images(pp, shuffle) { +data_reader_multihead_siamese::data_reader_multihead_siamese(bool shuffle) + : data_reader_multi_images(shuffle) { set_defaults(); } @@ -88,30 +88,14 @@ void data_reader_multihead_siamese::set_input_params(const int width, const int bool data_reader_multihead_siamese::fetch_datum(Mat& X, int data_id, int mb_idx) { - - int tid = m_io_thread_pool->get_local_thread_id(); - std::vector X_v = create_datum_views(X, mb_idx); - + std::vector X_v = create_datum_views(X, mb_idx); sample_t sample = m_samples.get_sample(data_id); - for(size_t i=0u; i < m_num_img_srcs; ++i) { - int width=0, height=0, img_type=0; - const std::string imagepath = get_file_dir() + sample.first[i]; - bool ret = true; - ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v[i], m_thread_buffer[tid], &m_thread_cv_buffer[tid]); - - if(!ret) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": image_utils::load_image failed to load - " - + imagepath); - } - if((width * height * CV_MAT_CN(img_type)) != m_image_linearized_size) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": mismatch data size -- either width, height or channel - " - + imagepath + " [w,h,c]=[" + std::to_string(width) + "x" + std::to_string(height) - + "x" + std::to_string(CV_MAT_CN(img_type)) + "] != " + std::to_string(m_image_linearized_size)); - } + for (size_t i = 0; i < m_num_img_srcs; ++i) { + El::Matrix image; + std::vector dims; + load_image(get_file_dir() + sample.first[i], image, dims); + m_transform_pipeline.apply(image, X_v[i], dims); } - return true; } diff --git a/src/data_readers/data_reader_triplet.cpp b/src/data_readers/data_reader_triplet.cpp index f2ee426a594..a6037cff2eb 100644 --- a/src/data_readers/data_reader_triplet.cpp +++ b/src/data_readers/data_reader_triplet.cpp @@ -30,16 +30,16 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/data_readers/data_reader_triplet.hpp" -#include "lbann/data_readers/image_utils.hpp" 
#include "lbann/utils/file_utils.hpp" +#include "lbann/utils/image.hpp" #include #include #include namespace lbann { -data_reader_triplet::data_reader_triplet(const std::shared_ptr& pp, bool shuffle) - : data_reader_multi_images(pp, shuffle) { +data_reader_triplet::data_reader_triplet(bool shuffle) + : data_reader_multi_images(shuffle) { set_defaults(); } @@ -82,27 +82,13 @@ void data_reader_triplet::set_input_params(const int width, const int height, co bool data_reader_triplet::fetch_datum(Mat& X, int data_id, int mb_idx) { - int tid = m_io_thread_pool->get_local_thread_id(); std::vector X_v = create_datum_views(X, mb_idx); - sample_t sample = m_samples.get_sample(data_id); - for(size_t i=0u; i < m_num_img_srcs; ++i) { - int width=0, height=0, img_type=0; - const std::string imagepath = get_file_dir() + sample.first[i]; - bool ret = true; - ret = lbann::image_utils::load_image(imagepath, width, height, img_type, *(m_pps[tid]), X_v[i], m_thread_buffer[tid], &m_thread_cv_buffer[tid]); - - if(!ret) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": image_utils::load_image failed to load - " - + imagepath); - } - if((width * height * CV_MAT_CN(img_type)) != m_image_linearized_size) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " " - + get_type() + ": mismatch data size -- either width, height or channel - " - + imagepath + " [w,h,c]=[" + std::to_string(width) + "x" + std::to_string(height) - + "x" + std::to_string(CV_MAT_CN(img_type)) + "] != " + std::to_string(m_image_linearized_size)); - } + for (size_t i = 0; i < m_num_img_srcs; ++i) { + El::Matrix image; + std::vector dims; + load_image(get_file_dir() + sample.first[i], image, dims); + m_transform_pipeline.apply(image, X_v[i], dims); } return true; } diff --git a/src/data_readers/image_preprocessor.cpp b/src/data_readers/image_preprocessor.cpp deleted file mode 100644 index 5166ab6ed8e..00000000000 --- a/src/data_readers/image_preprocessor.cpp +++ /dev/null @@ -1,335 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// image_preprocessor.cpp - Preprocessing utilities for image inputs -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/image_preprocessor.hpp" -#include "lbann/data_readers/image_utils.hpp" -#include "lbann/utils/random.hpp" -#include "lbann/utils/statistics.hpp" -#include "lbann/utils/exception.hpp" - -namespace { -const float pi = std::acos(-1); -}; - -namespace lbann { - -lbann_image_preprocessor::lbann_image_preprocessor() : - m_horizontal_flip(false), - m_vertical_flip(false), - m_rotation_range(0.0f), - m_horizontal_shift(0.0f), - m_vertical_shift(0.0f), - m_shear_range(0.0f), - m_mean_subtraction(false), - m_unit_variance(false), - m_scale(true), // We always did scaling by default. - m_z_score(false), - m_noise_factor(0.0f) { -} - -void lbann_image_preprocessor::augment(Mat& pixels, unsigned imheight, - unsigned imwidth, - unsigned num_channels) { - bool do_transform = m_horizontal_flip || m_vertical_flip || - m_rotation_range || m_horizontal_shift || m_vertical_shift || - m_shear_range; - if (do_transform) { - cv::Mat sqpixels = cv_pixels(pixels, imheight, imwidth, num_channels); - rng_gen& gen = get_io_generator(); - std::uniform_int_distribution bool_dist(0, 1); - // Flips. - bool horiz_flip = bool_dist(gen) && m_horizontal_flip; - bool vert_flip = bool_dist(gen) && m_vertical_flip; - if (horiz_flip || vert_flip) { - if (horiz_flip && !vert_flip) { - flip(sqpixels, 1); - } else if (!horiz_flip && vert_flip) { - flip(sqpixels, 0); - } else { - flip(sqpixels, -1); - } - } - // Translations. - float x_trans = 0.0f; - float y_trans = 0.0f; - if (m_horizontal_shift) { - std::uniform_real_distribution dist(-m_horizontal_shift, - m_horizontal_shift); - x_trans = dist(gen) * imwidth; - } - if (m_vertical_shift) { - std::uniform_real_distribution dist(-m_vertical_shift, - m_vertical_shift); - y_trans = dist(gen) * imheight; - } - Mat trans_mat; - El::Diagonal(trans_mat, std::vector({1.0f, 1.0f, 1.0f})); - trans_mat(0, 2) = x_trans; - trans_mat(1, 2) = y_trans; - // Shearing. - float shear = 0.0f; - if (m_shear_range) { - std::uniform_real_distribution dist(-m_shear_range, - m_shear_range); - shear = dist(gen); - } - Mat shear_mat; - El::Zeros(shear_mat, 3, 3); - shear_mat(0, 0) = 1.0f; - shear_mat(2, 2) = 1.0f; - shear_mat(0, 1) = -std::sin(shear); - shear_mat(1, 1) = std::cos(shear); - // Rotation. - float rotate = 0.0f; - if (m_rotation_range) { - std::uniform_real_distribution dist(-m_rotation_range, - m_rotation_range); - rotate = ::pi / 180.0f * dist(gen); - } - Mat rot_mat; - El::Zeros(rot_mat, 3, 3); - rot_mat(2, 2) = 1.0f; - rot_mat(0, 0) = std::cos(rotate); - rot_mat(0, 1) = -std::sin(rotate); - rot_mat(1, 0) = std::sin(rotate); - rot_mat(1, 1) = std::cos(rotate); - // Compute the final transformation. 
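-    // The final transform composes the three 3x3 homogeneous matrices,
-    // affine_mat = trans_mat * shear_mat * rot_mat, via the two Gemm calls
-    // below; affine_trans() then hands its top two rows to cv::warpAffine.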
- Mat affine_mat_tmp(3, 3); - Mat affine_mat(3, 3); - El::Gemm(El::NORMAL, El::NORMAL, (DataType) 1.0, trans_mat, shear_mat, - (DataType) 0.0, affine_mat_tmp); - El::Gemm(El::NORMAL, El::NORMAL, (DataType) 1.0, affine_mat_tmp, rot_mat, - (DataType) 0.0, affine_mat); - affine_trans(sqpixels, affine_mat); - col_pixels(sqpixels, pixels, num_channels); - } -} - -void lbann_image_preprocessor::normalize(Mat& pixels, unsigned num_channels) { - if (m_z_score || (m_mean_subtraction && m_unit_variance)) { - z_score(pixels, num_channels); - } else { - if (m_scale) { - unit_scale(pixels, num_channels); - } - if (m_mean_subtraction) { - mean_subtraction(pixels, num_channels); - } - if (m_unit_variance) { - unit_variance(pixels, num_channels); - } - } -} - -void lbann_image_preprocessor::mean_subtraction(Mat& pixels, - unsigned num_channels) { - const unsigned height = pixels.Height(); - const unsigned height_per_channel = height / num_channels; - for (unsigned channel = 0; channel < num_channels; ++channel) { - const unsigned channel_start = channel*height_per_channel; - const unsigned channel_end = (channel+1)*height_per_channel; - Mat pixels_channel = El::View(pixels, El::IR(channel_start, channel_end), El::ALL); - DataType mean, stdev; - entrywise_mean_and_stdev(pixels_channel, mean, stdev); - for (unsigned i = 0; i < height_per_channel; ++i) { - DataType& pixels_entry = pixels_channel(i, 0); - pixels_entry -= mean; - } - } -} - -void lbann_image_preprocessor::unit_variance( - Mat& pixels, unsigned num_channels) { - - // Get image parameters - const unsigned height = pixels.Height(); - const unsigned height_per_channel = height / num_channels; - - // Scale each channel separately - for (unsigned channel = 0; channel < num_channels; ++channel) { - const unsigned channel_start = channel*height_per_channel; - const unsigned channel_end = (channel+1)*height_per_channel; - Mat pixels_channel = El::View(pixels, El::IR(channel_start, channel_end), El::ALL); - DataType mean, stdev; - entrywise_mean_and_stdev(pixels_channel, mean, stdev); - if(stdev > DataType(1e-7)*std::abs(mean)) { - const DataType inv_stdev = 1 / stdev; - for (unsigned i = 0; i < height_per_channel; ++i) { - DataType& pixels_entry = pixels_channel(i, 0); - pixels_entry = (pixels_entry - mean) * inv_stdev + mean; - } - } - } - -} - -void lbann_image_preprocessor::unit_scale(Mat& pixels, - unsigned num_channels) { - // Pixels are in range [0, 255], normalize using that. - // Channels are not relevant here. 
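-  // i.e., out = in / 255, mapping [0, 255] onto [0, 1].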
- El::Scale(DataType(1) / 255, pixels); -} - - -void lbann_image_preprocessor::pixel_noise(Mat& pixels) -{ - if(m_noise_factor){ - Mat X_noise; - El::Gaussian(X_noise, pixels.Height(), pixels.Width(), DataType(0), DataType(1)); - El::Axpy(m_noise_factor,X_noise,pixels); - //@todo - clip to min and max of input entry - auto clip = [](const DataType& z) { - return std::max(DataType(0), std::min(z,DataType(1))); - }; - EntrywiseMap(pixels, El::MakeFunction(clip)); - } -} - -void lbann_image_preprocessor::z_score(Mat& pixels, - unsigned num_channels) { - - // Get image parameters - const unsigned height = pixels.Height(); - const unsigned height_per_channel = height / num_channels; - - // Shift and scale each channel separately - for (unsigned channel = 0; channel < num_channels; ++channel) { - const unsigned channel_start = channel*height_per_channel; - const unsigned channel_end = (channel+1)*height_per_channel; - Mat pixels_channel = El::View(pixels, El::IR(channel_start, channel_end), El::ALL); - DataType mean, stdev; - entrywise_mean_and_stdev(pixels_channel, mean, stdev); - if(stdev > DataType(1e-7)*std::abs(mean)) { - const DataType inv_stdev = 1 / stdev; - for (unsigned i = 0; i < height_per_channel; ++i) { - DataType& pixels_entry = pixels_channel(i, 0); - pixels_entry = (pixels_entry - mean) * inv_stdev; - } - } else { - Zero(pixels_channel); - } - } - -} - -cv::Mat lbann_image_preprocessor::cv_pixels(const Mat& pixels, - unsigned imheight, - unsigned imwidth, - unsigned num_channels) { - if (num_channels == 1) { - cv::Mat m(imheight, imwidth, CV_32FC1); - for (unsigned y = 0; y < imheight; ++y) { - for (unsigned x = 0; x < imwidth; ++x) { - m.at(y, x) = pixels(y * imwidth + x, 0); - } - } - return m; - } else if (num_channels == 3) { - cv::Mat m(imheight, imwidth, CV_32FC3); - for (unsigned y = 0; y < imheight; ++y) { - for (unsigned x = 0; x < imwidth; ++x) { - cv::Vec3f pixel; - unsigned offset = y * imwidth + x; - pixel[0] = pixels(offset, 0); - pixel[1] = pixels(offset + imheight*imwidth, 0); - pixel[2] = pixels(offset + 2*imheight*imwidth, 0); - m.at(y, x) = pixel; - } - } - return m; - } else { - throw lbann_exception(std::string{} + __FILE__ + " " + - std::to_string(__LINE__) + - "Only support 1 and 3 channels"); - } -} - -void lbann_image_preprocessor::col_pixels(const cv::Mat& sqpixels, Mat& pixels, - unsigned num_channels) { - unsigned imheight = sqpixels.rows; - unsigned imwidth = sqpixels.cols; - if (num_channels == 1) { - for (unsigned y = 0; y < imheight; ++y) { - for (unsigned x = 0; x < imwidth; ++x) { - pixels(y * imwidth + x, 0) = sqpixels.at(y, x); - } - } - } else if (num_channels == 3) { - for (unsigned y = 0; y < imheight; ++y) { - for (unsigned x = 0; x < imwidth; ++x) { - cv::Vec3f pixel = sqpixels.at(y, x); - unsigned offset = y * imwidth + x; - pixels(offset, 0) = pixel[0]; - pixels(offset + imheight*imwidth, 0) = pixel[1]; - pixels(offset + 2*imheight*imwidth, 0) = pixel[2]; - } - } - } else { - throw lbann_exception(std::string{} + __FILE__ + " " + - std::to_string(__LINE__) + - "Only support 1 and 3 channels"); - } -} - -void lbann_image_preprocessor::flip(cv::Mat& sqpixels, int flip_flag) { - // In/out must be different. - cv::Mat sqpixels_copy = sqpixels.clone(); - cv::flip(sqpixels_copy, sqpixels, flip_flag); -} - -void lbann_image_preprocessor::affine_trans(cv::Mat& sqpixels, - const Mat& trans) { - cv::Mat sqpixels_copy = sqpixels.clone(); - // Construct the OpenCV transformation matrix. 
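-  // cv::warpAffine expects a 2x3 single-precision matrix, so only the top
-  // two rows of the 3x3 homogeneous transform are copied over.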
- cv::Mat cv_trans(2, 3, CV_32FC1); - cv_trans.at(0, 0) = trans(0, 0); - cv_trans.at(0, 1) = trans(0, 1); - cv_trans.at(0, 2) = trans(0, 2); - cv_trans.at(1, 0) = trans(1, 0); - cv_trans.at(1, 1) = trans(1, 1); - cv_trans.at(1, 2) = trans(1, 2); - cv::warpAffine(sqpixels_copy, sqpixels, cv_trans, sqpixels.size(), - cv::INTER_LINEAR, cv::BORDER_REPLICATE); -} - -void lbann_image_preprocessor::internal_save_image( - Mat& pixels, const std::string filename, unsigned imheight, unsigned imwidth, - unsigned num_channels, bool do_scale) { - cv::Mat sqpixels = cv_pixels(pixels, imheight, imwidth, num_channels); - cv::Mat converted_pixels; - int dst_type = 0; - if (num_channels == 1) { - dst_type = CV_8UC1; - } else if (num_channels == 3) { - dst_type = CV_8UC3; - } // cv_pixels ensures no other case happens. - sqpixels.convertTo(converted_pixels, dst_type, do_scale ? 255.0f : 1.0f); - cv::imwrite(filename, converted_pixels); -} - -} // namespace lbann diff --git a/src/data_readers/image_utils.cpp b/src/data_readers/image_utils.cpp deleted file mode 100644 index 64102e334c5..00000000000 --- a/src/data_readers/image_utils.cpp +++ /dev/null @@ -1,370 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// image_utils .cpp .hpp - Image I/O utility functions -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/image_utils.hpp" -#include "lbann/utils/exception.hpp" - -#define _THROW_EXCEPTION_NO_OPENCV_() { \ - std::stringstream err; \ - err << __FILE__ << " " << __LINE__ \ - << " :: not compiled with LBANN_ENABLE_OPENCV!"; \ - throw lbann_exception(err.str()); \ -} - - -namespace lbann { - -bool image_utils::loadIMG(const std::string& Imagefile, int& Width, int& Height, bool Flip, unsigned char *&Pixels, std::vector& buf) { -#ifdef LBANN_HAS_OPENCV - cv::Mat image = cv_utils::lbann_imread(Imagefile, _LBANN_CV_COLOR_, buf); - if (image.empty()) { - return false; - } - - Width = image.cols; - Height = image.rows; - - for (int y = 0; y < Height; y++) { - for (int x = 0; x < Width; x++) { - cv::Vec3b pixel = image.at(y, x); - int offset = (Flip) ? 
((Height - 1 - y) * Width + x) : (y * Width + x); - Pixels[offset] = pixel[_LBANN_CV_BLUE_]; - Pixels[offset + Height*Width] = pixel[_LBANN_CV_GREEN_]; - Pixels[offset + 2*Height*Width] = pixel[_LBANN_CV_RED_]; - } - } - - return true; -#else - _THROW_EXCEPTION_NO_OPENCV_(); - return false; -#endif -} - -bool image_utils::loadIMG(std::vector& image_buf, int& Width, int& Height, bool Flip, unsigned char *&Pixels) { -#ifdef LBANN_HAS_OPENCV - cv::Mat image = cv::imdecode(image_buf, _LBANN_CV_COLOR_); - //cv::Mat image = cv::imdecode(image_buf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH); - if (image.empty()) { - return false; - } - - Width = image.cols; - Height = image.rows; - - for (int y = 0; y < Height; y++) { - for (int x = 0; x < Width; x++) { - cv::Vec3b pixel = image.at(y, x); - int offset = (Flip) ? ((Height - 1 - y) * Width + x) : (y * Width + x); - Pixels[offset] = pixel[_LBANN_CV_BLUE_]; - Pixels[offset + Height*Width] = pixel[_LBANN_CV_GREEN_]; - Pixels[offset + 2*Height*Width] = pixel[_LBANN_CV_RED_]; - } - } - - return true; -#else - _THROW_EXCEPTION_NO_OPENCV_(); - return false; -#endif -} - -bool image_utils::saveIMG(const std::string& Imagefile, int Width, int Height, bool Flip, unsigned char *Pixels) { -#ifdef LBANN_HAS_OPENCV - cv::Mat image = cv::Mat(Height, Width, CV_8UC3); - - for (int y = 0; y < Height; y++) { - for (int x = 0; x < Width; x++) { - cv::Vec3b pixel; - int offset = (Flip) ? ((Height - 1 - y) * Width + x) : (y * Width + x); - pixel[_LBANN_CV_BLUE_] = Pixels[offset]; - pixel[_LBANN_CV_GREEN_] = Pixels[offset + Height*Width]; - pixel[_LBANN_CV_RED_] = Pixels[offset + 2*Height*Width]; - image.at(y, x) = pixel; - } - } - cv::imwrite(Imagefile, image); - - return true; -#else - _THROW_EXCEPTION_NO_OPENCV_(); - return false; -#endif -} - - -#ifdef LBANN_HAS_OPENCV -bool image_utils::process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process& pp, CPUMat& out) { - bool ok1 = !image.empty() && pp.preprocess(image); - bool ok2 = ok1 && cv_utils::copy_cvMat_to_buf(image, out, pp); - // Disabling normalizer is needed because normalizer is not necessarily - // called during preprocessing but implicitly applied during data copying to - // reduce overhead. - pp.disable_lazy_normalizer(); - - if (!ok2) { - throw lbann_exception(std::string("image_utils::process_image(): image ") + - (image.empty()? "is empty." : - (ok1? "copying failed." : - "preprocessing failed."))); - } - - Width = image.cols; - Height = image.rows; - Type = image.type(); - - return ok2; -} - -bool image_utils::process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process& pp, std::vector& out) { - bool ok1 = !image.empty() && pp.preprocess(image); - bool ok2 = ok1 && cv_utils::copy_cvMat_to_buf(image, out, pp); - pp.disable_lazy_normalizer(); - - if (!ok2) { - throw lbann_exception(std::string("image_utils::process_image(): image ") + - (image.empty()? "is empty." : - (ok1? "copying failed." 
: - "preprocessing failed."))); - } - - Width = image.cols; - Height = image.rows; - Type = image.type(); - - return ok2; -} - -bool image_utils::process_image(cv::Mat& image, int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& out) { - std::vector patches; - bool ok1 = !image.empty() && pp.preprocess(image, patches); - bool ok2 = ok1 && (patches.size() != 0u) && (patches.size() == out.size()); - bool ok3 = ok2; - - for(size_t i=0u; ok3 && (i < patches.size()); ++i) { - ok3 = cv_utils::copy_cvMat_to_buf(patches[i], out[i], pp); - } - pp.disable_lazy_normalizer(); - - if (!ok3) { - throw lbann_exception(std::string("image_utils::process_image(): image ") + - (image.empty()? "is empty." : - (ok1? (ok2? "copying failed." : - "extracted to invalid number of patches: " + - std::to_string(patches.size()) + " != " + - std::to_string(out.size())) : - "preprocessing failed."))); - } - - Width = patches[0].cols; - Height = patches[0].rows; - Type = patches[0].type(); - - return ok3; -} -#endif // LBANN_HAS_OPENCV - -/** - * @param filename The name of the image file to read in - * @param Width The width of the image read - * @param Height The height of the image read - * @param Type The type of the image read (OpenCV code used for cv::Mat) - * @param pp The pre-processing parameters - * @param data The pre-processed image data to be stored in El::Matrix format - * @param buf A thread safe buffer for local, temporary, image decoding - */ -bool image_utils::load_image(const std::string& filename, - int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, std::vector& buf, cv::Mat* cv_buf) { -#ifdef LBANN_HAS_OPENCV - cv::Mat image = cv_utils::lbann_imread(filename, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, buf, cv_buf); - - return process_image(image, Width, Height, Type, pp, data); -#else - _THROW_EXCEPTION_NO_OPENCV_(); - return false; -#endif // LBANN_HAS_OPENCV -} - -/** - * @param filename The name of the image file to read in - * @param Width The width of a patch from the image read - * @param Height The height of a patch from the image read - * @param Type The type of the image patches (OpenCV code used for cv::Mat) - * @param pp The pre-processing parameters - * @param data The pre-processed image data to be stored in El::Matrix format - * @param buf A thread safe buffer for local, temporary, image decoding - */ -bool image_utils::load_image(const std::string& filename, - int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, std::vector& buf, cv::Mat* cv_buf) { -#ifdef LBANN_HAS_OPENCV - cv::Mat image = cv_utils::lbann_imread(filename, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, buf, cv_buf); - - return process_image(image, Width, Height, Type, pp, data); -#else - _THROW_EXCEPTION_NO_OPENCV_(); - return false; -#endif // LBANN_HAS_OPENCV -} - -//XX -/** - * @param filename The name of the image file to read in - * @param Width The width of a patch from the image read - * @param Height The height of a patch from the image read - * @param Type The type of the image patches (OpenCV code used for cv::Mat) - * @param pp The pre-processing parameters - * @param data The pre-processed image data to be stored in El::Matrix format - */ -bool image_utils::load_image(std::vector& image_buf, - int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, cv::Mat* cv_buf) { - - return import_image(image_buf, Width, Height, Type, pp, data, cv_buf); -} - -/** - * @param filename The name of the image file to write - * @param Width The 
width of the image to be written - * @param Height The height of the image to be written - * @param Type The type of the image to be written (OpenCV code used for cv::Mat) - * @param pp The post-processing parameters - * @param data The image data in El::Matrix format to post-process and write - */ -bool image_utils::save_image(const std::string& filename, - const int Width, const int Height, const int Type, cv_process& pp, const CPUMat& data) { -#ifdef LBANN_HAS_OPENCV - pp.determine_inverse_lazy_normalization(); - cv::Mat image = cv_utils::copy_buf_to_cvMat(data, Width, Height, Type, pp); - bool ok = !image.empty() && pp.postprocess(image); - - _LBANN_MILD_EXCEPTION(!ok, "Image postprocessing has failed.", false) - - return (ok && cv::imwrite(filename, image)); -#else - _THROW_EXCEPTION_NO_OPENCV_(); - return false; -#endif // LBANN_HAS_OPENCV -} - -/** - * @param inbuf The buffer that contains the raw bytes read from an image file - * This can be for example, const std:vector& or const cv::Mat&. - * http://docs.opencv.org/trunk/d4/d32/classcv_1_1__InputArray.html - * @param Width The width of the image consturcted out of inbuf - * @param Height The height of the image consructed - * @param Type The type of the image constructed (OpenCV code used for cv::Mat) - * @param pp The pre-processing parameters - * @param data The pre-processed image data. A set of sub-matrix Views can be used to store the data. - */ -bool image_utils::import_image(cv::InputArray inbuf, - int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, cv::Mat* cv_buf) { -#ifdef LBANN_HAS_OPENCV - cv::Mat image; - if(cv_buf != nullptr) { - image = cv::imdecode(inbuf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, cv_buf); - }else { - image = cv::imdecode(inbuf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH); - } - - return process_image(image, Width, Height, Type, pp, data); -#else - _THROW_EXCEPTION_NO_OPENCV_(); - return false; -#endif // LBANN_HAS_OPENCV -} - -/** - * @param inbuf The buffer that contains the raw bytes read from an image file - * This can be for example, const std:vector& or const cv::Mat&. - * http://docs.opencv.org/trunk/d4/d32/classcv_1_1__InputArray.html - * @param Width The width of a patch from the image consturcted out of inbuf - * @param Height The height of a patch from the image consructed - * @param Type The type of the image patches (OpenCV code used for cv::Mat) - * @param pp The pre-processing parameters - * @param data The pre-processed image data. A set of sub-matrix Views can be used to store the data. 
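- *
- * A rough usage sketch (the names below are illustrative, not taken from
- * this file; encoded_bytes is e.g. a std::vector<unsigned char> holding the
- * raw file contents, and patch_views holds one preallocated view per patch):
- *   int w = 0, h = 0, type = 0;
- *   std::vector<CPUMat> patch_views = create_datum_views(X, mb_idx);
- *   image_utils::import_image(encoded_bytes, w, h, type, pp, patch_views, nullptr);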
- */ -bool image_utils::import_image(cv::InputArray inbuf, - int& Width, int& Height, int& Type, cv_process_patches& pp, std::vector& data, cv::Mat* cv_buf) { -#ifdef LBANN_HAS_OPENCV - cv::Mat image; - if(cv_buf != nullptr) { - image = cv::imdecode(inbuf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, cv_buf); - }else { - image = cv::imdecode(inbuf, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH); - } - - return process_image(image, Width, Height, Type, pp, data); -#else - _THROW_EXCEPTION_NO_OPENCV_(); - return false; -#endif // LBANN_HAS_OPENCV -} - -/** - * @param fileExt The format extension name of image file: e.g., ".jpeg", ".png" - * @param outbuf The preallocated buffer to contain the bytes to be written into an image file - * @param Width The width of the image to be consturcted based on the given data of ::Mat - * @param Height The height of the image - * @param Type The type of the image (OpenCV code used for cv::Mat) - * @param pp The post-processing parameters - * @param data The image data. A sub-matrix View can be passed instead of the entire matrix. - */ -bool image_utils::export_image(const std::string& fileExt, std::vector& outbuf, - const int Width, const int Height, const int Type, cv_process& pp, const CPUMat& data) { -#ifdef LBANN_HAS_OPENCV - pp.determine_inverse_lazy_normalization(); - cv::Mat image = cv_utils::copy_buf_to_cvMat(data, Width, Height, Type, pp); - bool ok = !image.empty() && pp.postprocess(image); - - _LBANN_MILD_EXCEPTION(!ok, "Either the image is empty or postprocessing has failed.", false) - _LBANN_MILD_EXCEPTION(fileExt.empty(), "Empty file format extension!", false) - - const std::string ext = ((fileExt[0] != '.')? ("." + fileExt) : fileExt); - - static const size_t max_img_header_size = 1024; - const size_t capacity = image_data_amount(image) + max_img_header_size; - - if (outbuf.size() < capacity) { - //std::cout << "bytes reserved for the image: " << image_data_amount(image) << std::endl; - outbuf.resize(capacity); - } - - return (ok && cv::imencode(ext, image, outbuf)); -#else - _THROW_EXCEPTION_NO_OPENCV_(); - return false; -#endif // LBANN_HAS_OPENCV -} - - - -bool image_utils::load_image(std::vector& image_buf, - int& Width, int& Height, int& Type, cv_process& pp, CPUMat& data, cv::Mat* cv_buf) { - return import_image(image_buf, Width, Height, Type, pp, data, cv_buf); -} - -} // namespace lbann diff --git a/src/data_readers/lbann_data_generator.cpp b/src/data_readers/lbann_data_generator.cpp deleted file mode 100644 index f215248d906..00000000000 --- a/src/data_readers/lbann_data_generator.cpp +++ /dev/null @@ -1,104 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_data_generator .hpp .cpp - Synthetic Data Generator -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/lbann_data_generator.hpp" -#include "lbann/utils/random.hpp" -#include - -lbann::DataGenerator::DataGenerator(Int num_samples, Int width, Int height, Int batchSize) - : DataReader(batchSize, true) -{ - m_num_samples = num_samples; - m_data_width = width; - m_data_height = height; -} - -lbann::DataGenerator::DataGenerator(const DataGenerator& source) - : DataReader((const DataReader&) source), - m_data_width(source.m_data_width), m_data_height(source.m_data_height) -{ - // No need to deallocate data on a copy constuctor - - // clone_image_data(source); -} - -lbann::DataGenerator::~DataGenerator() -{ - // this->free(); -} - -void lbann::DataGenerator::load() { - ShuffledIndices.clear(); - ShuffledIndices.resize(m_num_samples); - for (size_t n = 0; n < ShuffledIndices.size(); n++) { - ShuffledIndices[n] = n; - } - uniform_fill_procdet(m_data, get_linearized_data_size(), m_num_samples, 128, 128); -} - -int lbann::DataGenerator::fetch_data(Mat& X) -{ - if(!DataReader::position_valid()) { - stringstream err; - err << __FILE__<<" "<<__LINE__<< " :: Data Generator load error: !position_valid"; - throw lbann_exception(err.str()); - } - - int current_batch_size = getBatchSize(); - - int n = 0; - for (n = CurrentPos; n < CurrentPos + current_batch_size; n++) { - if (n >= (int)ShuffledIndices.size()) - break; - - int k = n - CurrentPos; - int index = ShuffledIndices[n]; - - for (int p = 0; p < get_linearized_data_size(); p++) { - X.Set(p, k, m_data.GetLocal(p, index)); - } - } - - return (n - CurrentPos); -} - -// Assignment operator -lbann::DataGenerator& lbann::DataGenerator::operator=(const DataGenerator& source) -{ - // check for self-assignment - if (this == &source) - return *this; - - // Call the parent operator= function - DataReader::operator=(source); - - this->m_data_width = source.m_data_width; - this->m_data_height = source.m_data_height; - - return *this; -} diff --git a/src/data_readers/patchworks/patchworks.cpp b/src/data_readers/patchworks/patchworks.cpp deleted file mode 100644 index 40fa91cea9f..00000000000 --- a/src/data_readers/patchworks/patchworks.cpp +++ /dev/null @@ -1,182 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. 
You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks.cpp - LBANN PATCHWORKS main interface implementation -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS main interface implementation - * - includes the main interface function definitions - */ - -#include "lbann/data_readers/patchworks/patchworks.hpp" - -#ifdef LBANN_HAS_OPENCV -#include "lbann/utils/random.hpp" -#include "lbann/data_readers/patchworks/patchworks_stats.hpp" - -namespace lbann { -namespace patchworks { - -#if _PATCHWORKS_STAT_FLOAT_ == 32 -#define _f f -#elif _PATCHWORKS_STAT_FLOAT_ == 64 -#define _f -#else -#error need to set _PATCHWORKS_STAT_FLOAT_ -#endif - -std::pair check_min_max(const cv::Mat& _img) { - cv::Mat img = _img.clone(); - - double maxVal = 0.0; - double minVal = 0.0; - const int nCh = img.channels(); - - img.reshape(1); - cv::minMaxLoc(img, &minVal, &maxVal, nullptr, nullptr); - img.reshape(nCh); - - //std::cout << "min max : " << minVal << ' ' << maxVal << std::endl; - return std::make_pair(minVal, maxVal); -} - -cv::Mat correct_chromatic_aberration(const cv::Mat& _img) { - if (_img.channels() != 3) { - return _img.clone(); - } - - const int img_depth = _img.depth(); - - std::pair range_org = check_min_max(_img); - - cv::Mat img; // float matrix - - _img.convertTo(img, _PW_CV_FP_); - - static const pw_fp_t a[3] = {-1.0 _f, 2.0 _f, -1.0 _f}; // BGR order - static const pw_fp_t aa = a[0]*a[0] + a[1]*a[1]+ a[2]*a[2]; - // A = a'*a/(a*a') - //static const pw_fp_t A[3][3] = {{a[0]*a[0]/aa, a[0]*a[1]/aa, a[0]*a[2]/aa}, - // {a[1]*a[0]/aa, a[1]*a[1]/aa, a[1]*a[2]/aa}, - // {a[2]*a[0]/aa, a[2]*a[1]/aa, a[2]*a[2]/aa}}; - // B = (I - A)' - static const pw_fp_t B[3][3] = {{1.0 _f-a[0] *a[0]/aa, a[0] *a[1]/aa, a[0] *a[2]/aa}, - {a[1] *a[0]/aa, 1.0 _f-a[1] *a[1]/aa, a[1] *a[2]/aa}, - {a[2] *a[0]/aa, a[2] *a[1]/aa, 1.0 _f-a[2] *a[2]/aa} - }; - - cv::MatIterator_ it = img.begin(); - cv::MatIterator_ itend = img.end(); - - for ( ; it != itend; ++it) { - const auto b0 = static_cast((*it)[0]); - const auto g0 = static_cast((*it)[1]); - const auto r0 = static_cast((*it)[2]); - - pw_fp_t b = b0 * B[0][0] + g0 * B[1][0] + r0 * B[2][0]; - pw_fp_t g = b0 * B[0][1] + g0 * B[1][1] + r0 * B[2][1]; - pw_fp_t r = b0 * B[0][2] + g0 * B[1][2] + r0 * B[2][2]; - - //std::cout << r0 << ' ' << g0 << ' ' << b0 << " " << r << ' ' << g << ' ' << b << std::endl; - (*it) = pw_cv_vec3(b,g,r); - } - - std::pair range_new = check_min_max(img); - cv::Mat img_final; - //(x-range_new.first)*(range_org.second-range_org.first)/(range_new.second-range_new.first) + range_org.first; - const double alpha = (range_org.second-range_org.first)/(range_new.second-range_new.first); - const double beta = range_org.first - range_new.first * alpha; - img.convertTo(img_final, img_depth, alpha, beta); - - //std::pair range_final = check_min_max(img_final); - - return img_final; -} - -cv::Mat drop_2channels(const cv::Mat& _img) { - if (_img.channels() != 3) { - return _img.clone(); - } - - const int img_depth = _img.depth(); - - cv::Mat img; // pw_fp_t matrix - _img.convertTo(img, _PW_CV_FP_); - - // compute channel to 
remain - pw_fp_t m[3] = {0.0 _f, 0.0 _f, 0.0 _f}; - - ::lbann::rng_gen& gen = ::lbann::get_io_generator(); - - std::uniform_int_distribution rg_ch(0, 2); - const int chosenCh = rg_ch(gen); - - m[chosenCh] = 1.0 _f; - - // compute white noise - std::vector stats; - get_channel_stats(_img, stats); - - const auto avg = static_cast(stats[chosenCh].avg); - const auto dev = static_cast(stats[chosenCh].stdev/100.0); - pw_fp_t avgs[3] = {avg, avg, avg}; - pw_fp_t devs[3] = {dev, dev, dev}; - - std::normal_distribution rg_ch0(avgs[0], devs[0]); - std::normal_distribution rg_ch1(avgs[1], devs[1]); - std::normal_distribution rg_ch2(avgs[2], devs[2]); - - cv::MatIterator_ it = img.begin(); - cv::MatIterator_ itend = img.end(); - - for ( ; it != itend; ++it) { - const auto b0 = static_cast((*it)[0]); - const auto g0 = static_cast((*it)[1]); - const auto r0 = static_cast((*it)[2]); - -#if 1 - pw_fp_t b = b0*m[0] + (1.0-m[0])*rg_ch0(gen); - pw_fp_t g = g0*m[1] + (1.0-m[1])*rg_ch1(gen); - pw_fp_t r = r0*m[2] + (1.0-m[2])*rg_ch2(gen); -#else - pw_fp_t b = b0*m[0]; - pw_fp_t g = g0*m[1]; - pw_fp_t r = r0*m[2]; -#endif - - //std::cout << r0 << ' ' << g0 << ' ' << b0 << " " << r << ' ' << g << ' ' << b << std::endl; - (*it) = pw_cv_vec3(b,g,r); - } - - cv::Mat img_final; - img.convertTo(img_final, img_depth); - - return img_final; -} - -} // end of namespace patchworks -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/patchworks/patchworks_ROI.cpp b/src/data_readers/patchworks/patchworks_ROI.cpp deleted file mode 100644 index e9084640af3..00000000000 --- a/src/data_readers/patchworks/patchworks_ROI.cpp +++ /dev/null @@ -1,153 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// patchworks_ROI.cpp - LBANN PATCHWORKS ROI (region-of-interest) implementation -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS ROI implementation - * - Region of interest descriptor - */ - -#include "lbann/data_readers/patchworks/patchworks_ROI.hpp" - -#ifdef LBANN_HAS_OPENCV -#include - -namespace lbann { -namespace patchworks { - -const int ROI::undefined_coordinate = -1; - -/// Reset to the initial condition indicating to cover the whole image -void ROI::init() { - m_left = undefined_coordinate; - m_top = undefined_coordinate; - m_right = undefined_coordinate; - m_bottom = undefined_coordinate; -} - -bool ROI::is_undefined() const { - return ((m_left == undefined_coordinate) || - (m_top == undefined_coordinate) || - (m_right == undefined_coordinate) || - (m_bottom == undefined_coordinate)); // default - -} - -/// Sanity check on a set of two coordinates that defines a region of interest -bool ROI::is_valid() const { - return (!is_undefined() && (m_left < m_right) && (m_top < m_bottom)); -} - -/** - * Check how the region of interest overlaps with the image, and shrink it to - * preceisely match the image boundary in case that it is out of boundary. - */ -bool ROI::set_overlapping_region(const cv::Mat& img) { - if (!is_valid() || (img.data == nullptr)) { - return false; - } - if (m_left < 0) { - m_left = 0; - } - if (m_top < 0) { - m_top = 0; - } - if (m_right > img.cols) { - m_right = img.cols; - } - if (m_bottom > img.rows) { - m_bottom = img.rows; - } - if (m_right == undefined_coordinate) { - m_right = img.cols; - } - if (m_bottom == undefined_coordinate) { - m_bottom = img.rows; - } - - return true; -} - -bool ROI::is_whole_image(const cv::Mat& img) { - const bool ok = set_overlapping_region(img); - return ok && - ((m_left == 0) && - (m_top == 0) && - (m_right == img.cols) && - (m_bottom == img.rows)); -} - -bool ROI::set_by_corners(const int p0_x, const int p0_y, const int p1_x, const int p1_y) { - m_left = p0_x; - m_top = p0_y; - m_right = p1_x; - m_bottom = p1_y; - - return is_valid(); -} - -bool ROI::set_by_center(const int px, const int py, const unsigned int _width, const unsigned int _height) { - m_left = px - (_width + _width%2)/2; - m_right = px + (_width + _width%2)/2; - m_top = py - (_height + _height%2)/2; - m_bottom = py + (_height + _height%2)/2; - - return is_valid(); -} - -void ROI::move(const std::pair displ) { - m_left += displ.first; - m_right += displ.first; - m_top += displ.second; - m_bottom += displ.second; -} - -bool ROI::operator==(const ROI& rarea) const { - return ((rarea.m_left == m_left) && (rarea.m_top == m_top) && - (m_right == rarea.m_right) && (m_bottom == rarea.m_bottom)); -} - -bool ROI::operator!=(const ROI& rarea) const { - return !(*this == rarea); -} - -bool ROI::operator<(const ROI& rarea) const { - return ((*this <= rarea) && !(*this == rarea)); -} - -bool ROI::operator>(const ROI& rarea) const { - return ((*this >= rarea) && !(*this == rarea)); -} - -/// Stream out the content of the region of interest -std::ostream& operator<<(std::ostream& os, const ROI& roi) { - return roi.Print(os); -} - -} // end of namespace patchworks -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/patchworks/patchworks_patch_descriptor.cpp b/src/data_readers/patchworks/patchworks_patch_descriptor.cpp deleted file mode 100644 index 42751ebe550..00000000000 --- a/src/data_readers/patchworks/patchworks_patch_descriptor.cpp +++ /dev/null @@ -1,270 +0,0 @@ 
-//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// patchworks_patch_descriptor.cpp - LBANN PATCHWORKS implementation for patch descriptor -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS implementation for patch descriptor - */ - -#include "lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp" - -#ifdef LBANN_HAS_OPENCV -#include -#include "lbann/utils/random.hpp" - -namespace lbann { -namespace patchworks { - -void patch_descriptor::init() { - m_width = 0u; - m_height = 0u; - m_gap = 0u; - m_jitter = 0u; - m_mode_center = 1u; - m_mode_chrom = 0u; - m_self_label = false; - m_ext = ""; - m_sample_area = ROI(); - m_displacements.clear(); - reset(); -} - -void patch_descriptor::reset() { - m_patch_center = ROI(); - m_positions.clear(); - m_cur_patch_idx = 0u; -} - -void patch_descriptor::set_size(const int width, const int height) { - m_width = width; - m_height = height; -} - -bool patch_descriptor::set_sample_area(const ROI& area) { - if (!area.is_valid()) { - return false; - } - m_sample_area = area; - return true; -} - -bool patch_descriptor::set_sample_image(const unsigned int img_width, const unsigned int img_height) { - ROI whole_image; - whole_image.set_by_corners(0, 0, img_width, img_height); - - return set_sample_area(whole_image); -} - -void patch_descriptor::define_patch_set() { - const int wdisp = m_width + m_gap; - const int hdisp = m_height + m_gap; - m_displacements.clear(); - m_displacements.emplace_back(-wdisp, -hdisp); - m_displacements.emplace_back( 0, -hdisp); - m_displacements.emplace_back( wdisp, -hdisp); - m_displacements.emplace_back(-wdisp, 0); - m_displacements.emplace_back( wdisp, 0); - m_displacements.emplace_back(-wdisp, hdisp); - m_displacements.emplace_back( 0, hdisp); - m_displacements.emplace_back( wdisp, hdisp); -} - -bool patch_descriptor::get_first_patch(ROI& patch) { - int x_center = (m_sample_area.width()+1)/2 + m_sample_area.left(); - int y_center = (m_sample_area.height()+1)/2 + m_sample_area.top(); - int x_margin = 0; - int y_margin = 0; - - if (m_mode_center == 0u) { - // Consider the jitter for a patch at the boundary of an image - x_margin = (m_width+1)/2 + m_jitter; - y_margin = (m_height+1)/2 + m_jitter; - } else if (m_mode_center == 1u) { - // The jitter for the center patch is a part of gap. 
- //if (m_jitter > m_gap) return false; - x_margin = m_width + (m_width+1)/2 + 2*m_jitter + m_gap; - y_margin = m_height + (m_height+1)/2 + 2*m_jitter + m_gap; - } - - ::lbann::rng_gen& gen = ::lbann::get_io_generator(); - - if ((m_mode_center == 0u || m_mode_center == 1u)) { - // area where the center of a center patch can be in - ROI center_patch_area; - bool ok = center_patch_area.set_by_corners(x_margin + m_sample_area.left(), - y_margin + m_sample_area.top(), - m_sample_area.width() - x_margin, - m_sample_area.height() - y_margin); - if (!ok) { - std::cout << "invalid center patch area: " << center_patch_area << std::endl; - return false; - } - if (!center_patch_area.is_valid()) { - return false; - } - - // randomly generate the center coordinate within the center patch area - std::uniform_int_distribution rg_center_x(0, center_patch_area.width()-1); - std::uniform_int_distribution rg_center_y(0, center_patch_area.height()-1); - x_center = rg_center_x(gen) + center_patch_area.left(); - y_center = rg_center_y(gen) + center_patch_area.top(); - } - - if (m_jitter > 0u) { // apply position jitter if enabled - std::uniform_int_distribution rg_jitter_x(0, 2*m_jitter); - std::uniform_int_distribution rg_jitter_y(0, 2*m_jitter); - x_center += rg_jitter_x(gen) - m_jitter; - y_center += rg_jitter_y(gen) - m_jitter; - } - - // set the center patch - ROI p; - if (!p.set_by_center(x_center, y_center, m_width, m_height) || - !(m_sample_area >= p)) { - return false; - } - - m_patch_center = p; - patch = p; - m_positions.clear(); - m_cur_patch_idx = 0u; - m_positions.push_back(patch); - - return true; -} - -bool patch_descriptor::get_next_patch(ROI& patch) { - bool got_one = false; - - ::lbann::rng_gen& gen = ::lbann::get_io_generator(); - - do { - ROI p = m_patch_center; - - if (m_cur_patch_idx >= m_displacements.size()) { - return false; - } - p.move(m_displacements[m_cur_patch_idx++]); - - if (m_jitter > 0u) { - std::uniform_int_distribution rg_jitter_x(0, 2*m_jitter); - std::uniform_int_distribution rg_jitter_y(0, 2*m_jitter); - const int x_jitter = rg_jitter_x(gen) - m_jitter; - const int y_jitter = rg_jitter_y(gen) - m_jitter; - p.move(displacement_type(x_jitter, y_jitter)); - } - - if (p.is_valid() && (m_sample_area >= p)) { - patch = p; - got_one = true; - } - } while (!got_one); - - m_positions.push_back(patch); - return true; -} - -bool patch_descriptor::extract_patches(const cv::Mat& img, std::vector& patches) { - patches.clear(); - if (img.data == nullptr) { - return false; - } - - ROI roi; - bool ok = get_first_patch(roi); - if (!ok) { - return false; - } - - patches.push_back(img(roi.rect()).clone()); - -#if 0 // to generate all the patches defined in the set - unsigned int i = 1u; - - while (get_next_patch(roi)) { - patches.push_back(img(roi.rect()).clone()); - i++; - } - if (i == 1u) { - return false; - } -#else // to randomly generate another patch. The label will be recorded to m_cur_patch_idx. 
- if (m_displacements.size() == 0) { - return false; - } - - std::uniform_int_distribution rg_patch_idx(0, m_displacements.size()-1); - ::lbann::rng_gen& gen = ::lbann::get_io_generator(); - m_cur_patch_idx = rg_patch_idx(gen); - - if (!get_next_patch(roi)) { - return false; - } - patches.push_back(img(roi.rect()).clone()); -#endif - - return true; -} - -std::string patch_descriptor::get_description() const { - std::stringstream os; - os << "patch descriptor:" << std::endl - << '\t' << "m_width: " << m_width << std::endl - << '\t' << "m_height: " << m_height << std::endl - << '\t' << "m_gap: " << m_gap << std::endl - << '\t' << "m_jitter: " << m_jitter << std::endl - << '\t' << "m_mode_center: " << m_mode_center << std::endl - << '\t' << "m_mode_chrom: " << m_mode_chrom << std::endl - << '\t' << "m_self_label: " << m_self_label << std::endl - << '\t' << "m_ext: " << m_ext << std::endl - << '\t' << "m_sample_area: " << m_sample_area << std::endl - << '\t' << "patch displacements from the center: " << std::endl; - for (unsigned int i=0u; i < m_displacements.size() ; ++i) { - os << "\t\t" << i+1 << ' ' << m_displacements[i].first << ' ' << m_displacements[i].second << std::endl; - } - - return os.str(); -} - -std::ostream& patch_descriptor::print(std::ostream& os) const { - os << get_description() - << '\t' << "m_cur_patch_idx: " << m_cur_patch_idx << std::endl - << '\t' << "patch regions: " << std::endl; - for (unsigned int i=0u; i < m_positions.size() ; ++i) { - os << "\t\t" << i << '\t' << m_positions[i] << std::endl; - } - - return os; -} - -std::ostream& operator<<(std::ostream& os, const patch_descriptor& pd) { - return pd.print(os); -} - -} // end of namespace patchworks -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/data_readers/patchworks/patchworks_stats.cpp b/src/data_readers/patchworks/patchworks_stats.cpp deleted file mode 100644 index 9216fbceab0..00000000000 --- a/src/data_readers/patchworks/patchworks_stats.cpp +++ /dev/null @@ -1,147 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// patchworks_stats.cpp - LBANN PATCHWORKS implementation for pixel statistics -//////////////////////////////////////////////////////////////////////////////// - -/** - * LBANN PATCHWORKS implementation for pixel statistics - */ - -#include "lbann/data_readers/patchworks/patchworks_stats.hpp" -#ifdef LBANN_HAS_OPENCV - -namespace lbann { -namespace patchworks { - -bool get_single_channel_stats(const cv::Mat& _img, image_stats& stats) { - if (_img.channels() != 1) { - return false; - } - - cv::Mat img; // pw_fp_t matrix - _img.convertTo(img, _PW_CV_FP_); - - cv::MatIterator_ itBegin = img.begin(); - cv::MatIterator_ itEnd = img.end(); - - const auto typeZero = static_cast(0); - - double sum = 0.0; - - std::vector data(itBegin, itEnd); - stats.cnt = data.size(); - if (stats.cnt == 0u) { - return false; - } - - std::sort(data.begin(), data.end()); - if (data[0] < typeZero) { - return false; - } - - stats.max = data.back(); - stats.min = data[0]; - - std::vector::const_iterator itbeg = data.begin(); - std::vector::const_iterator itend = data.end(); - std::vector::const_iterator itbegNZ = std::upper_bound(data.begin(), data.end(), static_cast(0)); - - stats.cntZeros = std::distance(itbeg, itbegNZ); - stats.minNZ = *itbegNZ; - - const size_t nnz = stats.cnt - stats.cntZeros; - const size_t halfPointNZ = nnz/2; - const size_t halfPoint = stats.cnt/2; - auto itMedNZ = itbegNZ; - auto itMed = itbeg; - std::advance(itMedNZ, halfPointNZ); - std::advance(itMed, halfPoint); - - stats.medianNZ = *itMedNZ; - stats.median = *itMed; - - auto it = itbegNZ; - for( ; it != itend; ++it) { - sum += *it; - } - - stats.avg = sum/stats.cnt; - if (nnz == 0u) { - stats.avgNZ = stats.avg; - } else { - stats.avgNZ = sum/nnz; - } - - double var = 0.0; - double varNZ = 0.0; - it = itbegNZ; - - for(it = itbeg; it != itbegNZ; ++it) { - const double dev = (*it-stats.avg); - var += dev*dev; - } - - for( ; it != itend; ++it) { - const double dev = (*it-stats.avg); - var += dev*dev; - const double devNZ = (*it-stats.avgNZ); - varNZ += devNZ*devNZ; - } - - stats.stdev = sqrt(var/stats.cnt); - stats.stdevNZ = sqrt(varNZ/nnz); - - return true; -} - -bool get_channel_stats(const cv::Mat& img, std::vector& stats) { - if (img.data == nullptr) { - std::cout << "get_channel_stats(): img not set" << std::endl; - return false; - } - - const int nCh = img.channels(); - std::vector imgCh; // image data per channel - cv::split(img, imgCh); // split the image into individual channels - - stats.clear(); - stats.resize(nCh); - - bool ok = true; - - for (int ch=0; ok && (ch < img.channels()); ++ch) { // compute statistics per channel - ok = get_single_channel_stats(imgCh[ch], stats[ch]); - } - - if (!ok) { - std::cout << "Failed to get stats" << std::endl; - } - return ok; -} - -} // end of namespace patchworks -} // end of namespace lbann -#endif // LBANN_HAS_OPENCV diff --git a/src/proto/factories/CMakeLists.txt b/src/proto/factories/CMakeLists.txt index c89d2d12e81..45d987ab318 100644 --- a/src/proto/factories/CMakeLists.txt +++ b/src/proto/factories/CMakeLists.txt @@ -7,6 +7,7 @@ set_full_path(THIS_DIR_SOURCES model_factory.cpp objective_function_factory.cpp optimizer_factory.cpp + transform_factory.cpp weights_factory.cpp ) diff --git a/src/proto/factories/transform_factory.cpp b/src/proto/factories/transform_factory.cpp new file mode 100644 index 00000000000..7940e743545 --- /dev/null +++ b/src/proto/factories/transform_factory.cpp @@ -0,0 +1,133 @@ +//////////////////////////////////////////////////////////////////////////////// 
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/proto/factories.hpp"
+#include "lbann/transforms/normalize.hpp"
+#include "lbann/transforms/sample_normalize.hpp"
+#include "lbann/transforms/scale.hpp"
+#include "lbann/transforms/vision/center_crop.hpp"
+#include "lbann/transforms/vision/grayscale.hpp"
+#include "lbann/transforms/vision/colorize.hpp"
+#include "lbann/transforms/vision/horizontal_flip.hpp"
+#include "lbann/transforms/vision/normalize_to_lbann_layout.hpp"
+#include "lbann/transforms/vision/random_affine.hpp"
+#include "lbann/transforms/vision/random_crop.hpp"
+#include "lbann/transforms/vision/random_resized_crop.hpp"
+#include "lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp"
+#include "lbann/transforms/vision/resize.hpp"
+#include "lbann/transforms/vision/resized_center_crop.hpp"
+#include "lbann/transforms/vision/to_lbann_layout.hpp"
+#include "lbann/transforms/vision/vertical_flip.hpp"
+#include "lbann/utils/memory.hpp"
+
+namespace lbann {
+namespace proto {
+
+std::unique_ptr<transform::transform> construct_transform(
+  const lbann_data::Transform& trans) {
+  if (trans.has_normalize()) {
+    auto& pb_trans = trans.normalize();
+    return make_unique<transform::normalize>(
+      parse_list<float>(pb_trans.means()),
+      parse_list<float>(pb_trans.stddevs()));
+  } else if (trans.has_sample_normalize()) {
+    return make_unique<transform::sample_normalize>();
+  } else if (trans.has_scale()) {
+    return make_unique<transform::scale>(trans.scale().scale());
+  } else if (trans.has_center_crop()) {
+    auto& pb_trans = trans.center_crop();
+    return make_unique<transform::center_crop>(
+      pb_trans.height(), pb_trans.width());
+  } else if (trans.has_colorize()) {
+    return make_unique<transform::colorize>();
+  } else if (trans.has_grayscale()) {
+    return make_unique<transform::grayscale>();
+  } else if (trans.has_horizontal_flip()) {
+    return make_unique<transform::horizontal_flip>(
+      trans.horizontal_flip().p());
+  } else if (trans.has_normalize_to_lbann_layout()) {
+    auto& pb_trans = trans.normalize_to_lbann_layout();
+    return make_unique<transform::normalize_to_lbann_layout>(
+      parse_list<float>(pb_trans.means()),
+      parse_list<float>(pb_trans.stddevs()));
+  } else if (trans.has_random_affine()) {
+    auto& pb_trans = trans.random_affine();
+    return make_unique<transform::random_affine>(
+      pb_trans.rotate_min(), pb_trans.rotate_max(),
+      pb_trans.translate_h(), pb_trans.translate_w(),
+      pb_trans.scale_min(), pb_trans.scale_max(),
+      pb_trans.shear_min(), pb_trans.shear_max());
+  } else if (trans.has_random_crop()) {
+    auto& pb_trans = trans.random_crop();
+    return make_unique<transform::random_crop>(
+      pb_trans.height(), pb_trans.width());
+  } else if (trans.has_random_resized_crop()) {
+    auto& pb_trans = 
trans.random_resized_crop();
+    // Handle defaults: if one of the scale/aspect-ratio bounds is specified,
+    // all of them must be.
+    if (pb_trans.scale_min() != 0.0f) {
+      return make_unique<transform::random_resized_crop>(
+        pb_trans.height(), pb_trans.width(),
+        pb_trans.scale_min(), pb_trans.scale_max(),
+        pb_trans.ar_min(), pb_trans.ar_max());
+    } else {
+      return make_unique<transform::random_resized_crop>(
+        pb_trans.height(), pb_trans.width());
+    }
+  } else if (trans.has_random_resized_crop_with_fixed_aspect_ratio()) {
+    auto& pb_trans = trans.random_resized_crop_with_fixed_aspect_ratio();
+    return make_unique<transform::random_resized_crop_with_fixed_aspect_ratio>(
+      pb_trans.height(), pb_trans.width(),
+      pb_trans.crop_height(), pb_trans.crop_width());
+  } else if (trans.has_resize()) {
+    auto& pb_trans = trans.resize();
+    return make_unique<transform::resize>(pb_trans.height(), pb_trans.width());
+  } else if (trans.has_resized_center_crop()) {
+    auto& pb_trans = trans.resized_center_crop();
+    return make_unique<transform::resized_center_crop>(
+      pb_trans.height(), pb_trans.width(),
+      pb_trans.crop_height(), pb_trans.crop_width());
+  } else if (trans.has_to_lbann_layout()) {
+    return make_unique<transform::to_lbann_layout>();
+  } else if (trans.has_vertical_flip()) {
+    return make_unique<transform::vertical_flip>(
+      trans.vertical_flip().p());
+  }
+
+  LBANN_ERROR("Unknown transform");
+  return nullptr;
+}
+
+transform::transform_pipeline construct_transform_pipeline(
+  const lbann_data::Reader& data_reader) {
+  transform::transform_pipeline tp;
+  for (int i = 0; i < data_reader.transforms_size(); ++i) {
+    tp.add_transform(construct_transform(data_reader.transforms(i)));
+  }
+  return tp;
+}
+
+} // namespace proto
+} // namespace lbann
diff --git a/src/proto/init_image_data_readers.cpp b/src/proto/init_image_data_readers.cpp
index 511ddaab32e..58be634aeb0 100644
--- a/src/proto/init_image_data_readers.cpp
+++ b/src/proto/init_image_data_readers.cpp
@@ -27,6 +27,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 #include "lbann/proto/init_image_data_readers.hpp"
+#include "lbann/proto/factories.hpp"
 #include 
 #include 
 #include 
@@ -34,280 +35,6 @@
 namespace lbann {
 
-/// set up a cropper
-static void set_cropper(const lbann_data::ImagePreprocessor& pb_preprocessor,
-                        const bool master, std::shared_ptr& pp,
-                        int& width, int& height) {
-  if (pb_preprocessor.has_cropper()) {
-    const lbann_data::ImagePreprocessor::Cropper& pb_cropper = pb_preprocessor.cropper();
-    if (!pb_cropper.disable()) {
-      const std::string cropper_name = ((pb_cropper.name() == "")? "default_cropper" : pb_cropper.name());
-      std::unique_ptr cropper(new(lbann::cv_cropper));
-      cropper->set_name(cropper_name);
-      cropper->set(pb_cropper.crop_width(),
-                   pb_cropper.crop_height(),
-                   pb_cropper.crop_randomly(),
-                   std::make_pair(pb_cropper.resized_width(),
-                                  pb_cropper.resized_height()),
-                   pb_cropper.adaptive_interpolation());
-      pp->add_transform(std::move(cropper));
-      width = pb_cropper.crop_width();
-      height = pb_cropper.crop_height();
-      if (master) std::cout << "image processor: " << cropper_name << " cropper is set" << std::endl;
-    }
-  }
-}
-
-/// set up a resizer
-static void set_resizer(const lbann_data::ImagePreprocessor& pb_preprocessor,
-                        const bool master, std::shared_ptr& pp,
-                        int& width, int& height) {
-  if (pb_preprocessor.has_resizer()) {
-    const lbann_data::ImagePreprocessor::Resizer& pb_resizer = pb_preprocessor.resizer();
-    if (!pb_resizer.disable()) {
-      const std::string resizer_name = ((pb_resizer.name() == "")? 
"default_resizer" : pb_resizer.name()); - std::unique_ptr resizer(new(lbann::cv_resizer)); - resizer->set_name(resizer_name); - resizer->set(pb_resizer.resized_width(), - pb_resizer.resized_height(), - pb_resizer.adaptive_interpolation()); - pp->add_transform(std::move(resizer)); - width = pb_resizer.resized_width(); - height = pb_resizer.resized_height(); - if (master) std::cout << "image processor: " << resizer_name << " resizer is set" << std::endl; - } - } -} - -/// set up an augmenter -static void set_augmenter(const lbann_data::ImagePreprocessor& pb_preprocessor, - const bool master, std::shared_ptr& pp) { - if (pb_preprocessor.has_augmenter()) { - const lbann_data::ImagePreprocessor::Augmenter& pb_augmenter = pb_preprocessor.augmenter(); - if (!pb_augmenter.disable() && - (pb_augmenter.horizontal_flip() || - pb_augmenter.vertical_flip() || - pb_augmenter.rotation() != 0.0 || - pb_augmenter.horizontal_shift() != 0.0 || - pb_augmenter.vertical_shift() != 0.0 || - pb_augmenter.shear_range() != 0.0)) - { - const std::string augmenter_name = ((pb_augmenter.name() == "")? "default_augmenter" : pb_augmenter.name()); - std::unique_ptr augmenter(new(lbann::cv_augmenter)); - augmenter->set_name(augmenter_name); - augmenter->set(pb_augmenter.horizontal_flip(), - pb_augmenter.vertical_flip(), - pb_augmenter.rotation(), - pb_augmenter.horizontal_shift(), - pb_augmenter.vertical_shift(), - pb_augmenter.shear_range()); - pp->add_transform(std::move(augmenter)); - if (master) std::cout << "image processor: " << augmenter_name << " augmenter is set" << std::endl; - } - } -} - -/// set up a decolorizer -static void set_decolorizer(const lbann_data::ImagePreprocessor& pb_preprocessor, - const bool master, std::shared_ptr& pp, int& channels) { - if (pb_preprocessor.has_decolorizer()) { - const lbann_data::ImagePreprocessor::Decolorizer& pb_decolorizer = pb_preprocessor.decolorizer(); - if (!pb_decolorizer.disable()) { - const std::string decolorizer_name = ((pb_decolorizer.name() == "")? "default_decolorizer" : pb_decolorizer.name()); - std::unique_ptr decolorizer(new(lbann::cv_decolorizer)); - decolorizer->set_name(decolorizer_name); - decolorizer->set(pb_decolorizer.pick_1ch()); - pp->add_transform(std::move(decolorizer)); - channels = 1; - if (master) std::cout << "image processor: " << decolorizer_name << " decolorizer is set" << std::endl; - } - } -} - -/// set up a colorizer -static void set_colorizer(const lbann_data::ImagePreprocessor& pb_preprocessor, - const bool master, std::shared_ptr& pp, int& channels) { - if (pb_preprocessor.has_colorizer()) { - const lbann_data::ImagePreprocessor::Colorizer& pb_colorizer = pb_preprocessor.colorizer(); - if (!pb_colorizer.disable()) { - const std::string colorizer_name = ((pb_colorizer.name() == "")? 
"default_colorizer" : pb_colorizer.name()); - std::unique_ptr colorizer(new(lbann::cv_colorizer)); - colorizer->set_name(colorizer_name); - pp->add_transform(std::move(colorizer)); - channels = 3; - if (master) std::cout << "image processor: " << colorizer_name << " colorizer is set" << std::endl; - } - } -} - -static bool has_channel_wise_subtractor(const lbann_data::ImagePreprocessor& pb_preprocessor) { - if (!pb_preprocessor.has_subtractor()) { - return false; - } - const lbann_data::ImagePreprocessor::Subtractor& pb_subtractor = pb_preprocessor.subtractor(); - return ((pb_subtractor.channel_mean_size() > 0) || (pb_subtractor.channel_stddev_size() > 0)) - && pb_subtractor.image_to_sub().empty() && pb_subtractor.image_to_div().empty(); -} - -/// set up a subtractor -static void set_subtractor(const lbann_data::ImagePreprocessor& pb_preprocessor, - const bool master, std::shared_ptr& pp, - const int channels) { - if (pb_preprocessor.has_subtractor()) { - const lbann_data::ImagePreprocessor::Subtractor& pb_subtractor = pb_preprocessor.subtractor(); - if (!pb_subtractor.disable()) { - const std::string subtractor_name = ((pb_subtractor.name() == "")? "default_subtractor" : pb_subtractor.name()); - std::unique_ptr subtractor(new(lbann::cv_subtractor)); - subtractor->set_name(subtractor_name); - - bool is_mean_set = false; - - if (!pb_subtractor.image_to_sub().empty()) { - subtractor->set_mean(pb_subtractor.image_to_sub()); - is_mean_set = true; - } - else if (pb_subtractor.channel_mean_size() > 0) { - const size_t n = pb_subtractor.channel_mean_size(); - if (n != static_cast(channels)) { - throw lbann_exception("Failed to setup subtractor due to inconsistent number of channels."); - } - std::vector ch_mean(n); - for(size_t i = 0u; i < n; ++i) { - ch_mean[i] = static_cast(pb_subtractor.channel_mean(i)); - } - - subtractor->set_mean(ch_mean); - is_mean_set = true; - } - - if (!is_mean_set && master) { - std::cout << "image processor: " << subtractor_name << " assumes zero mean." << std::endl - << " If this is not the case, provide mean." << std::endl; - } - - bool is_stddev_set = false; - if (!pb_subtractor.image_to_div().empty()) { - subtractor->set_stddev(pb_subtractor.image_to_div()); - is_stddev_set = true; - } - else if (pb_subtractor.channel_stddev_size() > 0) { - const size_t n = pb_subtractor.channel_stddev_size(); - if (n != static_cast(channels)) { - throw lbann_exception("Failed to setup subtractor due to inconsistent number of channels."); - } - std::vector ch_stddev(n); - for(size_t i = 0u; i < n; ++i) { - ch_stddev[i] = static_cast(pb_subtractor.channel_stddev(i)); - } - - subtractor->set_stddev(ch_stddev); - is_stddev_set = true; - } - - pp->add_normalizer(std::move(subtractor)); - if (master) { - std::cout << "image processor: " << subtractor_name << " subtractor is set for " - << (has_channel_wise_subtractor(pb_preprocessor)? "channel-wise" : "pixel-wise") - << ' ' << (is_stddev_set? "z-score" : "mean-subtraction") << std::endl; - } - } - } -} - -/// set up a sample-wide normalizer -static void set_normalizer(const lbann_data::ImagePreprocessor& pb_preprocessor, - const bool master, std::shared_ptr& pp) { - if (pb_preprocessor.has_normalizer()) { - const lbann_data::ImagePreprocessor::Normalizer& pb_normalizer = pb_preprocessor.normalizer(); - if (!pb_normalizer.disable()) { - const std::string normalizer_name = ((pb_normalizer.name() == "")? 
"default_normalizer" : pb_normalizer.name()); - std::unique_ptr normalizer(new(lbann::cv_normalizer)); - normalizer->set_name(normalizer_name); - normalizer->unit_scale(pb_normalizer.scale()); - normalizer->subtract_mean(pb_normalizer.subtract_mean()); - normalizer->unit_variance(pb_normalizer.unit_variance()); - normalizer->z_score(pb_normalizer.z_score()); - bool ok = pp->add_normalizer(std::move(normalizer)); - if (master && ok) std::cout << "image processor: " << normalizer_name << " normalizer is set" << std::endl; - } - } -} - - -void init_image_preprocessor(const lbann_data::Reader& pb_readme, const bool master, - std::shared_ptr& pp, int& width, int& height, int& channels) { -// Currently we set width and height for image_data_reader here considering the transform -// pipeline. image_data_reader reports the final dimension of data to the child layer based -// on these information. -// TODO: However, for composible pipeline, this needs to be automatically determined by each -// cv_process at the setup finalization stage. - if (!pb_readme.has_image_preprocessor()) return; - - const lbann_data::ImagePreprocessor& pb_preprocessor = pb_readme.image_preprocessor(); - if (pb_preprocessor.disable()) return; - - // data reader name - const std::string& name = pb_readme.name(); - // final size of image - width = pb_preprocessor.raw_width(); - height = pb_preprocessor.raw_height(); - if (pb_preprocessor.raw_num_channels() > 0) { - channels = pb_preprocessor.raw_num_channels(); - } - - if (pb_preprocessor.has_subtractor() && !has_channel_wise_subtractor(pb_preprocessor)) { - // decolorizer and colorizer are exclusive - set_decolorizer(pb_preprocessor, master, pp, channels); - set_colorizer(pb_preprocessor, master, pp, channels); - // set up a pixel-wise subtractor - set_subtractor(pb_preprocessor, master, pp, channels); - } - - set_cropper(pb_preprocessor, master, pp, width, height); - set_resizer(pb_preprocessor, master, pp, width, height); - set_augmenter(pb_preprocessor, master, pp); - if (has_channel_wise_subtractor(pb_preprocessor)) { - // decolorizer and colorizer are exclusive - set_decolorizer(pb_preprocessor, master, pp, channels); - set_colorizer(pb_preprocessor, master, pp, channels); - // set up a channel-wise subtractor - set_subtractor(pb_preprocessor, master, pp, channels); - } else if (!pb_preprocessor.has_subtractor()) { - // decolorizer/colorizer would have already been applied in the pixel-wise subtractor - // decolorizer and colorizer are exclusive - set_decolorizer(pb_preprocessor, master, pp, channels); - set_colorizer(pb_preprocessor, master, pp, channels); - } - set_normalizer(pb_preprocessor, master, pp); - - // create a data reader - if (name == "imagenet_patches") { - std::shared_ptr ppp = std::dynamic_pointer_cast(pp); - if (pb_preprocessor.has_patch_extractor()) { - const lbann_data::ImagePreprocessor::PatchExtractor& pb_patch_extractor = pb_preprocessor.patch_extractor(); - if (!pb_patch_extractor.disable()) { - const std::string patch_extractor_name = ((pb_patch_extractor.name() == "")? 
"default_patch_extractor" : pb_patch_extractor.name()); - lbann::patchworks::patch_descriptor pi; - pi.set_sample_image(static_cast(width), - static_cast(height)); - pi.set_size(pb_patch_extractor.patch_width(), pb_patch_extractor.patch_height()); - pi.set_gap(pb_patch_extractor.patch_gap()); - pi.set_jitter(pb_patch_extractor.patch_jitter()); - pi.set_mode_centering(pb_patch_extractor.centering_mode()); - pi.set_mode_chromatic_aberration(pb_patch_extractor.ca_correction_mode()); - pi.set_self_label(); - pi.define_patch_set(); - width = pb_patch_extractor.patch_width(); - height = pb_patch_extractor.patch_height(); - ppp->set_name(patch_extractor_name); - ppp->set_patch_descriptor(pi); - if (master) std::cout << "image processor: " << patch_extractor_name << " patch_extractor is set" << std::endl; - } - } - } -} - - void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_data::DataSetMetaData& pb_metadata, const bool master, generic_data_reader* &reader) { // data reader name const std::string& name = pb_readme.name(); @@ -316,47 +43,55 @@ void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_dat // number of labels const int n_labels = pb_readme.num_labels(); - std::shared_ptr pp; - // set up the image preprocessor - if ((name == "imagenet") || (name == "jag_conduit") || - (name == "multihead_siamese") || (name == "mnist_siamese") || - (name == "multi_images") || (name == "moving_mnist")) { - pp = std::make_shared(); - } else if (name == "imagenet_patches") { - pp = std::make_shared(); - } else { - if (master) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: unknown name for image data reader: " - << name; - throw lbann_exception(err.str()); - } - } - // final size of image int width = 0, height = 0; int channels = 0; - // setup preprocessor - init_image_preprocessor(pb_readme, master, pp, width, height, channels); + // Ugly hack for now to extract dimensions. 
+ for (int i = 0; i < pb_readme.transforms_size(); ++i) { + auto& trans = pb_readme.transforms(i); + if (trans.has_center_crop()) { + height = trans.center_crop().height(); + width = trans.center_crop().width(); + } else if (trans.has_grayscale()) { channels = 1; } + else if (trans.has_random_crop()) { + height = trans.random_crop().height(); + width = trans.random_crop().width(); + } else if (trans.has_random_resized_crop()) { + height = trans.random_resized_crop().height(); + width = trans.random_resized_crop().width(); + } else if (trans.has_random_resized_crop_with_fixed_aspect_ratio()) { + height = trans.random_resized_crop_with_fixed_aspect_ratio().crop_height(); + width = trans.random_resized_crop_with_fixed_aspect_ratio().crop_width(); + } else if (trans.has_resize()) { + height = trans.resize().height(); + width = trans.resize().width(); + } else if (trans.has_resized_center_crop()) { + height = trans.resized_center_crop().crop_height(); + width = trans.resized_center_crop().crop_width(); + } + } - if (name == "imagenet_patches") { - std::shared_ptr ppp = std::dynamic_pointer_cast(pp); - reader = new imagenet_reader_patches(ppp, shuffle); - } else if (name == "imagenet") { - reader = new imagenet_reader(pp, shuffle); + if (name == "imagenet") { + reader = new imagenet_reader(shuffle); } else if (name == "multihead_siamese") { - reader = new data_reader_multihead_siamese(pp, pb_readme.num_image_srcs(), shuffle); - } else if (name == "mnist_siamese") { - reader = new data_reader_mnist_siamese(pp, shuffle); - } else if (name == "multi_images") { - reader = new data_reader_multi_images(pp, shuffle); + reader = new data_reader_multihead_siamese(pb_readme.num_image_srcs(), shuffle); } else if (name == "moving_mnist") { reader = new moving_mnist_reader(7, 40, 40, 2); } else if (name =="jag_conduit") { - data_reader_jag_conduit* reader_jag = new data_reader_jag_conduit(pp, shuffle); + data_reader_jag_conduit* reader_jag = new data_reader_jag_conduit(shuffle); const lbann_data::DataSetMetaData::Schema& pb_schema = pb_metadata.schema(); + if(height == 0 && pb_schema.image_height() != 0) { + height = pb_schema.image_height(); + } + if(width == 0 && pb_schema.image_width() != 0) { + width = pb_schema.image_width(); + } + if(channels == 0 && pb_schema.image_num_channels() != 0) { + channels = pb_schema.image_num_channels(); + } + if (channels == 0) { channels = 1; } @@ -514,6 +249,9 @@ void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_dat return; } + reader->set_transform_pipeline( + std::move(proto::construct_transform_pipeline(pb_readme))); + if (channels == 0) { channels = 3; } @@ -526,82 +264,9 @@ void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_dat } if (master) std::cout << reader->get_type() << " is set" << std::endl; - // configure the data reader - if (name == "multi_images") { - const int n_img_srcs = pb_readme.num_image_srcs(); - data_reader_multi_images* multi_image_dr_ptr - = dynamic_cast(image_data_reader_ptr); - if (multi_image_dr_ptr == nullptr) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " no data_reader_multi_images"; - throw lbann_exception(err.str()); - } - multi_image_dr_ptr->set_input_params(width, height, channels, n_labels, n_img_srcs); - } else if(name == "multihead_siamese") { - const int n_img_srcs = pb_readme.num_image_srcs(); - data_reader_multi_images* multi_image_dr_ptr - = dynamic_cast(image_data_reader_ptr); - multi_image_dr_ptr->set_input_params(width, height, channels, n_labels, 
n_img_srcs); - } else { - image_data_reader_ptr->set_input_params(width, height, channels, n_labels); - } + image_data_reader_ptr->set_input_params(width, height, channels, n_labels); } - -void init_generic_preprocessor(const lbann_data::Reader& pb_readme, const bool master, generic_data_reader* reader) { - if (!pb_readme.has_image_preprocessor()) return; - - const lbann_data::ImagePreprocessor& pb_preprocessor = pb_readme.image_preprocessor(); - if (pb_preprocessor.disable()) return; - - // set up augmenter if necessary - if (pb_preprocessor.has_augmenter()) { - const lbann_data::ImagePreprocessor::Augmenter& pb_augmenter = pb_preprocessor.augmenter(); - if (!pb_augmenter.disable() && - (pb_augmenter.name() == "") && - (pb_augmenter.horizontal_flip() || - pb_augmenter.vertical_flip() || - pb_augmenter.rotation() != 0.0 || - pb_augmenter.horizontal_shift() != 0.0 || - pb_augmenter.vertical_shift() != 0.0 || - pb_augmenter.shear_range() != 0.0)) - { - reader->horizontal_flip( pb_augmenter.horizontal_flip() ); - reader->vertical_flip( pb_augmenter.vertical_flip() ); - reader->rotation( pb_augmenter.rotation() ); - reader->horizontal_shift( pb_augmenter.horizontal_shift() ); - reader->vertical_shift( pb_augmenter.vertical_shift() ); - reader->shear_range( pb_augmenter.shear_range() ); - if (master) std::cout << "image processor: augmenter is set" << std::endl; - } else { - reader->disable_augmentation(); - } - } - - // set up the normalizer - if (pb_preprocessor.has_normalizer()) { - const lbann_data::ImagePreprocessor::Normalizer& pb_normalizer = pb_preprocessor.normalizer(); - if (!pb_normalizer.disable() && - (pb_normalizer.name() == "")) { - reader->subtract_mean( pb_normalizer.subtract_mean() ); - reader->unit_variance( pb_normalizer.unit_variance() ); - reader->scale( pb_normalizer.scale() ); - reader->z_score( pb_normalizer.z_score() ); - if (master) std::cout << "image processor: normalizer is set" << std::endl; - } - } - - if (pb_preprocessor.has_noiser()) { - const lbann_data::ImagePreprocessor::Noiser& pb_noiser = pb_preprocessor.noiser(); - if (!pb_noiser.disable() && - (pb_noiser.name() == "")) { - reader->add_noise( pb_noiser.factor() ); - if (master) std::cout << "image processor: noiser is set" << std::endl; - } - } -} - - void init_org_image_data_reader(const lbann_data::Reader& pb_readme, const bool master, generic_data_reader* &reader) { // data reader name const std::string& name = pb_readme.name(); @@ -625,8 +290,8 @@ void init_org_image_data_reader(const lbann_data::Reader& pb_readme, const bool } } - // setup preprocessor - init_generic_preprocessor(pb_readme, master, reader); + reader->set_transform_pipeline( + std::move(proto::construct_transform_pipeline(pb_readme))); } } diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 859261344e3..ecdff306ef8 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -36,7 +36,6 @@ message Reader { //for GAN model bool gan_labelling = 201; int32 gan_label_value = 202; - ImagePreprocessor image_preprocessor = 13; int32 num_labels = 99; //for imagenet and synthetic int64 num_samples = 100; //only for synthetic @@ -74,6 +73,8 @@ message Reader { //------------- end of only for index lists ------------------ PythonDataReader python = 501; + + repeated Transform transforms = 600; // Ordered list of transforms to apply. 
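+  // An illustrative prototext sketch (values are hypothetical):
+  //   transforms { resize { height: 256 width: 256 } }
+  //   transforms { random_crop { height: 224 width: 224 } }
+  //   transforms { horizontal_flip { p: 0.5 } }
+  //   transforms { to_lbann_layout {} }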
} message PythonDataReader { @@ -84,117 +85,112 @@ message PythonDataReader { string sample_dims_function = 5; // Function that gets dimensions of data sample } -message ImagePreprocessor { - string name = 1; - bool disable = 2; - int32 raw_width = 3; - int32 raw_height = 4; - int32 raw_num_channels = 5; - - message Cropper { - string name = 1; - bool disable = 2; - bool crop_randomly = 3; - uint32 crop_width = 4; - uint32 crop_height = 5; - int32 resized_width = 6; - int32 resized_height = 7; - bool adaptive_interpolation = 8; +// Preprocessing transforms. +message Transform { + // Transforms that apply to LBANN data. + // Normalize channel-wise with mean and standard deviation. + message Normalize { + string means = 1; + string stddevs = 2; } - - message Resizer { - string name = 1; - bool disable = 2; - int32 resized_width = 3; - int32 resized_height = 4; - bool adaptive_interpolation = 5; + // Normalize each sample to have mean 0, standard deviation 1. + message SampleNormalize {} + // Scale by a constant. + message Scale { + float scale = 1; } - message Augmenter { - string name = 1; - bool disable = 2; - bool horizontal_flip = 3; - bool vertical_flip = 4; - double rotation = 5; - double horizontal_shift = 6; - double vertical_shift = 7; - double shear_range = 8; + // Transforms that apply to images. + // Crop of size height x width from the center. + message CenterCrop { + uint64 height = 1; + uint64 width = 2; } - - message Decolorizer { - string name = 1; - bool disable = 2; - bool pick_1ch = 3; + // Convert to color. + message Colorize {} + // Convert to grayscale. + message Grayscale {} + // Horizontal flip with probability p. + message HorizontalFlip { + float p = 1; } - - message Colorizer { - string name = 1; - bool disable = 2; + // Fused Normalize + ToLBANNLayout. + message NormalizeToLBANNLayout { + string means = 1; + string stddevs = 2; } - - message Normalizer { - string name = 1; - bool disable = 2; - bool scale = 3; - bool subtract_mean = 4; - bool unit_variance = 5; - bool z_score = 6; + // Apply a random affine transform. + message RandomAffine { + float rotate_min = 1; + float rotate_max = 2; + float translate_h = 3; + float translate_w = 4; + float scale_min = 5; + float scale_max = 6; + float shear_min = 7; + float shear_max = 8; } - - message Subtractor { - string name = 1; - bool disable = 2; - string image_to_sub = 3; - string image_to_div = 4; - repeated float channel_mean = 5 [packed = true]; - repeated float channel_stddev = 6 [packed = true]; + // Crop of size height x width from a random location. + message RandomCrop { + uint64 height = 1; + uint64 width = 2; } - - message PatchExtractor { - string name = 1; - bool disable = 2; - uint32 patch_width = 3; - uint32 patch_height = 4; - uint32 patch_gap = 5; // gap between patches - uint32 patch_jitter = 6; // max jittering amount for patch positions - uint32 centering_mode = 7; // center patch positioning mode - uint32 ca_correction_mode = 8; // chromatic abberation correction mode + // Random crop with scale and aspect ratio augmentation. + message RandomResizedCrop { + uint64 height = 1; + uint64 width = 2; + float scale_min = 3; + float scale_max = 4; + float ar_min = 5; + float ar_max = 6; } - - message Noiser { - string name = 1; - bool disable = 2; - float factor = 3; + // Resize to height x width, then randomly crop to crop_height x crop_width. 
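+  // (e.g. height/width of 256x256 with crop_height/crop_width of 224x224
+  //  resizes the image to 256x256, then takes a random 224x224 crop.)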
+ message RandomResizedCropWithFixedAspectRatio { + uint64 height = 1; + uint64 width = 2; + uint64 crop_height = 3; + uint64 crop_width = 4; } - - Cropper cropper = 6; - Resizer resizer = 7; - Augmenter augmenter = 8; - Decolorizer decolorizer = 9; - Colorizer colorizer = 10; - Subtractor subtractor = 11; - Normalizer normalizer = 12; - Noiser noiser = 13; - PatchExtractor patch_extractor = 14; - - int32 early_normalization = 33; // for data_reader_jag only -} - -// TODO: wrap El::Mat based normalization into a generic preprocessor -message GenericPreprocessor { - string name = 1; - bool disable = 2; - - message Normalizer { - string name = 1; - bool disable = 2; - bool scale = 3; - bool subtract_mean = 4; - bool unit_variance = 5; - bool z_score = 6; + // Resize to height x width. + message Resize { + uint64 height = 1; + uint64 width = 2; + } + // Resize to height x width then crop to crop_height x crop_width at the center. + message ResizedCenterCrop { + uint64 height = 1; + uint64 width = 2; + uint64 crop_height = 3; + uint64 crop_width = 4; + } + // Convert from an image to LBANN data. + message ToLBANNLayout { } + // Vertical flip with probability p. + message VerticalFlip { + float p = 1; } - Normalizer normalizer = 3; + oneof a_transform { + // On LBANN data: + Normalize normalize = 1; + SampleNormalize sample_normalize = 2; + Scale scale = 3; + + // On images: + CenterCrop center_crop = 100; + Colorize colorize = 101; + Grayscale grayscale = 102; + HorizontalFlip horizontal_flip = 103; + NormalizeToLBANNLayout normalize_to_lbann_layout = 104; + RandomAffine random_affine = 105; + RandomCrop random_crop = 106; + RandomResizedCrop random_resized_crop = 107; + RandomResizedCropWithFixedAspectRatio random_resized_crop_with_fixed_aspect_ratio = 108; + Resize resize = 109; + ResizedCenterCrop resized_center_crop = 110; + ToLBANNLayout to_lbann_layout = 111; + VerticalFlip vertical_flip = 112; + } } //======================================================================== @@ -206,6 +202,10 @@ message DataSetMetaData { string image_prefix = 2; string input_prefix = 3; + uint64 image_height = 11; + uint64 image_width = 12; + uint64 image_num_channels = 13; + //------------------ start of only for jag_conduit ----------------------- bool split_jag_image_channels = 89; repeated string jag_image_keys = 90; diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index e88f2f5b648..165732638a2 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -4,6 +4,7 @@ #include "lbann/base.hpp" #include "lbann/comm.hpp" #include "lbann/proto/init_image_data_readers.hpp" +#include "lbann/proto/factories.hpp" #include "lbann/utils/file_utils.hpp" #include @@ -82,8 +83,6 @@ void init_data_readers( for (int j=0; jset_dependent_variable_type(dependent_type); - - const lbann_data::ImagePreprocessor& pb_preproc = readme.image_preprocessor(); - reader_jag->set_image_dims(pb_preproc.raw_width(), pb_preproc.raw_height()); - reader_jag->set_normalization_mode(pb_preproc.early_normalization()); reader = reader_jag; - set_up_generic_preprocessor = false; } else if (name == "jag_conduit") { init_image_data_reader(readme, pb_metadata, master, reader); + set_transform_pipeline = false; auto reader_jag_conduit = dynamic_cast(reader); const lbann_data::Model& pb_model = p.model(); reader->set_mini_batch_size(static_cast(pb_model.mini_batch_size())); @@ -174,10 +172,9 @@ void init_data_readers( break; } } - set_up_generic_preprocessor = false; } else if (name == "jag_conduit_hdf5") { 
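+    // init_image_data_reader installs the reader's transform pipeline itself,
+    // so the generic construct_transform_pipeline call below is skipped.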
init_image_data_reader(readme, pb_metadata, master, reader); - set_up_generic_preprocessor = false; + set_transform_pipeline = false; } else if (name == "nci") { reader = new data_reader_nci(shuffle); } else if (name == "csv") { @@ -240,7 +237,6 @@ void init_data_readers( npy_readers.push_back(reader_numpy_npz); } else if (readme.format() == "jag_conduit") { init_image_data_reader(readme, pb_metadata, master, reader); - set_up_generic_preprocessor = false; npy_readers.push_back(reader); } else if (readme.format() == "pilot2_molecular_reader") { pilot2_molecular_reader* reader_pilot2_molecular = new pilot2_molecular_reader(readme.num_neighbors(), readme.max_neighborhood(), shuffle); @@ -347,6 +343,11 @@ void init_data_readers( } reader->set_comm(comm); + if (set_transform_pipeline) { + reader->set_transform_pipeline( + std::move(proto::construct_transform_pipeline(readme))); + } + if (readme.data_filename() != "") { reader->set_data_filename( readme.data_filename() ); } @@ -380,10 +381,6 @@ void init_data_readers( reader->set_gan_label_value(readme.gan_label_value()); reader->set_partitioned(readme.is_partitioned(), readme.partition_overlap(), readme.partition_mode()); - - if (set_up_generic_preprocessor) { - init_generic_preprocessor(readme, master, reader); - } } if (readme.role() == "train") { @@ -426,14 +423,8 @@ void init_data_readers( reader_validation = new numpy_npz_conduit_reader(*dynamic_cast(reader)); } else if (name == "imagenet") { reader_validation = new imagenet_reader(*dynamic_cast(reader), reader->get_unused_indices()); - } else if (name == "imagenet_patches") { - reader_validation = new imagenet_reader_patches(*dynamic_cast(reader)); } else if (name == "multihead_siamese") { reader_validation = new data_reader_multihead_siamese(*dynamic_cast(reader)); - } else if (name == "mnist_siamese") { - reader_validation = new data_reader_mnist_siamese(*dynamic_cast(reader)); - } else if (name == "multi_images") { - reader_validation = new data_reader_multi_images(*dynamic_cast(reader)); } else if (name == "jag") { reader_validation = new data_reader_jag(shuffle); *dynamic_cast(reader_validation) = *dynamic_cast(reader); diff --git a/src/data_readers/patchworks/CMakeLists.txt b/src/transforms/CMakeLists.txt similarity index 52% rename from src/data_readers/patchworks/CMakeLists.txt rename to src/transforms/CMakeLists.txt index 860d5008980..d934e6b5dbd 100644 --- a/src/data_readers/patchworks/CMakeLists.txt +++ b/src/transforms/CMakeLists.txt @@ -1,10 +1,14 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES - patchworks.cpp - patchworks_ROI.cpp - patchworks_patch_descriptor.cpp - patchworks_stats.cpp + normalize.cpp + repack_HWC_to_CHW_layout.cpp + sample_normalize.cpp + scale.cpp + scale_and_translate.cpp + transform_pipeline.cpp ) +add_subdirectory(vision) + # Propagate the files up the tree set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) diff --git a/src/transforms/normalize.cpp b/src/transforms/normalize.cpp new file mode 100644 index 00000000000..d4803185d30 --- /dev/null +++ b/src/transforms/normalize.cpp @@ -0,0 +1,105 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/transforms/normalize.hpp"
+
+namespace lbann {
+namespace transform {
+
+void normalize::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  // Ensure we have the right number of channels.
+  if (dims.size() == 3 && m_means.size() != dims[0]) {
+    LBANN_ERROR("Normalize channels does not match data");
+  } else if (dims.size() != 3 && m_means.size() != 1) {
+    LBANN_ERROR("Transform data has no channels, cannot normalize with multiple channels");
+  }
+  // Only work with DataTypes to avoid rounding/floating point issues.
+  auto& mat = data.template get<DataType>();
+  if (mat.Height() != mat.LDim()) {
+    LBANN_ERROR("Normalizing non-contiguous matrix not supported");
+  }
+  DataType* __restrict__ buf = mat.Buffer();
+  if (m_means.size() == 1) {
+    const DataType mean = m_means[0];
+    const DataType std = m_stds[0];
+    const El::Int size = mat.Height() * mat.Width();
+    for (El::Int i = 0; i < size; ++i) {
+      buf[i] = (buf[i] - mean) / std;
+    }
+  } else {
+    for (size_t channel = 0; channel < dims[0]; ++channel) {
+      const DataType mean = m_means[channel];
+      const DataType std = m_stds[channel];
+      const size_t size = dims[1] * dims[2];
+      const size_t channel_start = channel*size;
+      const size_t channel_end = channel_start + size;
+      for (size_t i = channel_start; i < channel_end; ++i) {
+        buf[i] = (buf[i] - mean) / std;
+      }
+    }
+  }
+}
+
+void normalize::apply(utils::type_erased_matrix& data, CPUMat& out,
+                      std::vector<size_t>& dims) {
+  // Ensure we have the right number of channels.
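+  // (dims is ordered {channels, height, width}, so a per-channel normalize
+  //  needs exactly dims[0] mean/stddev pairs.)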
+  if (dims.size() == 3 && m_means.size() != dims[0]) {
+    LBANN_ERROR("Normalize channels does not match data");
+  } else if (dims.size() != 3 && m_means.size() != 1) {
+    LBANN_ERROR("Transform data has no channels, cannot normalize with multiple channels");
+  }
+  if (out.Height() != out.LDim()) {
+    LBANN_ERROR("Normalizing to non-contiguous matrix not supported.");
+  }
+  const auto& src = data.template get<DataType>();
+  if (src.Height() != src.LDim()) {
+    LBANN_ERROR("Normalizing from non-contiguous matrix not supported.");
+  }
+  const DataType* __restrict__ src_buf = src.LockedBuffer();
+  DataType* __restrict__ dst_buf = out.Buffer();
+  if (m_means.size() == 1) {
+    const DataType mean = m_means[0];
+    const DataType std = m_stds[0];
+    const El::Int size = src.Height() * src.Width();
+    for (El::Int i = 0; i < size; ++i) {
+      dst_buf[i] = (src_buf[i] - mean) / std;
+    }
+  } else {
+    for (size_t channel = 0; channel < dims[0]; ++channel) {
+      const DataType mean = m_means[channel];
+      const DataType std = m_stds[channel];
+      const size_t size = dims[1] * dims[2];
+      const size_t channel_start = channel*size;
+      const size_t channel_end = channel_start + size;
+      for (size_t i = channel_start; i < channel_end; ++i) {
+        dst_buf[i] = (src_buf[i] - mean) / std;
+      }
+    }
+  }
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/repack_HWC_to_CHW_layout.cpp b/src/transforms/repack_HWC_to_CHW_layout.cpp
new file mode 100644
index 00000000000..113f20076c6
--- /dev/null
+++ b/src/transforms/repack_HWC_to_CHW_layout.cpp
@@ -0,0 +1,84 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/transforms/repack_HWC_to_CHW_layout.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void repack_HWC_to_CHW_layout::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  auto dst = CPUMat(utils::get_linearized_size(dims), 1);
+  apply(data, dst, dims);
+  data.emplace(std::move(dst));
+}
+
+void repack_HWC_to_CHW_layout::apply(utils::type_erased_matrix& data, CPUMat& out,
+                                     std::vector<size_t>& dims) {
+  CPUMat &src = data.template get<DataType>();
+  if (!src.Contiguous()) {
+    LBANN_ERROR("RepackHWCtoCHWLayout does not support non-contiguous src.");
+  }
+  if (!out.Contiguous()) {
+    LBANN_ERROR("RepackHWCtoCHWLayout does not support non-contiguous destination.");
+  }
+  const DataType* __restrict__ src_buf = src.LockedBuffer();
+  const size_t out_size = utils::get_linearized_size(dims);
+  if (static_cast<size_t>(out.Height() * out.Width()) != out_size) {
+    LBANN_ERROR("Transform output does not have sufficient space.");
+  }
+  DataType* __restrict__ dst_buf = out.Buffer();
+  // Pack an interleaved multi-channel data structure into a
+  // channel-strided data structure
+  const size_t size = dims[1] * dims[2];
+  for (size_t row = 0; row < dims[1]; ++row) {
+    for (size_t col = 0; col < dims[2]; ++col) {
+      int N = dims[0];
+      // Multiply by N because there are N channels.
+      const size_t src_base = N*(row + col*dims[1]);
+      const size_t dst_base = row + col*dims[1];
+      switch(N) {
+      case 4:
+        dst_buf[dst_base + 3*size] = src_buf[src_base + 3];
+        [[fallthrough]];
+      case 3:
+        dst_buf[dst_base + 2*size] = src_buf[src_base + 2];
+        [[fallthrough]];
+      case 2:
+        dst_buf[dst_base + size] = src_buf[src_base + 1];
+        [[fallthrough]];
+      case 1:
+        dst_buf[dst_base] = src_buf[src_base];
+        break;
+      default:
+        LBANN_ERROR("Unsupported number of channels");
+      }
+    }
+  }
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/sample_normalize.cpp b/src/transforms/sample_normalize.cpp
new file mode 100644
index 00000000000..c52d78c3ecf
--- /dev/null
+++ b/src/transforms/sample_normalize.cpp
@@ -0,0 +1,49 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/transforms/sample_normalize.hpp" +#include "lbann/utils/statistics.hpp" + +namespace lbann { +namespace transform { + +void sample_normalize::apply(utils::type_erased_matrix& data, std::vector&) { + // Only work with DataTypes to avoid rounding/floating point issues. + auto& mat = data.template get(); + if (mat.Height() != mat.LDim()) { + LBANN_ERROR("Normalizing non-contiguous matrix not supported."); + } + DataType mean, stdev; + entrywise_mean_and_stdev(mat, mean, stdev); + DataType* __restrict__ buf = mat.Buffer(); + const El::Int size = mat.Height() * mat.Width(); + for (El::Int i = 0; i < size; ++i) { + buf[i] = (buf[i] - mean) / stdev; + } +} + +} // namespace transform +} // namespace lbann diff --git a/src/transforms/scale.cpp b/src/transforms/scale.cpp new file mode 100644 index 00000000000..9c2ceb45a38 --- /dev/null +++ b/src/transforms/scale.cpp @@ -0,0 +1,48 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/transforms/scale.hpp" + +namespace lbann { +namespace transform { + +void scale::apply(utils::type_erased_matrix& data, std::vector&) { + // Currently only works on DataTypes. + // Need to decide how to handle uint8_t matrices. + auto& mat = data.template get(); + if (mat.Height() != mat.LDim()) { + LBANN_ERROR("Scaling non-contiguous matrix not supported."); + } + // Don't use El::Scale because it spawns OpenMP threads. + DataType* __restrict__ buf = mat.Buffer(); + const El::Int size = mat.Height() * mat.Width(); + for (El::Int i = 0; i < size; ++i) { + buf[i] *= m_scale; + } +} + +} // namespace transform +} // namespace lbann diff --git a/src/transforms/scale_and_translate.cpp b/src/transforms/scale_and_translate.cpp new file mode 100644 index 00000000000..8fe5d6ec3f4 --- /dev/null +++ b/src/transforms/scale_and_translate.cpp @@ -0,0 +1,48 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/transforms/scale_and_translate.hpp"
+
+namespace lbann {
+namespace transform {
+
+void scale_and_translate::apply(utils::type_erased_matrix& data, std::vector<size_t>&) {
+  // Currently only works on DataTypes.
+  // Need to decide how to handle uint8_t matrices.
+  auto& mat = data.template get<DataType>();
+  if (!mat.Contiguous()) {
+    LBANN_ERROR("Scaling and translating non-contiguous matrix not supported.");
+  }
+  // Don't use El::Scale because it spawns OpenMP threads.
+  DataType* __restrict__ buf = mat.Buffer();
+  const El::Int size = mat.Height() * mat.Width();
+  for (El::Int i = 0; i < size; ++i) {
+    buf[i] = m_scale * buf[i] + m_translate;
+  }
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/transform_pipeline.cpp b/src/transforms/transform_pipeline.cpp
new file mode 100644
index 00000000000..3c1c172a0a4
--- /dev/null
+++ b/src/transforms/transform_pipeline.cpp
@@ -0,0 +1,110 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/transforms/transform_pipeline.hpp"
+#include "lbann/utils/exception.hpp"
+
+namespace lbann {
+namespace transform {
+
+transform_pipeline::transform_pipeline(const transform_pipeline& other) :
+  m_expected_out_dims(other.m_expected_out_dims) {
+  for (const auto& trans : other.m_transforms) {
+    m_transforms.emplace_back(trans->copy());
+  }
+}
+
+transform_pipeline& transform_pipeline::operator=(
+  const transform_pipeline& other) {
+  m_expected_out_dims = other.m_expected_out_dims;
+  m_transforms.clear();
+  for (const auto& trans : other.m_transforms) {
+    m_transforms.emplace_back(trans->copy());
+  }
+  return *this;
+}
+
+void transform_pipeline::apply(utils::type_erased_matrix& data,
+                               std::vector<size_t>& dims) {
+  for (auto& trans : m_transforms) {
+    trans->apply(data, dims);
+  }
+  assert_expected_out_dims(dims);
+}
+
+void transform_pipeline::apply(CPUMat& data, std::vector<size_t>& dims) {
+  utils::type_erased_matrix m = utils::type_erased_matrix(std::move(data));
+  apply(m, dims);
+  data = std::move(m.template get<DataType>());
+}
+
+void transform_pipeline::apply(El::Matrix<uint8_t>& data, CPUMat& out_data,
+                               std::vector<size_t>& dims) {
+  utils::type_erased_matrix m = utils::type_erased_matrix(std::move(data));
+  if (!m_transforms.empty()) {
+    bool applied_non_inplace = false;
+    size_t i = 0;
+    for (; !applied_non_inplace && i < m_transforms.size(); ++i) {
+      if (m_transforms[i]->supports_non_inplace()) {
+        applied_non_inplace = true;
+        m_transforms[i]->apply(m, out_data, dims);
+      } else {
+        m_transforms[i]->apply(m, dims);
+      }
+    }
+    if (!applied_non_inplace) {
+      LBANN_ERROR("No transform to go from uint8 -> DataType");
+    }
+    if (i < m_transforms.size()) {
+      // Apply the remaining transforms.
+      // TODO(pp): Prevent out_data from being resized/reallocated.
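+      // The remaining transforms are in-place, so they can run directly on
+      // out_data: move it into the type-erased wrapper, apply them, and move
+      // it back out. A transform that reallocated here would drop the buffer
+      // the caller handed us, hence the TODO above.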
+      m = utils::type_erased_matrix(std::move(out_data));
+      for (; i < m_transforms.size(); ++i) {
+        m_transforms[i]->apply(m, dims);
+      }
+      out_data = std::move(m.template get<DataType>());
+    }
+  } else {
+    LBANN_ERROR("No transform to go from uint8 -> DataType");
+  }
+  assert_expected_out_dims(dims);
+}
+
+void transform_pipeline::assert_expected_out_dims(
+  const std::vector<size_t>& dims) {
+  if (!m_expected_out_dims.empty() && dims != m_expected_out_dims) {
+    std::stringstream ss;
+    ss << "Transformed dims do not match expected dims, got {";
+    for (const auto& d : dims) { ss << d << " "; }
+    ss << "} expected {";
+    for (const auto& d : m_expected_out_dims) { ss << d << " "; }
+    ss << "}";
+    LBANN_ERROR(ss.str());
+  }
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/unit_test/CMakeLists.txt b/src/transforms/unit_test/CMakeLists.txt
new file mode 100644
index 00000000000..c93aeb4207c
--- /dev/null
+++ b/src/transforms/unit_test/CMakeLists.txt
@@ -0,0 +1,9 @@
+set_full_path(_DIR_LBANN_CATCH2_TEST_FILES
+  normalize_test.cpp
+  sample_normalize_test.cpp
+  scale_test.cpp
+  transform_pipeline_test.cpp
+  )
+
+set(LBANN_CATCH2_TEST_FILES
+  "${LBANN_CATCH2_TEST_FILES}" "${_DIR_LBANN_CATCH2_TEST_FILES}" PARENT_SCOPE)
diff --git a/src/transforms/unit_test/normalize_test.cpp b/src/transforms/unit_test/normalize_test.cpp
new file mode 100644
index 00000000000..5211b2e271f
--- /dev/null
+++ b/src/transforms/unit_test/normalize_test.cpp
@@ -0,0 +1,96 @@
+// MUST include this
+#include <catch2/catch.hpp>
+
+// File being tested
+#include <lbann/transforms/normalize.hpp>
+
+TEST_CASE("Testing normalize preprocessing", "[preproc]") {
+  SECTION("matrix with no channels") {
+    lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(lbann::CPUMat());
+    El::Ones(mat.template get<lbann::DataType>(), 3, 3);
+    El::Scale(2.0f, mat.template get<lbann::DataType>());
+    std::vector<size_t> dims = {3, 3};
+    auto normalizer = lbann::transform::normalize({0.5}, {2.0});
+    SECTION("applying the normalizer") {
+      REQUIRE_NOTHROW(normalizer.apply(mat, dims));
+
+      SECTION("normalizing does not change dims") {
+        REQUIRE(dims[0] == 3);
+        REQUIRE(dims[1] == 3);
+      }
+      SECTION("normalizing does not change matrix type") {
+        REQUIRE_NOTHROW(mat.template get<lbann::DataType>());
+      }
+      SECTION("normalizing produces correct values") {
+        auto& real_mat = mat.template get<lbann::DataType>();
+        for (El::Int col = 0; col < 3; ++col) {
+          for (El::Int row = 0; row < 3; ++row) {
+            REQUIRE(real_mat(row, col) == Approx(0.75));
+          }
+        }
+      }
+    }
+  }
+
+  SECTION("matrix with one channel") {
+    lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(lbann::CPUMat());
+    El::Ones(mat.template get<lbann::DataType>(), 3, 3);
+    El::Scale(2.0f, mat.template get<lbann::DataType>());
+    std::vector<size_t> dims = {1, 3, 3};
+    auto normalizer = lbann::transform::normalize({0.5}, {2.0});
+    SECTION("applying the normalizer") {
+      REQUIRE_NOTHROW(normalizer.apply(mat, dims));
+
+      SECTION("normalizing does not change dims") {
+        REQUIRE(dims[0] == 1);
+        REQUIRE(dims[1] == 3);
+        REQUIRE(dims[2] == 3);
+      }
+      SECTION("normalizing does not change matrix type") {
+        REQUIRE_NOTHROW(mat.template get<lbann::DataType>());
+      }
+      SECTION("normalizing produces correct values") {
+        auto& real_mat = mat.template get<lbann::DataType>();
+        for (El::Int col = 0; col < 3; ++col) {
+          for (El::Int row = 0; row < 3; ++row) {
+            REQUIRE(real_mat(row, col) == Approx(0.75));
+          }
+        }
+      }
+    }
+  }
+
+  SECTION("matrix with three channels") {
+    lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(lbann::CPUMat());
+    El::Ones(mat.template get<lbann::DataType>(), 27, 1);
+    El::Scale(2.0f, mat.template get<lbann::DataType>());
+    std::vector<size_t> dims = {3, 3, 3};
+    auto normalizer = lbann::transform::normalize({0.75, 0.5, 0.25},
+                                                  {1.0, 2.0, 4.0});
+    SECTION("applying the normalizer") {
+      REQUIRE_NOTHROW(normalizer.apply(mat, dims));
+
+      SECTION("normalizing does not change dims") {
+        REQUIRE(dims[0] == 3);
+        REQUIRE(dims[1] == 3);
+        REQUIRE(dims[2] == 3);
+      }
+      SECTION("normalizing does not change matrix type") {
+        REQUIRE_NOTHROW(mat.template get<lbann::DataType>());
+      }
+      SECTION("normalizing produces correct values") {
+        auto& real_mat = mat.template get<lbann::DataType>();
+        const lbann::DataType* buf = real_mat.Buffer();
+        for (size_t i = 0; i < 9; ++i) {
+          REQUIRE(buf[i] == Approx(1.25));
+        }
+        for (size_t i = 9; i < 18; ++i) {
+          REQUIRE(buf[i] == Approx(0.75));
+        }
+        for (size_t i = 18; i < 27; ++i) {
+          REQUIRE(buf[i] == Approx(0.4375));
+        }
+      }
+    }
+  }
+}
diff --git a/src/transforms/unit_test/sample_normalize_test.cpp b/src/transforms/unit_test/sample_normalize_test.cpp
new file mode 100644
index 00000000000..a8bfb434f1d
--- /dev/null
+++ b/src/transforms/unit_test/sample_normalize_test.cpp
@@ -0,0 +1,36 @@
+// MUST include this
+#include <catch2/catch.hpp>
+
+// File being tested
+#include <lbann/transforms/sample_normalize.hpp>
+
+TEST_CASE("Testing sample normalize preprocessing", "[preproc]") {
+  lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(lbann::CPUMat());
+  El::Identity(mat.template get<lbann::DataType>(), 3, 3);
+  El::Scale(2.0, mat.template get<lbann::DataType>());
+  std::vector<size_t> dims = {3, 3};
+  auto normalizer = lbann::transform::sample_normalize();
+  SECTION("applying the normalizer") {
+    REQUIRE_NOTHROW(normalizer.apply(mat, dims));
+
+    SECTION("normalizing does not change dims") {
+      REQUIRE(dims[0] == 3);
+      REQUIRE(dims[1] == 3);
+    }
+    SECTION("normalizing does not change matrix type") {
+      REQUIRE_NOTHROW(mat.template get<lbann::DataType>());
+    }
+    SECTION("normalizing produces correct values") {
+      auto& real_mat = mat.template get<lbann::DataType>();
+      for (El::Int col = 0; col < 3; ++col) {
+        for (El::Int row = 0; row < 3; ++row) {
+          if (row == col) {
+            REQUIRE(real_mat(row, col) == Approx(1.41421356));
+          } else {
+            REQUIRE(real_mat(row, col) == Approx(-0.70710678));
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/src/transforms/unit_test/scale_test.cpp b/src/transforms/unit_test/scale_test.cpp
new file mode 100644
index 00000000000..b21c4c408c9
--- /dev/null
+++ b/src/transforms/unit_test/scale_test.cpp
@@ -0,0 +1,32 @@
+// MUST include this
+#include <catch2/catch.hpp>
+
+// File being tested
+#include <lbann/transforms/scale.hpp>
+
+TEST_CASE("Testing scale preprocessing", "[preproc]") {
+  lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(lbann::CPUMat());
+  El::Ones(mat.template get<lbann::DataType>(), 3, 3);
+  std::vector<size_t> dims = {3, 3};
+  auto scaler = lbann::transform::scale(2.0);
+
+  SECTION("applying the scaler") {
+    REQUIRE_NOTHROW(scaler.apply(mat, dims));
+
+    SECTION("scaling does not change dims") {
+      REQUIRE(dims[0] == 3);
+      REQUIRE(dims[1] == 3);
+    }
+    SECTION("scaling does not change matrix type") {
+      REQUIRE_NOTHROW(mat.template get<lbann::DataType>());
+    }
+    SECTION("scaling changes matrix values") {
+      auto& real_mat = mat.template get<lbann::DataType>();
+      for (El::Int col = 0; col < 3; ++col) {
+        for (El::Int row = 0; row < 3; ++row) {
+          REQUIRE(real_mat(row, col) == 2.0);
+        }
+      }
+    }
+  }
+}
diff --git a/src/transforms/unit_test/transform_pipeline_test.cpp b/src/transforms/unit_test/transform_pipeline_test.cpp
new file mode 100644
index 00000000000..dff68c4b883
--- /dev/null
+++ b/src/transforms/unit_test/transform_pipeline_test.cpp
@@ -0,0 +1,38 @@
+// MUST include this
+#include <catch2/catch.hpp>
+
+// File being tested
+#include <lbann/transforms/transform_pipeline.hpp>
+#include <lbann/transforms/scale.hpp>
+#include <lbann/transforms/sample_normalize.hpp>
+#include <lbann/utils/memory.hpp>
+
+TEST_CASE("Testing transform pipeline", "[preproc]") {
+  lbann::transform::transform_pipeline p;
+  p.add_transform(lbann::make_unique<lbann::transform::scale>(2.0f));
+  p.add_transform(lbann::make_unique<lbann::transform::sample_normalize>());
+  lbann::CPUMat mat;
+  El::Identity(mat, 3, 3);
+  std::vector<size_t> dims = {3, 3};
+
+  SECTION("applying the pipeline") {
+    REQUIRE_NOTHROW(p.apply(mat, dims));
+
+    SECTION("pipeline does not change dims") {
+      REQUIRE(dims[0] == 3);
+      REQUIRE(dims[1] == 3);
+    }
+
+    SECTION("pipeline produces correct values") {
+      for (El::Int col = 0; col < 3; ++col) {
+        for (El::Int row = 0; row < 3; ++row) {
+          if (row == col) {
+            REQUIRE(mat(row, col) == Approx(1.41421356));
+          } else {
+            REQUIRE(mat(row, col) == Approx(-0.70710678));
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/src/transforms/vision/CMakeLists.txt b/src/transforms/vision/CMakeLists.txt
new file mode 100644
index 00000000000..b2f0781eaa1
--- /dev/null
+++ b/src/transforms/vision/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Add the source files for this directory
+set_full_path(THIS_DIR_SOURCES
+  center_crop.cpp
+  colorize.cpp
+  grayscale.cpp
+  horizontal_flip.cpp
+  normalize_to_lbann_layout.cpp
+  random_affine.cpp
+  random_crop.cpp
+  random_resized_crop.cpp
+  random_resized_crop_with_fixed_aspect_ratio.cpp
+  resize.cpp
+  resized_center_crop.cpp
+  to_lbann_layout.cpp
+  vertical_flip.cpp
+  )
+
+# Propagate the files up the tree
+set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE)
diff --git a/src/transforms/vision/center_crop.cpp b/src/transforms/vision/center_crop.cpp
new file mode 100644
index 00000000000..9f16ccb4e78
--- /dev/null
+++ b/src/transforms/vision/center_crop.cpp
@@ -0,0 +1,65 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <cmath>
+#include "lbann/transforms/vision/center_crop.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void center_crop::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  cv::Mat src = utils::get_opencv_mat(data, dims);
+  if (dims[1] < m_h || dims[2] < m_w) {
+    std::stringstream ss;
+    ss << "Center crop to " << m_h << "x" << m_w
+       << " applied to input " << dims[1] << "x" << dims[2];
+    LBANN_ERROR(ss.str());
+  }
+  std::vector<size_t> new_dims = {dims[0], m_h, m_w};
+  auto dst_real = El::Matrix<uint8_t>(utils::get_linearized_size(new_dims), 1);
+  cv::Mat dst = utils::get_opencv_mat(dst_real, new_dims);
+  // Compute upper-left corner of crop.
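+  // For example, a 3x3 crop of a 5x5 input gives
+  // x = y = round((5 - 3) / 2) = 1, i.e. the crop covers rows and
+  // columns 1 through 3 (the case exercised by the unit test).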
+  const size_t x = std::round(float(src.cols - m_w) / 2.0);
+  const size_t y = std::round(float(src.rows - m_h) / 2.0);
+  // Sanity check.
+  if (x >= static_cast<size_t>(src.cols) ||
+      y >= static_cast<size_t>(src.rows) ||
+      (x + m_w) > static_cast<size_t>(src.cols) ||
+      (y + m_h) > static_cast<size_t>(src.rows)) {
+    std::stringstream ss;
+    ss << "Bad crop dimensions for " << src.rows << "x" << src.cols << ": "
+       << m_h << "x" << m_w << " at (" << x << "," << y << ")";
+    LBANN_ERROR(ss.str());
+  }
+  // Copy is needed to ensure this is continuous.
+  src(cv::Rect(x, y, m_w, m_h)).copyTo(dst);
+  data.emplace<uint8_t>(std::move(dst_real));
+  dims = new_dims;
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/colorize.cpp b/src/transforms/vision/colorize.cpp
new file mode 100644
index 00000000000..4ece618b727
--- /dev/null
+++ b/src/transforms/vision/colorize.cpp
@@ -0,0 +1,48 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <opencv2/imgproc.hpp>
+#include "lbann/transforms/vision/colorize.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void colorize::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  cv::Mat src = utils::get_opencv_mat(data, dims);
+  if (dims[0] != 1) {
+    return;  // Already color.
+  }
+  std::vector<size_t> new_dims = {3, dims[1], dims[2]};
+  auto dst_real = El::Matrix<uint8_t>(utils::get_linearized_size(new_dims), 1);
+  cv::Mat dst = utils::get_opencv_mat(dst_real, new_dims);
+  cv::cvtColor(src, dst, cv::COLOR_GRAY2BGR);
+  data.emplace<uint8_t>(std::move(dst_real));
+  dims = new_dims;
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/grayscale.cpp b/src/transforms/vision/grayscale.cpp
new file mode 100644
index 00000000000..e5b4a54fd6d
--- /dev/null
+++ b/src/transforms/vision/grayscale.cpp
@@ -0,0 +1,48 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <opencv2/imgproc.hpp>
+#include "lbann/transforms/vision/grayscale.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void grayscale::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  cv::Mat src = utils::get_opencv_mat(data, dims);
+  if (dims[0] == 1) {
+    return;  // Only one channel: Already grayscale.
+  }
+  std::vector<size_t> new_dims = {1, dims[1], dims[2]};
+  auto dst_real = El::Matrix<uint8_t>(utils::get_linearized_size(new_dims), 1);
+  cv::Mat dst = utils::get_opencv_mat(dst_real, new_dims);
+  cv::cvtColor(src, dst, cv::COLOR_BGR2GRAY);
+  data.emplace<uint8_t>(std::move(dst_real));
+  dims = new_dims;
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/horizontal_flip.cpp b/src/transforms/vision/horizontal_flip.cpp
new file mode 100644
index 00000000000..3bdc190178f
--- /dev/null
+++ b/src/transforms/vision/horizontal_flip.cpp
@@ -0,0 +1,44 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/transforms/vision/horizontal_flip.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void horizontal_flip::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  if (transform::get_bool_random(m_p)) {
+    cv::Mat src = utils::get_opencv_mat(data, dims);
+    auto dst_real = El::Matrix<uint8_t>(utils::get_linearized_size(dims), 1);
+    cv::Mat dst = utils::get_opencv_mat(dst_real, dims);
+    cv::flip(src, dst, 1);
+    data.emplace<uint8_t>(std::move(dst_real));
+  }
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/normalize_to_lbann_layout.cpp b/src/transforms/vision/normalize_to_lbann_layout.cpp
new file mode 100644
index 00000000000..c65eef9052b
--- /dev/null
+++ b/src/transforms/vision/normalize_to_lbann_layout.cpp
@@ -0,0 +1,94 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/transforms/vision/normalize_to_lbann_layout.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void normalize_to_lbann_layout::apply(utils::type_erased_matrix& data,
+                                      std::vector<size_t>& dims) {
+  auto dst = CPUMat(utils::get_linearized_size(dims), 1);
+  apply(data, dst, dims);
+  data.emplace<DataType>(std::move(dst));
+}
+
+void normalize_to_lbann_layout::apply(utils::type_erased_matrix& data,
+                                      CPUMat& out,
+                                      std::vector<size_t>& dims) {
+  cv::Mat src = utils::get_opencv_mat(data, dims);
+  if (!src.isContinuous()) {
+    // This should not occur, but just in case.
+    LBANN_ERROR("Do not support non-contiguous OpenCV matrices.");
+  }
+  // Ensure we have the right number of channels.
+  if (dims.size() == 3 && m_means.size() != dims[0]) {
+    LBANN_ERROR("Normalize channels does not match data");
+  } else if (dims.size() != 3 && m_means.size() != 1) {
+    LBANN_ERROR("Transform data has no channels, cannot normalize with multiple channels");
+  }
+  if (!out.Contiguous()) {
+    LBANN_ERROR("NormalizeToLBANNLayout does not support non-contiguous destination.");
+  }
+  const uint8_t* __restrict__ src_buf = src.ptr();
+  const size_t out_size = utils::get_linearized_size(dims);
+  if (static_cast<size_t>(out.Height() * out.Width()) != out_size) {
+    LBANN_ERROR("Transform output does not have sufficient space.");
+  }
+  DataType* __restrict__ dst_buf = out.Buffer();
+  const float scale = 1.0f / 255.0f;
+  if (dims[0] == 1) {
+    // Greyscale.
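+    // The source is row-major (index row*W + col) while the LBANN matrix
+    // is column-major (index row + col*H); e.g. for a 2x3 image, pixel
+    // (0, 1) moves from source index 1 to destination index 2.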
+    const DataType mean = m_means[0];
+    const DataType std = m_stds[0];
+    for (size_t row = 0; row < dims[1]; ++row) {
+      for (size_t col = 0; col < dims[2]; ++col) {
+        dst_buf[row + col*dims[1]] =
+          (src_buf[row*dims[2] + col] * scale - mean) / std;
+      }
+    }
+  } else {
+    // RGB/three-channel.
+    const size_t size = dims[1] * dims[2];
+    for (size_t row = 0; row < dims[1]; ++row) {
+      for (size_t col = 0; col < dims[2]; ++col) {
+        // Multiply by 3 because there are three channels.
+        const size_t src_base = 3*(row*dims[2] + col);
+        const size_t dst_base = row + col*dims[1];
+        dst_buf[dst_base] =
+          (src_buf[src_base] * scale - m_means[0]) / m_stds[0];
+        dst_buf[dst_base + size] =
+          (src_buf[src_base + 1] * scale - m_means[1]) / m_stds[1];
+        dst_buf[dst_base + 2*size] =
+          (src_buf[src_base + 2] * scale - m_means[2]) / m_stds[2];
+      }
+    }
+  }
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/random_affine.cpp b/src/transforms/vision/random_affine.cpp
new file mode 100644
index 00000000000..90d6d7f55b8
--- /dev/null
+++ b/src/transforms/vision/random_affine.cpp
@@ -0,0 +1,103 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <opencv2/imgproc.hpp>
+#include "lbann/transforms/vision/random_affine.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void random_affine::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  cv::Mat src = utils::get_opencv_mat(data, dims);
+  auto dst_real = El::Matrix<uint8_t>(utils::get_linearized_size(dims), 1);
+  cv::Mat dst = utils::get_opencv_mat(dst_real, dims);
+  // Compute the random quantities for the transform.
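+  // Each component below is sampled independently; leaving a component's
+  // parameters at zero keeps that part of the transform at the identity.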
+  // For converting to radians:
+  constexpr float pi_rad = 3.14159265358979323846f / 180.0f;
+  float angle = 0.0f;
+  if (m_rotate_min != 0.0f || m_rotate_max != 0.0f) {
+    angle = transform::get_uniform_random(m_rotate_min, m_rotate_max) * pi_rad;
+  }
+  float translate_x = 0.0f;
+  if (m_translate_w != 0.0f) {
+    const float dx = dims[2]*m_translate_w;
+    translate_x = std::round(transform::get_uniform_random(-dx, dx));
+  }
+  float translate_y = 0.0f;
+  if (m_translate_h != 0.0f) {
+    const float dy = dims[1]*m_translate_h;
+    translate_y = std::round(transform::get_uniform_random(-dy, dy));
+  }
+  float scale = 1.0f;
+  if (m_scale_min != 0.0f || m_scale_max != 0.0f) {
+    scale = transform::get_uniform_random(m_scale_min, m_scale_max);
+  }
+  float shear = 0.0f;
+  if (m_shear_min != 0.0f || m_shear_max != 0.0f) {
+    shear = transform::get_uniform_random(m_shear_min, m_shear_max) * pi_rad;
+  }
+  // Centering matrix:
+  const float center_x = dims[2]*0.5f + 0.5f;
+  const float center_y = dims[1]*0.5f + 0.5f;
+  // Compute the affine transformation matrix: M = T * C * R * S * Sc * C^-1
+  // where
+  // T = [1 0 translate_x | 0 1 translate_y | 0 0 1]
+  // is the translation matrix,
+  // C = [1 0 center_x | 0 1 center_y | 0 0 1]
+  // is the centering matrix,
+  // R = [cos(angle) -sin(angle) 0 | sin(angle) cos(angle) 0 | 0 0 1]
+  // is the rotation matrix,
+  // S = [1 -sin(shear) 0 | 0 cos(shear) 0 | 0 0 1]
+  // is the shear matrix, and
+  // Sc = [scale 0 0 | 0 scale 0 | 0 0 1]
+  // is the scale matrix.
+  // The centering matrix is used to ensure we rotate/shear about the center
+  // of the image.
+  // What we actually need is the inverse affine map (destination -> source):
+  // M^-1 = C * Sc^-1 S^-1 R^-1 C^-1 T^-1.
+  // This is a bit ugly to write out fully, but the below is the result, care of
+  // Mathematica.
+  const float sec_shear_scale = 1.0f / std::cos(shear) / scale;
+  float affine_mat[2][3] = {
+    {std::cos(angle+shear)*sec_shear_scale, std::sin(angle+shear)*sec_shear_scale, 0.0f},
+    {-std::sin(angle)*sec_shear_scale, std::cos(angle)*sec_shear_scale, 0.0f}
+  };
+  affine_mat[0][2] = affine_mat[0][0]*(-center_x - translate_x) +
+    affine_mat[0][1]*(-center_y - translate_y) +
+    center_x;
+  affine_mat[1][2] = affine_mat[1][0]*(-center_x - translate_x) +
+    affine_mat[1][1]*(-center_y - translate_y) +
+    center_y;
+  cv::Mat cv_affine(2, 3, CV_32F, affine_mat);
+  cv::warpAffine(src, dst, cv_affine, dst.size(),
+                 cv::INTER_LINEAR | cv::WARP_INVERSE_MAP,
+                 cv::BORDER_REPLICATE);
+  data.emplace<uint8_t>(std::move(dst_real));
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/random_crop.cpp b/src/transforms/vision/random_crop.cpp
new file mode 100644
index 00000000000..416bae6abfe
--- /dev/null
+++ b/src/transforms/vision/random_crop.cpp
@@ -0,0 +1,64 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/transforms/vision/random_crop.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void random_crop::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  cv::Mat src = utils::get_opencv_mat(data, dims);
+  if (dims[1] < m_h || dims[2] < m_w) {
+    std::stringstream ss;
+    ss << "Random crop to " << m_h << "x" << m_w
+       << " applied to input " << dims[1] << "x" << dims[2];
+    LBANN_ERROR(ss.str());
+  }
+  std::vector<size_t> new_dims = {dims[0], m_h, m_w};
+  auto dst_real = El::Matrix<uint8_t>(utils::get_linearized_size(new_dims), 1);
+  cv::Mat dst = utils::get_opencv_mat(dst_real, new_dims);
+  // Select the upper-left corner of the crop.
+  const size_t x = transform::get_uniform_random_int(0, dims[2] - m_w + 1);
+  const size_t y = transform::get_uniform_random_int(0, dims[1] - m_h + 1);
+  // Sanity check.
+  if (x >= static_cast<size_t>(src.cols) ||
+      y >= static_cast<size_t>(src.rows) ||
+      (x + m_w) > static_cast<size_t>(src.cols) ||
+      (y + m_h) > static_cast<size_t>(src.rows)) {
+    std::stringstream ss;
+    ss << "Bad crop dimensions for " << src.rows << "x" << src.cols << ": "
+       << m_h << "x" << m_w << " at (" << x << "," << y << ")";
+    LBANN_ERROR(ss.str());
+  }
+  // Copy is needed to ensure this is continuous.
+  src(cv::Rect(x, y, m_w, m_h)).copyTo(dst);
+  data.emplace<uint8_t>(std::move(dst_real));
+  dims = new_dims;
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/random_resized_crop.cpp b/src/transforms/vision/random_resized_crop.cpp
new file mode 100644
index 00000000000..0fa9420e538
--- /dev/null
+++ b/src/transforms/vision/random_resized_crop.cpp
@@ -0,0 +1,93 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <opencv2/imgproc.hpp>
+#include "lbann/transforms/vision/random_resized_crop.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void random_resized_crop::apply(utils::type_erased_matrix& data,
+                                std::vector<size_t>& dims) {
+  cv::Mat src = utils::get_opencv_mat(data, dims);
+  std::vector<size_t> new_dims = {dims[0], m_h, m_w};
+  auto dst_real = El::Matrix<uint8_t>(utils::get_linearized_size(new_dims), 1);
+  cv::Mat dst = utils::get_opencv_mat(dst_real, new_dims);
+  size_t x = 0, y = 0, h = 0, w = 0;
+  const size_t area = dims[1]*dims[2];
+  // There's a chance this can fail, so we only make ten attempts.
+  for (int attempt = 0; attempt < 10; ++attempt) {
+    const float target_area = area*transform::get_uniform_random(m_scale_min,
+                                                                 m_scale_max);
+    const float target_ar = transform::get_uniform_random(m_ar_min, m_ar_max);
+    w = std::sqrt(target_area * target_ar);
+    h = std::sqrt(target_area / target_ar);
+    // Swap these with 50% probability.
+    if (transform::get_bool_random(0.5)) {
+      std::swap(w, h);
+    }
+    if (w <= dims[2] && h <= dims[1]) {
+      x = transform::get_uniform_random_int(0, dims[2] - w + 1);
+      y = transform::get_uniform_random_int(0, dims[1] - h + 1);
+      break;
+    }
+    // Reset.
+    h = 0;
+    w = 0;
+  }
+  bool fallback = false;
+  // Fallback.
+  if (h == 0) {
+    fallback = true;
+    w = std::min(dims[1], dims[2]);
+    h = w;
+    x = (dims[2] - w) / 2;
+    y = (dims[1] - h) / 2;
+  }
+  // Sanity check.
+  if (x >= static_cast<size_t>(src.cols) ||
+      y >= static_cast<size_t>(src.rows) ||
+      (x + w) > static_cast<size_t>(src.cols) ||
+      (y + h) > static_cast<size_t>(src.rows)) {
+    std::stringstream ss;
+    ss << "Bad crop dimensions for " << src.rows << "x" << src.cols << ": "
+       << h << "x" << w << " at (" << x << "," << y << ") fallback=" << fallback;
+    LBANN_ERROR(ss.str());
+  }
+  // This is just a view.
+  cv::Mat tmp = src(cv::Rect(x, y, w, h));
+  cv::resize(tmp, dst, dst.size(), 0, 0, cv::INTER_LINEAR);
+  // Sanity check.
+  if (dst.ptr() != dst_real.Buffer()) {
+    LBANN_ERROR("Did not resize into dst_real.");
+  }
+  data.emplace<uint8_t>(std::move(dst_real));
+  dims = new_dims;
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.cpp b/src/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.cpp
new file mode 100644
index 00000000000..e66afd3acfa
--- /dev/null
+++ b/src/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.cpp
@@ -0,0 +1,71 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <opencv2/imgproc.hpp>
+#include "lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void random_resized_crop_with_fixed_aspect_ratio::apply(
+  utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  cv::Mat src = utils::get_opencv_mat(data, dims);
+  std::vector<size_t> new_dims = {dims[0], m_crop_h, m_crop_w};
+  auto dst_real = El::Matrix<uint8_t>(utils::get_linearized_size(new_dims), 1);
+  cv::Mat dst = utils::get_opencv_mat(dst_real, new_dims);
+  // Compute the projected crop area in the original image, crop it, and resize.
+  const float zoom = std::min(float(src.rows) / float(m_h),
+                              float(src.cols) / float(m_w));
+  const size_t zoom_h = m_h*zoom;
+  const size_t zoom_w = m_w*zoom;
+  const size_t zoom_crop_h = m_crop_h*zoom;
+  const size_t zoom_crop_w = m_crop_w*zoom;
+  const size_t dx = transform::get_uniform_random_int(
+    0, 2*(zoom*m_w - zoom_crop_w) + 1);
+  const size_t dy = transform::get_uniform_random_int(
+    0, 2*(zoom*m_h - zoom_crop_h) + 1);
+  const size_t x = (dims[2] - zoom_w + dx + 1) / 2;
+  const size_t y = (dims[1] - zoom_h + dy + 1) / 2;
+  // Sanity check.
+  if (x >= static_cast<size_t>(src.cols) ||
+      y >= static_cast<size_t>(src.rows) ||
+      (x + zoom_crop_w) > static_cast<size_t>(src.cols) ||
+      (y + zoom_crop_h) > static_cast<size_t>(src.rows)) {
+    std::stringstream ss;
+    ss << "Bad crop dimensions for " << src.rows << "x" << src.cols << ": "
+       << zoom_crop_h << "x" << zoom_crop_w << " at (" << x << "," << y << ")";
+    LBANN_ERROR(ss.str());
+  }
+  // The crop is just a view.
+  cv::Mat tmp = src(cv::Rect(x, y, zoom_crop_w, zoom_crop_h));
+  cv::resize(tmp, dst, dst.size(), 0, 0, cv::INTER_LINEAR);
+  data.emplace<uint8_t>(std::move(dst_real));
+  dims = new_dims;
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/resize.cpp b/src/transforms/vision/resize.cpp
new file mode 100644
index 00000000000..69bf3facdac
--- /dev/null
+++ b/src/transforms/vision/resize.cpp
@@ -0,0 +1,45 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <opencv2/imgproc.hpp>
+#include "lbann/transforms/vision/resize.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void resize::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  cv::Mat src = utils::get_opencv_mat(data, dims);
+  std::vector<size_t> new_dims = {dims[0], m_h, m_w};
+  auto dst_real = El::Matrix<uint8_t>(utils::get_linearized_size(new_dims), 1);
+  cv::Mat dst = utils::get_opencv_mat(dst_real, new_dims);
+  cv::resize(src, dst, dst.size(), 0, 0, cv::INTER_LINEAR);
+  data.emplace<uint8_t>(std::move(dst_real));
+  dims = new_dims;
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/resized_center_crop.cpp b/src/transforms/vision/resized_center_crop.cpp
new file mode 100644
index 00000000000..3aa370c9361
--- /dev/null
+++ b/src/transforms/vision/resized_center_crop.cpp
@@ -0,0 +1,67 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <opencv2/imgproc.hpp>
+#include "lbann/transforms/vision/resized_center_crop.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void resized_center_crop::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  cv::Mat src = utils::get_opencv_mat(data, dims);
+  std::vector<size_t> new_dims = {dims[0], m_crop_h, m_crop_w};
+  auto dst_real = El::Matrix<uint8_t>(utils::get_linearized_size(new_dims), 1);
+  cv::Mat dst = utils::get_opencv_mat(dst_real, new_dims);
+  // This computes the projected crop area in the original image, crops it,
+  // then resizes it.
+  // Thus, we resize a smaller image, which is faster.
+  // Method due to @JaeseungYeom.
+  const float zoom = std::min(float(src.rows) / float(m_h),
+                              float(src.cols) / float(m_w));
+  const size_t zoom_h = m_crop_h*zoom;
+  const size_t zoom_w = m_crop_w*zoom;
+  const size_t x = std::round(float(src.cols - zoom_w) / 2.0f);
+  const size_t y = std::round(float(src.rows - zoom_h) / 2.0f);
+  // Sanity check.
+  if (x >= static_cast<size_t>(src.cols) ||
+      y >= static_cast<size_t>(src.rows) ||
+      (x + zoom_w) > static_cast<size_t>(src.cols) ||
+      (y + zoom_h) > static_cast<size_t>(src.rows)) {
+    std::stringstream ss;
+    ss << "Bad crop dimensions for " << src.rows << "x" << src.cols << ": "
+       << zoom_h << "x" << zoom_w << " at (" << x << "," << y << ")";
+    LBANN_ERROR(ss.str());
+  }
+  // The crop is just a view.
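+  // Worked example with hypothetical sizes: for a 512x512 source, a 256x256
+  // resize target, and a 224x224 crop, zoom = 2, so a 448x448 center region
+  // is cropped and resized once, directly to 224x224.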
+  cv::Mat tmp = src(cv::Rect(x, y, zoom_w, zoom_h));
+  cv::resize(tmp, dst, dst.size(), 0, 0, cv::INTER_LINEAR);
+  data.emplace<uint8_t>(std::move(dst_real));
+  dims = new_dims;
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/to_lbann_layout.cpp b/src/transforms/vision/to_lbann_layout.cpp
new file mode 100644
index 00000000000..49f17a7e7d5
--- /dev/null
+++ b/src/transforms/vision/to_lbann_layout.cpp
@@ -0,0 +1,80 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <lbann-dev@llnl.gov>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/transforms/vision/to_lbann_layout.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void to_lbann_layout::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  auto dst = CPUMat(utils::get_linearized_size(dims), 1);
+  apply(data, dst, dims);
+  data.emplace<DataType>(std::move(dst));
+}
+
+void to_lbann_layout::apply(utils::type_erased_matrix& data, CPUMat& out,
+                            std::vector<size_t>& dims) {
+  cv::Mat src = utils::get_opencv_mat(data, dims);
+  if (!src.isContinuous()) {
+    // This should not occur, but just in case.
+    LBANN_ERROR("Do not support non-contiguous OpenCV matrices.");
+  }
+  if (!out.Contiguous()) {
+    LBANN_ERROR("ToLBANNLayout does not support non-contiguous destination.");
+  }
+  const uint8_t* __restrict__ src_buf = src.ptr();
+  const size_t out_size = utils::get_linearized_size(dims);
+  if (static_cast<size_t>(out.Height() * out.Width()) != out_size) {
+    LBANN_ERROR("Transform output does not have sufficient space.");
+  }
+  DataType* __restrict__ dst_buf = out.Buffer();
+  const float scale = 1.0f / 255.0f;
+  if (dims[0] == 1) {
+    // Greyscale.
+    for (size_t row = 0; row < dims[1]; ++row) {
+      for (size_t col = 0; col < dims[2]; ++col) {
+        dst_buf[row + col*dims[1]] = src_buf[row*dims[2] + col] * scale;
+      }
+    }
+  } else {
+    // RGB/three-channel.
+    const size_t size = dims[1] * dims[2];
+    for (size_t row = 0; row < dims[1]; ++row) {
+      for (size_t col = 0; col < dims[2]; ++col) {
+        // Multiply by 3 because there are three channels.
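+        // Pixel (row, col) of the interleaved source starts at
+        // 3*(row*W + col); its three channel values land H*W entries
+        // apart in the channel-major output (size = H*W here).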
+        const size_t src_base = 3*(row*dims[2] + col);
+        const size_t dst_base = row + col*dims[1];
+        dst_buf[dst_base] = src_buf[src_base] * scale;
+        dst_buf[dst_base + size] = src_buf[src_base + 1] * scale;
+        dst_buf[dst_base + 2*size] = src_buf[src_base + 2] * scale;
+      }
+    }
+  }
+}
+
+} // namespace transform
+} // namespace lbann
diff --git a/src/transforms/vision/unit_test/CMakeLists.txt b/src/transforms/vision/unit_test/CMakeLists.txt
new file mode 100644
index 00000000000..b2e4c84cd4a
--- /dev/null
+++ b/src/transforms/vision/unit_test/CMakeLists.txt
@@ -0,0 +1,18 @@
+set_full_path(_DIR_LBANN_CATCH2_TEST_FILES
+  center_crop_test.cpp
+  colorize_test.cpp
+  grayscale_test.cpp
+  horizontal_flip_test.cpp
+  random_affine_test.cpp
+  random_crop_test.cpp
+  random_resized_crop_test.cpp
+  random_resized_crop_with_fixed_aspect_ratio_test.cpp
+  resize_test.cpp
+  resized_center_crop_test.cpp
+  to_lbann_layout_test.cpp
+  transform_pipeline_test.cpp
+  vertical_flip_test.cpp
+  )
+
+set(LBANN_CATCH2_TEST_FILES
+  "${LBANN_CATCH2_TEST_FILES}" "${_DIR_LBANN_CATCH2_TEST_FILES}" PARENT_SCOPE)
diff --git a/src/transforms/vision/unit_test/center_crop_test.cpp b/src/transforms/vision/unit_test/center_crop_test.cpp
new file mode 100644
index 00000000000..95dcdad1840
--- /dev/null
+++ b/src/transforms/vision/unit_test/center_crop_test.cpp
@@ -0,0 +1,76 @@
+// MUST include this
+#include <catch2/catch.hpp>
+
+// File being tested
+#include <lbann/transforms/vision/center_crop.hpp>
+#include "helper.hpp"
+
+TEST_CASE("Testing center crop preprocessing", "[preproc]") {
+  lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix<uint8_t>());
+
+  SECTION("matrix with one channel") {
+    zeros(mat.template get<uint8_t>(), 5, 5, 1);
+    apply_elementwise(mat.template get<uint8_t>(), 5, 5, 1,
+                      [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+                        if (row >= 1 && row <= 3 && col >= 1 && col <= 3) {
+                          x = 1;
+                        }
+                      });
+    std::vector<size_t> dims = {1, 5, 5};
+    auto cropper = lbann::transform::center_crop(3, 3);
+
+    SECTION("applying the crop") {
+      REQUIRE_NOTHROW(cropper.apply(mat, dims));
+
+      SECTION("cropping changes dims correctly") {
+        REQUIRE(dims[0] == 1);
+        REQUIRE(dims[1] == 3);
+        REQUIRE(dims[2] == 3);
+      }
+      SECTION("cropping does not change matrix type") {
+        REQUIRE_NOTHROW(mat.template get<uint8_t>());
+      }
+      SECTION("cropping produces correct values") {
+        auto& real_mat = mat.template get<uint8_t>();
+        apply_elementwise(
+          real_mat, 3, 3, 1,
+          [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+            REQUIRE(x == 1);
+          });
+      }
+    }
+  }
+
+  SECTION("matrix with three channels") {
+    zeros(mat.template get<uint8_t>(), 5, 5, 3);
+    apply_elementwise(mat.template get<uint8_t>(), 5, 5, 3,
+                      [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+                        if (row >= 1 && row <= 3 && col >= 1 && col <= 3) {
+                          x = 1;
+                        }
+                      });
+    std::vector<size_t> dims = {3, 5, 5};
+    auto cropper = lbann::transform::center_crop(3, 3);
+
+    SECTION("applying the crop") {
+      REQUIRE_NOTHROW(cropper.apply(mat, dims));
+
+      SECTION("cropping changes dims correctly") {
+        REQUIRE(dims[0] == 3);
+        REQUIRE(dims[1] == 3);
+        REQUIRE(dims[2] == 3);
+      }
+      SECTION("cropping does not change matrix type") {
+        REQUIRE_NOTHROW(mat.template get<uint8_t>());
+      }
+      SECTION("cropping produces correct values") {
+        auto& real_mat = mat.template get<uint8_t>();
+        apply_elementwise(
+          real_mat, 3, 3, 3,
+          [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+            REQUIRE(x == 1);
+          });
+      }
+    }
+  }
+}
diff --git a/src/transforms/vision/unit_test/colorize_test.cpp b/src/transforms/vision/unit_test/colorize_test.cpp
new file mode 100644
index 00000000000..570ba6a630f
--- /dev/null
+++ b/src/transforms/vision/unit_test/colorize_test.cpp
@@ -0,0 +1,66 @@
+// MUST include this
+#include <catch2/catch.hpp>
+
+// File being tested
+#include <lbann/transforms/vision/colorize.hpp>
+#include "helper.hpp"
+
+TEST_CASE("Testing colorize preprocessing", "[preproc]") {
+  lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix<uint8_t>());
+
+  SECTION("matrix with one channel") {
+    identity(mat.template get<uint8_t>(), 3, 3, 1);
+    std::vector<size_t> dims = {1, 3, 3};
+    auto gs = lbann::transform::colorize();
+
+    SECTION("applying colorize") {
+      REQUIRE_NOTHROW(gs.apply(mat, dims));
+
+      SECTION("colorize changes dims correctly") {
+        REQUIRE(dims[0] == 3);
+        REQUIRE(dims[1] == 3);
+        REQUIRE(dims[2] == 3);
+      }
+      SECTION("colorize does not change matrix type") {
+        REQUIRE_NOTHROW(mat.template get<uint8_t>());
+      }
+      SECTION("colorize does not change values") {
+        auto& real_mat = mat.template get<uint8_t>();
+        apply_elementwise(
+          real_mat, 3, 3, 3,
+          [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+            if (row == col) { REQUIRE(x == 1); }
+            else { REQUIRE(x == 0); }
+          });
+      }
+    }
+  }
+
+  SECTION("matrix with three channels") {
+    identity(mat.template get<uint8_t>(), 3, 3, 3);
+    std::vector<size_t> dims = {3, 3, 3};
+    auto gs = lbann::transform::colorize();
+
+    SECTION("applying colorize") {
+      REQUIRE_NOTHROW(gs.apply(mat, dims));
+
+      SECTION("colorize does not change dims") {
+        REQUIRE(dims[0] == 3);
+        REQUIRE(dims[1] == 3);
+        REQUIRE(dims[2] == 3);
+      }
+      SECTION("colorize does not change matrix type") {
+        REQUIRE_NOTHROW(mat.template get<uint8_t>());
+      }
+      SECTION("colorize produces correct values") {
+        auto& real_mat = mat.template get<uint8_t>();
+        apply_elementwise(
+          real_mat, 3, 3, 3,
+          [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+            if (row == col) { REQUIRE(x == 1); }
+            else { REQUIRE(x == 0); }
+          });
+      }
+    }
+  }
+}
diff --git a/src/transforms/vision/unit_test/grayscale_test.cpp b/src/transforms/vision/unit_test/grayscale_test.cpp
new file mode 100644
index 00000000000..0c1248c5abb
--- /dev/null
+++ b/src/transforms/vision/unit_test/grayscale_test.cpp
@@ -0,0 +1,66 @@
+// MUST include this
+#include <catch2/catch.hpp>
+
+// File being tested
+#include <lbann/transforms/vision/grayscale.hpp>
+#include "helper.hpp"
+
+TEST_CASE("Testing grayscale preprocessing", "[preproc]") {
+  lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix<uint8_t>());
+
+  SECTION("matrix with one channel") {
+    identity(mat.template get<uint8_t>(), 3, 3, 1);
+    std::vector<size_t> dims = {1, 3, 3};
+    auto gs = lbann::transform::grayscale();
+
+    SECTION("applying grayscale") {
+      REQUIRE_NOTHROW(gs.apply(mat, dims));
+
+      SECTION("grayscale does not change dims") {
+        REQUIRE(dims[0] == 1);
+        REQUIRE(dims[1] == 3);
+        REQUIRE(dims[2] == 3);
+      }
+      SECTION("grayscale does not change matrix type") {
+        REQUIRE_NOTHROW(mat.template get<uint8_t>());
+      }
+      SECTION("grayscale does not change values") {
+        auto& real_mat = mat.template get<uint8_t>();
+        apply_elementwise(
+          real_mat, 3, 3, 1,
+          [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+            if (row == col) { REQUIRE(x == 1); }
+            else { REQUIRE(x == 0); }
+          });
+      }
+    }
+  }
+
+  SECTION("matrix with three channels") {
+    identity(mat.template get<uint8_t>(), 3, 3, 3);
+    std::vector<size_t> dims = {3, 3, 3};
+    auto gs = lbann::transform::grayscale();
+
+    SECTION("applying grayscale") {
+      REQUIRE_NOTHROW(gs.apply(mat, dims));
+
+      SECTION("grayscale changes dims correctly") {
+        REQUIRE(dims[0] == 1);
+        REQUIRE(dims[1] == 3);
+        REQUIRE(dims[2] == 3);
+      }
+      SECTION("grayscale does not change matrix type") {
+        REQUIRE_NOTHROW(mat.template get<uint8_t>());
+      }
+      SECTION("grayscale produces correct values") {
+        auto& real_mat = mat.template get<uint8_t>();
+        apply_elementwise(
+          real_mat, 3, 3, 1,
+          [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+            if (row == col) { REQUIRE(x == 1); }
+            else { REQUIRE(x == 0); }
+          });
+      }
+    }
+  }
+}
diff --git a/src/transforms/vision/unit_test/helper.hpp b/src/transforms/vision/unit_test/helper.hpp
new file mode 100644
index 00000000000..4512489e198
--- /dev/null
+++ b/src/transforms/vision/unit_test/helper.hpp
@@ -0,0 +1,58 @@
+#ifndef LBANN_TRANSFORMS_VISION_UNIT_TEST_HELPER
+#define LBANN_TRANSFORMS_VISION_UNIT_TEST_HELPER
+
+inline void apply_elementwise(
+  El::Matrix<uint8_t>& mat, El::Int height, El::Int width, El::Int channels,
+  std::function<void(uint8_t&, El::Int, El::Int, El::Int)> f) {
+  uint8_t* buf = mat.Buffer();
+  for (El::Int channel = 0; channel < channels; ++channel) {
+    for (El::Int col = 0; col < width; ++col) {
+      for (El::Int row = 0; row < height; ++row) {
+        f(buf[channels*(col+row*width) + channel], row, col, channel);
+      }
+    }
+  }
+}
+
+inline void identity(El::Matrix<uint8_t>& mat, El::Int height, El::Int width,
+                     El::Int channels = 1) {
+  mat.Resize(height*width*channels, 1);
+  apply_elementwise(mat, height, width, channels,
+                    [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+                      x = (row == col) ? 1 : 0;
+                    });
+}
+
+inline void zeros(El::Matrix<uint8_t>& mat, El::Int height, El::Int width,
+                  El::Int channels = 1) {
+  mat.Resize(height*width*channels, 1);
+  uint8_t* buf = mat.Buffer();
+  for (El::Int i = 0; i < height*width*channels; ++i) {
+    buf[i] = 0;
+  }
+}
+
+inline void ones(El::Matrix<uint8_t>& mat, El::Int height, El::Int width,
+                 El::Int channels = 1) {
+  mat.Resize(height*width*channels, 1);
+  uint8_t* buf = mat.Buffer();
+  for (El::Int i = 0; i < height*width*channels; ++i) {
+    buf[i] = 1;
+  }
+}
+
+inline void print(const El::Matrix<uint8_t>& mat, El::Int height, El::Int width,
+                  El::Int channels = 1) {
+  const uint8_t* buf = mat.LockedBuffer();
+  for (El::Int channel = 0; channel < channels; ++channel) {
+    for (El::Int col = 0; col < width; ++col) {
+      for (El::Int row = 0; row < height; ++row) {
+        std::cout << ((int) buf[channels*(col+row*width) + channel]) << " ";
+      }
+      std::cout << std::endl;
+    }
+    std::cout << "--" << std::endl;
+  }
+}
+
+#endif  // LBANN_TRANSFORMS_VISION_UNIT_TEST_HELPER
diff --git a/src/transforms/vision/unit_test/horizontal_flip_test.cpp b/src/transforms/vision/unit_test/horizontal_flip_test.cpp
new file mode 100644
index 00000000000..72f4e057ab9
--- /dev/null
+++ b/src/transforms/vision/unit_test/horizontal_flip_test.cpp
@@ -0,0 +1,80 @@
+// MUST include this
+#include <catch2/catch.hpp>
+
+// File being tested
+#include <lbann/transforms/vision/horizontal_flip.hpp>
+#include "helper.hpp"
+
+TEST_CASE("Testing horizontal flip preprocessing", "[preproc]") {
+  lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix<uint8_t>());
+
+  SECTION("matrix with one channel") {
+    zeros(mat.template get<uint8_t>(), 3, 3, 1);
+    apply_elementwise(mat.template get<uint8_t>(), 3, 3, 1,
+                      [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+                        if (col == 0) { x = 1; }
+                      });
+    std::vector<size_t> dims = {1, 3, 3};
+    auto flipper = lbann::transform::horizontal_flip(1.0);
+
+    SECTION("applying the flip") {
+      REQUIRE_NOTHROW(flipper.apply(mat, dims));
+
+      SECTION("flipping does not change dims") {
+        REQUIRE(dims[0] == 1);
+        REQUIRE(dims[1] == 3);
+        REQUIRE(dims[2] == 3);
+      }
+      SECTION("flipping does not change matrix type") {
+        REQUIRE_NOTHROW(mat.template get<uint8_t>());
+      }
+      SECTION("flipping produces correct values") {
+        auto& real_mat = mat.template get<uint8_t>();
+        apply_elementwise(
+          real_mat, 3, 3, 1,
+          [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+            if (col == 2) {
+              REQUIRE(x == 1);
+            } else {
+              REQUIRE(x == 0);
+            }
+          });
+      }
+    }
+  }
+
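+  // Same scenario with three channels; p = 1.0 makes the flip fire
+  // deterministically, so column 0 must end up in column 2 in every channel.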
SECTION("matrix with three channels") { + zeros(mat.template get(), 3, 3, 3); + apply_elementwise(mat.template get(), 3, 3, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + if (col == 0) { x = 1; } + }); + std::vector dims = {3, 3, 3}; + auto flipper = lbann::transform::horizontal_flip(1.0); + + SECTION("applying the flip") { + REQUIRE_NOTHROW(flipper.apply(mat, dims)); + + SECTION("flipping does not change dims") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 3); + REQUIRE(dims[2] == 3); + } + SECTION("flipping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("flipping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 3, 3, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + if (col == 2) { + REQUIRE(x == 1); + } else { + REQUIRE(x == 0); + } + }); + } + } + } +} diff --git a/src/transforms/vision/unit_test/random_affine_test.cpp b/src/transforms/vision/unit_test/random_affine_test.cpp new file mode 100644 index 00000000000..bb27df86b6a --- /dev/null +++ b/src/transforms/vision/unit_test/random_affine_test.cpp @@ -0,0 +1,100 @@ +// MUST include this +#include + +// File being tested +#include +#include "helper.hpp" + +// Note: This is *random* so we only do basic checks. +TEST_CASE("Testing random affine preprocessing", "[preproc]") { + lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix()); + // For simplicity, we'll only use a 3-channel matrix here. + identity(mat.template get(), 10, 10, 3); + std::vector dims = {3, 10, 10}; + + SECTION("rotation") { + auto affiner = lbann::transform::random_affine(0.0, 90.0, 0, 0, 0, 0, 0, 0); + + SECTION("applying the transform") { + REQUIRE_NOTHROW(affiner.apply(mat, dims)); + + SECTION("transform does not change dims") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 10); + REQUIRE(dims[2] == 10); + } + SECTION("transform does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + } + } + + SECTION("translate") { + auto affiner = lbann::transform::random_affine(0, 0, 0.1, 0.1, 0, 0, 0, 0); + + SECTION("applying the transform") { + REQUIRE_NOTHROW(affiner.apply(mat, dims)); + + SECTION("transform does not change dims") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 10); + REQUIRE(dims[2] == 10); + } + SECTION("transform does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + } + } + + SECTION("scale") { + auto affiner = lbann::transform::random_affine(0, 0, 0, 0, 0.0, 2.0, 0, 0); + + SECTION("applying the transform") { + REQUIRE_NOTHROW(affiner.apply(mat, dims)); + + SECTION("transform does not change dims") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 10); + REQUIRE(dims[2] == 10); + } + SECTION("transform does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + } + } + + SECTION("shear") { + auto affiner = lbann::transform::random_affine(0, 0, 0, 0, 0, 0, 0.0, 45.0); + + SECTION("applying the transform") { + REQUIRE_NOTHROW(affiner.apply(mat, dims)); + + SECTION("transform does not change dims") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 10); + REQUIRE(dims[2] == 10); + } + SECTION("transform does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + } + } + + SECTION("all") { + auto affiner = lbann::transform::random_affine( + 0.0, 90.0, 0.1, 0.1, 0.0, 2.0, 0.0, 45.0); + + SECTION("applying the transform") { + REQUIRE_NOTHROW(affiner.apply(mat, dims)); + + SECTION("transform does not change dims") { + REQUIRE(dims[0] == 3); + 
REQUIRE(dims[1] == 10); + REQUIRE(dims[2] == 10); + } + SECTION("transform does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + } + } +} diff --git a/src/transforms/vision/unit_test/random_crop_test.cpp b/src/transforms/vision/unit_test/random_crop_test.cpp new file mode 100644 index 00000000000..8cd043a8de3 --- /dev/null +++ b/src/transforms/vision/unit_test/random_crop_test.cpp @@ -0,0 +1,64 @@ +// MUST include this +#include + +// File being tested +#include +#include "helper.hpp" + +TEST_CASE("Testing random crop preprocessing", "[preproc]") { + lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix()); + + SECTION("matrix with one channel") { + ones(mat.template get(), 5, 5, 1); + std::vector dims = {1, 5, 5}; + auto cropper = lbann::transform::random_crop(3, 3); + + SECTION("applying the crop") { + REQUIRE_NOTHROW(cropper.apply(mat, dims)); + + SECTION("cropping changes dims correctly") { + REQUIRE(dims[0] == 1); + REQUIRE(dims[1] == 3); + REQUIRE(dims[2] == 3); + } + SECTION("cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 3, 3, 1, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + } + } + + SECTION("matrix with three channels") { + ones(mat.template get(), 5, 5, 3); + std::vector dims = {3, 5, 5}; + auto cropper = lbann::transform::random_crop(3, 3); + + SECTION("applying the crop") { + REQUIRE_NOTHROW(cropper.apply(mat, dims)); + + SECTION("cropping changes dims correctly") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 3); + REQUIRE(dims[2] == 3); + } + SECTION("cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 3, 3, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + } + } +} diff --git a/src/transforms/vision/unit_test/random_resized_crop_test.cpp b/src/transforms/vision/unit_test/random_resized_crop_test.cpp new file mode 100644 index 00000000000..0e1e07796b6 --- /dev/null +++ b/src/transforms/vision/unit_test/random_resized_crop_test.cpp @@ -0,0 +1,118 @@ +// MUST include this +#include + +// File being tested +#include +#include "helper.hpp" + +TEST_CASE("Testing random resized crop preprocessing", "[preproc]") { + lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix()); + + SECTION("matrix with one channel") { + ones(mat.template get(), 5, 5, 1); + std::vector dims = {1, 5, 5}; + + SECTION("resizing larger and cropping") { + auto resize_cropper = lbann::transform::random_resized_crop(3, 3); + + SECTION("applying the resize/crop") { + REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 1); + REQUIRE(dims[1] == 3); + REQUIRE(dims[2] == 3); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 3, 3, 1, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + } + } + SECTION("resizing smaller and cropping") { + auto resize_cropper = lbann::transform::random_resized_crop(1, 1); + + SECTION("applying the resize/crop") { + 
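      // A note on why an all-ones input keeps this test deterministic:
+      // if random_resized_crop follows the usual RandomResizedCrop recipe
+      // (sample a random area fraction and aspect ratio, crop, then resize
+      // to the requested 3x3 -- an assumption; the implementation is not
+      // part of this patch), any crop of a constant image resizes back to
+      // the same constant, so the checks below do not depend on the RNG.
+      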
REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 1); + REQUIRE(dims[1] == 1); + REQUIRE(dims[2] == 1); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 1, 1, 1, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + } + } + } + + SECTION("matrix with three channels") { + ones(mat.template get(), 5, 5, 3); + std::vector dims = {3, 5, 5}; + + SECTION("resizing larger and cropping") { + auto resize_cropper = lbann::transform::random_resized_crop(3, 3); + + SECTION("applying the resize/crop") { + REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 3); + REQUIRE(dims[2] == 3); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 3, 3, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + } + } + SECTION("resizing smaller and cropping") { + auto resize_cropper = lbann::transform::random_resized_crop(1, 1); + + SECTION("applying the resize/crop") { + REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 1); + REQUIRE(dims[2] == 1); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 1, 1, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + } + } + } +} diff --git a/src/transforms/vision/unit_test/random_resized_crop_with_fixed_aspect_ratio_test.cpp b/src/transforms/vision/unit_test/random_resized_crop_with_fixed_aspect_ratio_test.cpp new file mode 100644 index 00000000000..16d50b4aa70 --- /dev/null +++ b/src/transforms/vision/unit_test/random_resized_crop_with_fixed_aspect_ratio_test.cpp @@ -0,0 +1,188 @@ +// MUST include this +#include + +// File being tested +#include +#include +#include +#include "helper.hpp" + +TEST_CASE("Testing random resized crop with fixed aspect ratio preprocessing", "[preproc]") { + lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix()); + + SECTION("matrix with one channel") { + ones(mat.template get(), 5, 5, 1); + std::vector dims = {1, 5, 5}; + + SECTION("resizing larger and cropping") { + auto resize_cropper = lbann::transform::random_resized_crop_with_fixed_aspect_ratio(7, 7, 3, 3); + + SECTION("applying the resize/crop") { + REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 1); + REQUIRE(dims[1] == 3); + REQUIRE(dims[2] == 3); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 3, 3, 1, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + + SECTION("compare with resize then crop") { + lbann::utils::type_erased_matrix mat2 
= + lbann::utils::type_erased_matrix(El::Matrix()); + ones(mat2.template get(), 5, 5, 1); + std::vector dims2 = {1, 5, 5}; + auto resizer = lbann::transform::resize(7, 7); + auto cropper = lbann::transform::random_crop(3, 3); + REQUIRE_NOTHROW(resizer.apply(mat2, dims2)); + REQUIRE_NOTHROW(cropper.apply(mat2, dims2)); + REQUIRE(dims == dims2); + const uint8_t* buf = mat.template get().LockedBuffer(); + const uint8_t* buf2 = mat2.template get().LockedBuffer(); + for (size_t i = 0; i < dims2[1]*dims2[2]; ++i) { + REQUIRE(buf[i] == buf2[i]); + } + } + } + } + SECTION("resizing smaller and cropping") { + auto resize_cropper = lbann::transform::random_resized_crop_with_fixed_aspect_ratio(3, 3, 1, 1); + + SECTION("applying the resize/crop") { + REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 1); + REQUIRE(dims[1] == 1); + REQUIRE(dims[2] == 1); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 1, 1, 1, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + + SECTION("compare with resize then crop") { + lbann::utils::type_erased_matrix mat2 = + lbann::utils::type_erased_matrix(El::Matrix()); + ones(mat2.template get(), 5, 5, 1); + std::vector dims2 = {1, 5, 5}; + auto resizer = lbann::transform::resize(3, 3); + auto cropper = lbann::transform::random_crop(1, 1); + REQUIRE_NOTHROW(resizer.apply(mat2, dims2)); + REQUIRE_NOTHROW(cropper.apply(mat2, dims2)); + REQUIRE(dims == dims2); + const uint8_t* buf = mat.template get().LockedBuffer(); + const uint8_t* buf2 = mat2.template get().LockedBuffer(); + for (size_t i = 0; i < dims2[1]*dims2[2]; ++i) { + REQUIRE(buf[i] == buf2[i]); + } + } + } + } + } + + SECTION("matrix with three channels") { + ones(mat.template get(), 5, 5, 3); + std::vector dims = {3, 5, 5}; + + SECTION("resizing larger and cropping") { + auto resize_cropper = lbann::transform::random_resized_crop_with_fixed_aspect_ratio(7, 7, 3, 3); + + SECTION("applying the resize/crop") { + REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 3); + REQUIRE(dims[2] == 3); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 3, 3, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + + SECTION("compare with resize then crop") { + lbann::utils::type_erased_matrix mat2 = + lbann::utils::type_erased_matrix(El::Matrix()); + ones(mat2.template get(), 5, 5, 3); + std::vector dims2 = {3, 5, 5}; + auto resizer = lbann::transform::resize(7, 7); + auto cropper = lbann::transform::random_crop(3, 3); + REQUIRE_NOTHROW(resizer.apply(mat2, dims2)); + REQUIRE_NOTHROW(cropper.apply(mat2, dims2)); + REQUIRE(dims == dims2); + const uint8_t* buf = mat.template get().LockedBuffer(); + const uint8_t* buf2 = mat2.template get().LockedBuffer(); + for (size_t i = 0; i < dims2[1]*dims2[2]; ++i) { + REQUIRE(buf[i] == buf2[i]); + } + } + } + } + SECTION("resizing smaller and cropping") { + auto resize_cropper = lbann::transform::random_resized_crop_with_fixed_aspect_ratio(3, 3, 1, 1); + + SECTION("applying the 
resize/crop") { + REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 1); + REQUIRE(dims[2] == 1); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 1, 1, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + + SECTION("compare with resize then crop") { + lbann::utils::type_erased_matrix mat2 = + lbann::utils::type_erased_matrix(El::Matrix()); + ones(mat2.template get(), 5, 5, 3); + std::vector dims2 = {3, 5, 5}; + auto resizer = lbann::transform::resize(3, 3); + auto cropper = lbann::transform::random_crop(1, 1); + REQUIRE_NOTHROW(resizer.apply(mat2, dims2)); + REQUIRE_NOTHROW(cropper.apply(mat2, dims2)); + REQUIRE(dims == dims2); + const uint8_t* buf = mat.template get().LockedBuffer(); + const uint8_t* buf2 = mat2.template get().LockedBuffer(); + for (size_t i = 0; i < dims2[1]*dims2[2]; ++i) { + REQUIRE(buf[i] == buf2[i]); + } + } + } + } + } +} diff --git a/src/transforms/vision/unit_test/resize_test.cpp b/src/transforms/vision/unit_test/resize_test.cpp new file mode 100644 index 00000000000..16c4bd48fc0 --- /dev/null +++ b/src/transforms/vision/unit_test/resize_test.cpp @@ -0,0 +1,118 @@ +// MUST include this +#include + +// File being tested +#include +#include "helper.hpp" + +TEST_CASE("Testing resize preprocessing", "[preproc]") { + lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix()); + + SECTION("matrix with one channel") { + ones(mat.template get(), 3, 3, 1); + std::vector dims = {1, 3, 3}; + + SECTION("resizing larger") { + auto resizer = lbann::transform::resize(5, 5); + + SECTION("applying the resize") { + REQUIRE_NOTHROW(resizer.apply(mat, dims)); + + SECTION("resizing changes dims correctly") { + REQUIRE(dims[0] == 1); + REQUIRE(dims[1] == 5); + REQUIRE(dims[2] == 5); + } + SECTION("resizing does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 5, 5, 1, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + } + } + SECTION("resizing smaller") { + auto resizer = lbann::transform::resize(2, 2); + + SECTION("applying the resize") { + REQUIRE_NOTHROW(resizer.apply(mat, dims)); + + SECTION("resizing changes dims correctly") { + REQUIRE(dims[0] == 1); + REQUIRE(dims[1] == 2); + REQUIRE(dims[2] == 2); + } + SECTION("resizing does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 2, 2, 1, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + } + } + } + + SECTION("matrix with three channels") { + ones(mat.template get(), 3, 3, 3); + std::vector dims = {3, 3, 3}; + + SECTION("resizing larger") { + auto resizer = lbann::transform::resize(5, 5); + + SECTION("applying the resize") { + REQUIRE_NOTHROW(resizer.apply(mat, dims)); + + SECTION("resizing changes dims correctly") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 5); + REQUIRE(dims[2] == 5); + } + SECTION("resizing does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing produces correct values") { + auto& 
real_mat = mat.template get(); + apply_elementwise( + real_mat, 5, 5, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + } + } + SECTION("resizing smaller") { + auto resizer = lbann::transform::resize(2, 2); + + SECTION("applying the resize") { + REQUIRE_NOTHROW(resizer.apply(mat, dims)); + + SECTION("resizing changes dims correctly") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 2); + REQUIRE(dims[2] == 2); + } + SECTION("resizing does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 2, 2, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + } + } + } +} diff --git a/src/transforms/vision/unit_test/resized_center_crop_test.cpp b/src/transforms/vision/unit_test/resized_center_crop_test.cpp new file mode 100644 index 00000000000..8ed8a3b7cfe --- /dev/null +++ b/src/transforms/vision/unit_test/resized_center_crop_test.cpp @@ -0,0 +1,188 @@ +// MUST include this +#include + +// File being tested +#include +#include +#include +#include "helper.hpp" + +TEST_CASE("Testing resized center crop preprocessing", "[preproc]") { + lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix()); + + SECTION("matrix with one channel") { + ones(mat.template get(), 5, 5, 1); + std::vector dims = {1, 5, 5}; + + SECTION("resizing larger and cropping") { + auto resize_cropper = lbann::transform::resized_center_crop(7, 7, 3, 3); + + SECTION("applying the resize/crop") { + REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 1); + REQUIRE(dims[1] == 3); + REQUIRE(dims[2] == 3); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 3, 3, 1, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + + SECTION("compare with resize then crop") { + lbann::utils::type_erased_matrix mat2 = + lbann::utils::type_erased_matrix(El::Matrix()); + ones(mat2.template get(), 5, 5, 1); + std::vector dims2 = {1, 5, 5}; + auto resizer = lbann::transform::resize(7, 7); + auto cropper = lbann::transform::center_crop(3, 3); + REQUIRE_NOTHROW(resizer.apply(mat2, dims2)); + REQUIRE_NOTHROW(cropper.apply(mat2, dims2)); + REQUIRE(dims == dims2); + const uint8_t* buf = mat.template get().LockedBuffer(); + const uint8_t* buf2 = mat2.template get().LockedBuffer(); + for (size_t i = 0; i < dims2[1]*dims2[2]; ++i) { + REQUIRE(buf[i] == buf2[i]); + } + } + } + } + SECTION("resizing smaller and cropping") { + auto resize_cropper = lbann::transform::resized_center_crop(3, 3, 1, 1); + + SECTION("applying the resize/crop") { + REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 1); + REQUIRE(dims[1] == 1); + REQUIRE(dims[2] == 1); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 1, 1, 1, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + + SECTION("compare with resize then crop") { + lbann::utils::type_erased_matrix mat2 = + 
lbann::utils::type_erased_matrix(El::Matrix()); + ones(mat2.template get(), 5, 5, 1); + std::vector dims2 = {1, 5, 5}; + auto resizer = lbann::transform::resize(3, 3); + auto cropper = lbann::transform::center_crop(1, 1); + REQUIRE_NOTHROW(resizer.apply(mat2, dims2)); + REQUIRE_NOTHROW(cropper.apply(mat2, dims2)); + REQUIRE(dims == dims2); + const uint8_t* buf = mat.template get().LockedBuffer(); + const uint8_t* buf2 = mat2.template get().LockedBuffer(); + for (size_t i = 0; i < dims2[1]*dims2[2]; ++i) { + REQUIRE(buf[i] == buf2[i]); + } + } + } + } + } + + SECTION("matrix with three channels") { + ones(mat.template get(), 5, 5, 3); + std::vector dims = {3, 5, 5}; + + SECTION("resizing larger and cropping") { + auto resize_cropper = lbann::transform::resized_center_crop(7, 7, 3, 3); + + SECTION("applying the resize/crop") { + REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 3); + REQUIRE(dims[2] == 3); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 3, 3, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + + SECTION("compare with resize then crop") { + lbann::utils::type_erased_matrix mat2 = + lbann::utils::type_erased_matrix(El::Matrix()); + ones(mat2.template get(), 5, 5, 3); + std::vector dims2 = {3, 5, 5}; + auto resizer = lbann::transform::resize(7, 7); + auto cropper = lbann::transform::center_crop(3, 3); + REQUIRE_NOTHROW(resizer.apply(mat2, dims2)); + REQUIRE_NOTHROW(cropper.apply(mat2, dims2)); + REQUIRE(dims == dims2); + const uint8_t* buf = mat.template get().LockedBuffer(); + const uint8_t* buf2 = mat2.template get().LockedBuffer(); + for (size_t i = 0; i < dims2[1]*dims2[2]; ++i) { + REQUIRE(buf[i] == buf2[i]); + } + } + } + } + SECTION("resizing smaller and cropping") { + auto resize_cropper = lbann::transform::resized_center_crop(3, 3, 1, 1); + + SECTION("applying the resize/crop") { + REQUIRE_NOTHROW(resize_cropper.apply(mat, dims)); + + SECTION("resizing/cropping changes dims correctly") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 1); + REQUIRE(dims[2] == 1); + } + SECTION("resizing/cropping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("resizing/cropping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 1, 1, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + REQUIRE(x == 1); + }); + } + + SECTION("compare with resize then crop") { + lbann::utils::type_erased_matrix mat2 = + lbann::utils::type_erased_matrix(El::Matrix()); + ones(mat2.template get(), 5, 5, 3); + std::vector dims2 = {3, 5, 5}; + auto resizer = lbann::transform::resize(3, 3); + auto cropper = lbann::transform::center_crop(1, 1); + REQUIRE_NOTHROW(resizer.apply(mat2, dims2)); + REQUIRE_NOTHROW(cropper.apply(mat2, dims2)); + REQUIRE(dims == dims2); + const uint8_t* buf = mat.template get().LockedBuffer(); + const uint8_t* buf2 = mat2.template get().LockedBuffer(); + for (size_t i = 0; i < dims2[1]*dims2[2]; ++i) { + REQUIRE(buf[i] == buf2[i]); + } + } + } + } + } +} diff --git a/src/transforms/vision/unit_test/to_lbann_layout_test.cpp b/src/transforms/vision/unit_test/to_lbann_layout_test.cpp new file mode 100644 index 00000000000..a737a0f69ac --- /dev/null +++ 
b/src/transforms/vision/unit_test/to_lbann_layout_test.cpp
@@ -0,0 +1,82 @@
+// MUST include this
+#include <catch2/catch.hpp>
+
+// File being tested
+#include <lbann/transforms/vision/to_lbann_layout.hpp>
+#include "helper.hpp"
+
+TEST_CASE("Testing to LBANN layout", "[preproc]") {
+  lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix<uint8_t>());
+
+  SECTION("matrix with one channel") {
+    zeros(mat.template get<uint8_t>(), 3, 3, 1);
+    apply_elementwise(mat.template get<uint8_t>(), 3, 3, 1,
+                      [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+                        if (row == 0) { x = 1; }
+                      });
+    std::vector<size_t> dims = {1, 3, 3};
+    auto tll = lbann::transform::to_lbann_layout();
+
+    SECTION("converting the matrix") {
+      REQUIRE_NOTHROW(tll.apply(mat, dims));
+
+      SECTION("converting does not change dims") {
+        REQUIRE(dims[0] == 1);
+        REQUIRE(dims[1] == 3);
+        REQUIRE(dims[2] == 3);
+      }
+      SECTION("converting changes matrix type") {
+        REQUIRE_THROWS(mat.template get<uint8_t>());
+        REQUIRE_NOTHROW(mat.template get<lbann::DataType>());
+      }
+      SECTION("converting produces correct values") {
+        auto& real_mat = mat.template get<lbann::DataType>();
+        const lbann::DataType* buf = real_mat.LockedBuffer();
+        for (size_t col = 0; col < 3; ++col) {
+          for (size_t row = 0; row < 3; ++row) {
+            const lbann::DataType val = buf[row + col*3];
+            if (row == 0) { REQUIRE(val == 1.0f / 255.0f); }
+            else { REQUIRE(val == 0.0f); }
+          }
+        }
+      }
+    }
+  }
+
+  SECTION("matrix with three channels") {
+    zeros(mat.template get<uint8_t>(), 3, 3, 3);
+    apply_elementwise(mat.template get<uint8_t>(), 3, 3, 3,
+                      [](uint8_t& x, El::Int row, El::Int col, El::Int) {
+                        if (row == 0) { x = 1; }
+                      });
+    std::vector<size_t> dims = {3, 3, 3};
+    auto tll = lbann::transform::to_lbann_layout();
+
+    SECTION("converting the matrix") {
+      REQUIRE_NOTHROW(tll.apply(mat, dims));
+
+      SECTION("converting does not change dims") {
+        REQUIRE(dims[0] == 3);
+        REQUIRE(dims[1] == 3);
+        REQUIRE(dims[2] == 3);
+      }
+      SECTION("converting changes matrix type") {
+        REQUIRE_THROWS(mat.template get<uint8_t>());
+        REQUIRE_NOTHROW(mat.template get<lbann::DataType>());
+      }
+      SECTION("converting produces correct values") {
+        auto& real_mat = mat.template get<lbann::DataType>();
+        const lbann::DataType* buf = real_mat.LockedBuffer();
+        for (size_t channel = 0; channel < 3; ++channel) {
+          for (size_t col = 0; col < 3; ++col) {
+            for (size_t row = 0; row < 3; ++row) {
+              const lbann::DataType val = buf[3*3*channel + row + col*3];
+              if (row == 0) { REQUIRE(val == 1.0f / 255.0f); }
+              else { REQUIRE(val == 0.0f); }
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/src/transforms/vision/unit_test/transform_pipeline_test.cpp b/src/transforms/vision/unit_test/transform_pipeline_test.cpp
new file mode 100644
index 00000000000..f314904b9c4
--- /dev/null
+++ b/src/transforms/vision/unit_test/transform_pipeline_test.cpp
@@ -0,0 +1,45 @@
+// MUST include this
+#include <catch2/catch.hpp>
+
+// File being tested
+#include <lbann/transforms/transform_pipeline.hpp>
+#include <lbann/transforms/normalize.hpp>
+#include <lbann/transforms/scale.hpp>
+#include <lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp>
+#include <lbann/transforms/vision/to_lbann_layout.hpp>
+#include "helper.hpp"
+
+TEST_CASE("Testing vision transform pipeline", "[preproc]") {
+  lbann::transform::transform_pipeline p;
+  p.add_transform(
+    lbann::make_unique<lbann::transform::random_resized_crop_with_fixed_aspect_ratio>(7, 7, 3, 3));
+  p.add_transform(lbann::make_unique<lbann::transform::to_lbann_layout>());
+  p.add_transform(lbann::make_unique<lbann::transform::scale>(2.0f));
+  p.add_transform(lbann::make_unique<lbann::transform::normalize>(
+    std::vector<float>({0.5f, 0.5f, 0.5f}),
+    std::vector<float>({2.0f, 2.0f, 2.0f})));
+  lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix<uint8_t>());
+  ones(mat.template get<uint8_t>(), 5, 5, 3);
+  std::vector<size_t> dims = {3, 5, 5};
+
+  SECTION("applying the pipeline") {
+    REQUIRE_NOTHROW(p.apply(mat, dims));
+
+    SECTION("pipeline produces correct dims") {
+      REQUIRE(dims[0] == 3);
+      REQUIRE(dims[1] == 3);
+      REQUIRE(dims[2] == 3);
+    }
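+
+    // The expected constant below follows from composing the transforms in
+    // the order they were added: ones() survives the resize and crop,
+    // to_lbann_layout maps 1 -> 1/255, scale(2.0f) gives 2/255, and
+    // normalizing with mean 0.5 and stddev 2.0 computes
+    // (2/255 - 0.5) / 2.0 ~= -0.24607843.
+    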
SECTION("pipeline produces correct type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("pipeline produces correct values") { + auto& real_mat = mat.template get(); + const lbann::DataType* buf = real_mat.LockedBuffer(); + for (size_t i = 0; i < 3*3*3; ++i) { + REQUIRE(buf[i] == Approx(-0.24607843)); + } + } + } +} diff --git a/src/transforms/vision/unit_test/vertical_flip_test.cpp b/src/transforms/vision/unit_test/vertical_flip_test.cpp new file mode 100644 index 00000000000..f77284ccd51 --- /dev/null +++ b/src/transforms/vision/unit_test/vertical_flip_test.cpp @@ -0,0 +1,80 @@ +// MUST include this +#include + +// File being tested +#include +#include "helper.hpp" + +TEST_CASE("Testing vertical flip preprocessing", "[preproc]") { + lbann::utils::type_erased_matrix mat = lbann::utils::type_erased_matrix(El::Matrix()); + + SECTION("matrix with one channel") { + zeros(mat.template get(), 3, 3, 1); + apply_elementwise(mat.template get(), 3, 3, 1, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + if (row == 0) { x = 1; } + }); + std::vector dims = {1, 3, 3}; + auto flipper = lbann::transform::vertical_flip(1.0); + + SECTION("applying the flip") { + REQUIRE_NOTHROW(flipper.apply(mat, dims)); + + SECTION("flipping does not change dims") { + REQUIRE(dims[0] == 1); + REQUIRE(dims[1] == 3); + REQUIRE(dims[2] == 3); + } + SECTION("flipping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("flipping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 3, 3, 1, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + if (row == 2) { + REQUIRE(x == 1); + } else { + REQUIRE(x == 0); + } + }); + } + } + } + + SECTION("matrix with three channels") { + zeros(mat.template get(), 3, 3, 3); + apply_elementwise(mat.template get(), 3, 3, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + if (row == 0) { x = 1; } + }); + std::vector dims = {3, 3, 3}; + auto flipper = lbann::transform::vertical_flip(1.0); + + SECTION("applying the flip") { + REQUIRE_NOTHROW(flipper.apply(mat, dims)); + + SECTION("flipping does not change dims") { + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 3); + REQUIRE(dims[2] == 3); + } + SECTION("flipping does not change matrix type") { + REQUIRE_NOTHROW(mat.template get()); + } + SECTION("flipping produces correct values") { + auto& real_mat = mat.template get(); + apply_elementwise( + real_mat, 3, 3, 3, + [](uint8_t& x, El::Int row, El::Int col, El::Int) { + if (row == 2) { + REQUIRE(x == 1); + } else { + REQUIRE(x == 0); + } + }); + } + } + } +} diff --git a/src/transforms/vision/vertical_flip.cpp b/src/transforms/vision/vertical_flip.cpp new file mode 100644 index 00000000000..f84a789a25c --- /dev/null +++ b/src/transforms/vision/vertical_flip.cpp @@ -0,0 +1,44 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/transforms/vision/vertical_flip.hpp" +#include "lbann/utils/opencv.hpp" + +namespace lbann { +namespace transform { + +void vertical_flip::apply(utils::type_erased_matrix& data, std::vector& dims) { + if (transform::get_bool_random(m_p)) { + cv::Mat src = utils::get_opencv_mat(data, dims); + auto dst_real = El::Matrix(utils::get_linearized_size(dims), 1); + cv::Mat dst = utils::get_opencv_mat(dst_real, dims); + cv::flip(src, dst, 0); + data.emplace(std::move(dst_real)); + } +} + +} // namespace transform +} // namespace lbann diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt index aafbc5741cc..9b19b996dd6 100644 --- a/src/utils/CMakeLists.txt +++ b/src/utils/CMakeLists.txt @@ -8,6 +8,7 @@ set_full_path(THIS_DIR_SOURCES file_utils.cpp graph.cpp im2col.cpp + image.cpp number_theory.cpp omp_diagnostics.cpp options.cpp diff --git a/src/utils/image.cpp b/src/utils/image.cpp new file mode 100644 index 00000000000..f90baae8f3c --- /dev/null +++ b/src/utils/image.cpp @@ -0,0 +1,241 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include "lbann/utils/image.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/opencv.hpp" + +namespace { + +// Read filename into buf. +void read_file_to_buf(const std::string& filename, El::Matrix& buf, + size_t& size) { + FILE* f = fopen(filename.c_str(), "r"); + if (f == nullptr) { + LBANN_ERROR("Could not open file " + filename); + } + // Determine the length. + if (fseeko(f, 0, SEEK_END) != 0) { + LBANN_ERROR("Could not seek to end of file " + filename); + } + off_t size_ = ftello(f); + if (size_ == -1) { + LBANN_ERROR("Could not get offset in file " + filename); + } + size = static_cast(size_); + rewind(f); + // Allocate sufficient space and read. 
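+  // Here buf is an El::Matrix<uint8_t> used as a flat byte buffer (size
+  // rows, one column, contiguous). One portability note: the fopen() above
+  // uses mode "r", which is equivalent to "rb" on POSIX, but platforms that
+  // distinguish text from binary streams would need "rb" to keep the
+  // encoded image bytes intact.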
+  buf.Resize(size, 1);
+  if (fread(buf.Buffer(), 1, size, f) != size) {
+    LBANN_ERROR("Could not read file " + filename);
+  }
+  fclose(f);
+}
+
+// There are other SOFs, but these are the common ones.
+const bool is_jpg_sof[16] = {
+  true, true, true, true, false, true, true, true,
+  false, true, true, true, false, true, true, true};
+
+// Attempt to guess the decoded size of an image.
+// May not return the actual size (and may just return 0), so treat this as a
+// hint.
+void guess_image_size(const El::Matrix<uint8_t>& buf_, size_t size,
+                      size_t& height, size_t& width, size_t& channels) {
+  height = 0;
+  width = 0;
+  channels = 0;
+  const uint8_t* buf = buf_.LockedBuffer();
+  if (size >= 2 &&  // Size
+      buf[0] == 0xFF && buf[1] == 0xD8) {  // Signature
+    // JPEG image.
+    // See: https://en.wikipedia.org/wiki/JPEG#Syntax_and_structure
+    // and https://stackoverflow.com/questions/15800704/get-image-size-without-loading-image-into-memory
+    // and https://github.com/python-pillow/Pillow/blob/master/src/PIL/JpegImagePlugin.py
+    // JPEG is complicated, this will probably not work for every image.
+    // Try to find a start-of-frame marker, and then get the size.
+    for (size_t cur_pos = 2; cur_pos < size;) {
+      uint8_t b = buf[cur_pos];
+      if (b == 0xFF) {
+        if (cur_pos + 1 >= size) { return; }  // Shouldn't happen.
+        uint8_t marker = buf[cur_pos + 1];
+        if (marker >= 0xC0 && marker <= 0xCF && is_jpg_sof[marker - 0xC0]) {
+          // Found the SOF.
+          // 2 for the marker, 2 for the frame header length, 1 for the precision.
+          cur_pos += 5;
+          if (cur_pos + 4 >= size) { return; }  // Shouldn't happen.
+          uint16_t h_w[2];
+          memcpy(h_w, &buf[cur_pos], 4);
+          height = ntohs(h_w[0]);
+          width = ntohs(h_w[1]);
+          channels = 3;  // Assume color.
+          return;
+        } else {
+          cur_pos += 2;
+          if (cur_pos + 2 >= size) { return; }  // Shouldn't happen.
+          // Skip ahead by the length of this segment.
+          uint16_t l;
+          memcpy(&l, &buf[cur_pos], 2);
+          cur_pos += ntohs(l);
+        }
+      } else {
+        // Skip non-0xFFs.
+        cur_pos += 1;
+      }
+    }
+  } else if (size >= 24 &&  // Size
+             // Check signature
+             buf[0] == 0x89 && buf[1] == 0x50 &&
+             buf[2] == 0x4E && buf[3] == 0x47 &&
+             buf[4] == 0x0D && buf[5] == 0x0A &&
+             buf[6] == 0x1A && buf[7] == 0x0A &&
+             // Need IHDR chunk.
+             buf[12] == 'I' && buf[13] == 'H' &&
+             buf[14] == 'D' && buf[15] == 'R') {
+    // PNG image
+    // See: https://en.wikipedia.org/wiki/Portable_Network_Graphics#File_header
+    uint32_t h_w[2];
+    memcpy(h_w, buf + 16, 8);
+    // Convert from network byte order and get size.
+    width = ntohl(h_w[0]);
+    height = ntohl(h_w[1]);
+    channels = 3;  // Assume color.
+  }
+  // Give up.
+}
+
+// Decode an image from a buffer using OpenCV.
+void opencv_decode(El::Matrix<uint8_t>& buf, El::Matrix<uint8_t>& dst,
+                   std::vector<size_t>& dims, const std::string filename) {
+  const size_t encoded_size = buf.Height() * buf.Width();
+  std::vector<size_t> buf_dims = {1, encoded_size, 1};
+  cv::Mat cv_encoded = lbann::utils::get_opencv_mat(buf, buf_dims);
+  // Attempt to guess the decoded size.
+  // Warning: These may be wrong.
+  size_t height, width, channels;
+  guess_image_size(buf, encoded_size, height, width, channels);
+  if (height != 0) {
+    // We have a guess.
+    dst.Resize(height*width*channels, 1);
+    std::vector<size_t> guessed_dims = {channels, height, width};
+    // Decode the image.
+    cv::Mat cv_dst = lbann::utils::get_opencv_mat(dst, guessed_dims);
+    cv::Mat real_decoded = cv::imdecode(cv_encoded,
+                                        cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH,
+                                        &cv_dst);
+    // For now we only support 8-bit 1- or 3-channel images.
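+    // (The third argument to imdecode above is a destination hint: since
+    // OpenCV 3, imdecode can try to decode directly into that Mat. The
+    // result is only guaranteed to alias cv_dst when the guessed geometry
+    // matched, hence the pointer comparison and fallback copy below.)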
+ if (real_decoded.type() != CV_8UC1 && real_decoded.type() != CV_8UC3) { + LBANN_ERROR("Only support 8-bit 1- or 3-channel images, cannot load " + filename); + } + dims = {real_decoded.type() == CV_8UC1 ? 1ull : 3ull, + static_cast(real_decoded.rows), + static_cast(real_decoded.cols)}; + // If we did not guess the size right, need to copy. + if (real_decoded.ptr() != dst.Buffer()) { + dst.Resize(lbann::utils::get_linearized_size(dims), 1); + cv_dst = lbann::utils::get_opencv_mat(dst, dims); + real_decoded.copyTo(cv_dst); + } + } else { + cv::Mat decoded = cv::imdecode(cv_encoded, + cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH); + if (decoded.type() != CV_8UC1 && decoded.type() != CV_8UC3) { + LBANN_ERROR("Only support 8-bit 1- or 3-channel images, cannot load " + filename); + } + dims = {decoded.type() == CV_8UC1 ? 1ull : 3ull, + static_cast(decoded.rows), + static_cast(decoded.cols)}; + // Copy to dst. + dst.Resize(lbann::utils::get_linearized_size(dims), 1); + cv::Mat cv_dst = lbann::utils::get_opencv_mat(dst, dims); + decoded.copyTo(cv_dst); + } +} + +} // anonymous namespace + +namespace lbann { + +void load_image(const std::string& filename, El::Matrix& dst, + std::vector& dims) { + // Load the encoded image. + El::Matrix buf; + size_t encoded_size; + read_file_to_buf(filename, buf, encoded_size); + opencv_decode(buf, dst, dims, filename); +} + +void decode_image(El::Matrix& src, El::Matrix& dst, + std::vector& dims) { + opencv_decode(src, dst, dims, "encoded image"); +} + +void save_image(const std::string& filename, El::Matrix& src, + const std::vector& dims) { + cv::Mat cv_src = utils::get_opencv_mat(src, dims); + if (!cv::imwrite(filename, cv_src)) { + LBANN_ERROR("Could not save image to " + filename); + } +} + +void save_image(const std::string& filename, const CPUMat& src, + const std::vector& dims) { + if (dims.size() != 3 || (dims[0] != 1 && dims[0] != 3)) { + LBANN_ERROR("Unsupported dimensions for saving an image."); + } + // Need to convert to uint8_t matrix in OpenCV format. + // We will normalize to [0, 1], then map to [0, 255]. + const size_t size = utils::get_linearized_size(dims); + El::Matrix cv_mat = El::Matrix(size, 1); + // Find the minimum and maximum to normalize with. + const DataType* __restrict__ src_buf = src.LockedBuffer(); + DataType min = std::numeric_limits::max(); + DataType max = std::numeric_limits::lowest(); + for (size_t i = 0; i < size; ++i) { + min = std::min(min, src_buf[i]); + max = std::max(max, src_buf[i]); + } + const DataType norm_denom = max - min; + // Construct the OpenCV buffer. 
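+  // The loops below write in OpenCV's interleaved (channels-last) order,
+  // i.e. destination index dims[0]*(col + row*dims[2]) + channel. They
+  // also assume max > min: a constant-valued image would leave norm_denom
+  // at zero and make the normalization divide by zero.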
+ uint8_t* __restrict__ cv_buf = cv_mat.Buffer(); + for (size_t channel = 0; channel < dims[0]; ++channel) { + const size_t src_offset = channel*dims[1]*dims[2]; + for (size_t col = 0; col < dims[2]; ++col) { + for (size_t row = 0; row < dims[1]; ++row) { + const DataType norm_src_val = + (src_buf[src_offset + row + col*dims[1]] - min) / norm_denom; + cv_buf[dims[0]*(col + row*dims[2]) + channel] = + static_cast(norm_src_val * 255); + } + } + } + save_image(filename, cv_mat, dims); +} + +} // namespace lbann diff --git a/src/utils/unit_test/CMakeLists.txt b/src/utils/unit_test/CMakeLists.txt index 3578391b3a4..69c29e8632e 100644 --- a/src/utils/unit_test/CMakeLists.txt +++ b/src/utils/unit_test/CMakeLists.txt @@ -1,6 +1,7 @@ set_full_path(_DIR_LBANN_CATCH2_TEST_FILES any_test.cpp factory_test.cpp + image_test.cpp type_erased_matrix_test.cpp ) diff --git a/src/utils/unit_test/image_test.cpp b/src/utils/unit_test/image_test.cpp new file mode 100644 index 00000000000..8f2cdaa564c --- /dev/null +++ b/src/utils/unit_test/image_test.cpp @@ -0,0 +1,51 @@ +// MUST include this +#include + +// File being tested +#include + +// Hide by default because this will create a file. +TEST_CASE("Testing image utils", "[.image-utils][utilities]") { + SECTION("JPEG") { + std::string filename = "test.jpg"; + lbann::CPUMat image; + // Make this a 3-channel image. + image.Resize(3*32*32, 1); + { + lbann::DataType* buf = image.Buffer(); + for (size_t channel = 0; channel < 3; ++channel) { + for (size_t col = 0; col < 32; ++col) { + for (size_t row = 0; row < 32; ++row) { + const size_t i = channel*32*32 + row+col*32; + if (row == col) { buf[i] = 1.0f; } + else { buf[i] = 0.0f; } + } + } + } + } + SECTION("save image") { + std::vector dims = {3, 32, 32}; + REQUIRE_NOTHROW(lbann::save_image(filename, image, dims)); + } + SECTION("load image") { + El::Matrix loaded_image; + std::vector dims; + REQUIRE_NOTHROW(lbann::load_image(filename, loaded_image, dims)); + REQUIRE(dims.size() == 3); + REQUIRE(dims[0] == 3); + REQUIRE(dims[1] == 32); + REQUIRE(dims[2] == 32); + const uint8_t* buf = loaded_image.LockedBuffer(); + for (size_t channel = 0; channel < 3; ++channel) { + for (size_t col = 0; col < 32; ++col) { + for (size_t row = 0; row < 32; ++row) { + const size_t i = 3*(col+row*32) + channel; + if (row == col) { REQUIRE(buf[i] == 255); } + // Turns out JPEG doesn't encode every pixel to exactly 0. 
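+          // (Lossy DCT quantization can leave near-zero pixels at 1, so
+          // the check below allows a tolerance of one grey level.)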
+ else { REQUIRE(buf[i] <= 1); } + } + } + } + } + } +} diff --git a/superbuild/opencv/AddOpenCVOptions.cmake b/superbuild/opencv/AddOpenCVOptions.cmake index f1e8439b09f..c45c26e390e 100644 --- a/superbuild/opencv/AddOpenCVOptions.cmake +++ b/superbuild/opencv/AddOpenCVOptions.cmake @@ -2,7 +2,7 @@ option(OPENCV_BUILD_opencv_core "OpenCV: Enable core module" ON) option(OPENCV_BUILD_opencv_flann "OpenCV: Enable flann module" OFF) option(OPENCV_BUILD_opencv_imgproc "OpenCV: Enable imgproc module" ON) -option(OPENCV_BUILD_opencv_highgui "OpenCV: Enable highgui module" ON) +option(OPENCV_BUILD_opencv_highgui "OpenCV: Enable highgui module" OFF) option(OPENCV_BUILD_opencv_features2d "OpenCV: Enable features2d module" OFF) option(OPENCV_BUILD_opencv_calib3d "OpenCV: Enable calib3d module" OFF) option(OPENCV_BUILD_opencv_ml "OpenCV: Enable ml module" OFF) diff --git a/superbuild/opencv/CMakeLists.txt b/superbuild/opencv/CMakeLists.txt index 520b67816bf..493318c3333 100644 --- a/superbuild/opencv/CMakeLists.txt +++ b/superbuild/opencv/CMakeLists.txt @@ -41,7 +41,7 @@ else () CACHE STRING "The URL from which to clone OpenCV") endif () -set(OPENCV_TAG "3.4.5" +set(OPENCV_TAG "4.1.0" CACHE STRING "The git tag or hash to checkout for OpenCV") include(ExternalProject) diff --git a/tests/test_img_pipeline/CMakeLists.txt b/tests/test_img_pipeline/CMakeLists.txt deleted file mode 100644 index c5f40f967a4..00000000000 --- a/tests/test_img_pipeline/CMakeLists.txt +++ /dev/null @@ -1,81 +0,0 @@ -project(imgpipe) -cmake_minimum_required(VERSION 3.8) -cmake_policy(SET CMP0015 NEW) - -set(COMPILER "gnu") -#set(CLUSTER "surface") # only usable with non-custom built MPI that is detected by SetupMPI.cmake below -set(CLUSTER "catalyst") -set(LBANN_DIR ../..) -set(LBANN_BUILD_DIR ${LBANN_DIR}/build/${COMPILER}.Release.${CLUSTER}.llnl.gov/install) -include(${LBANN_DIR}/cmake/modules/SetupMPI.cmake) -include(${LBANN_DIR}/cmake/modules/SetupOpenMP.cmake) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -set(LBANN_INCLUDE_DIR ${LBANN_BUILD_DIR}/include) -include_directories(${LBANN_INCLUDE_DIR}) - -set(IMGPIPE_EXE imgpipe) -set(IMGPIPE_SRCS main.cpp) - - -set(WITH_OPENCL OFF) - -add_definitions(-Wall) -add_definitions(-O2) -add_definitions(-g) -add_definitions(-std=c++11) -add_definitions(-DLBANN_HAS_OPENCV) - - -#list(APPEND OpenCV_DIR /usr/local/tools/opencv-3.0.0) -#list(APPEND OpenCV_DIR /usr) -#find_package(OpenCV QUIET HINTS ${OpenCV_DIR}) -#message(STATUS "OpenCV_DIR: ${OpenCV_DIR}") - -if(NOT OpenCV_FOUND) - set(OpenCV_DIR ${LBANN_BUILD_DIR}) - set(OpenCV_LIBS "libopencv_highgui.so;libopencv_imgproc.so;libopencv_imgcodecs.so;libopencv_core.so") - set(OpenCV_INCLUDE_DIRS "${OpenCV_DIR}/include") - set(OpenCV_LIB_DIR "${OpenCV_DIR}/lib") - message(STATUS "OpenCV_DIR: ${OpenCV_DIR}") -endif() - -include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) -link_directories(${OpenCV_LIB_DIR}) - - -find_package(MPI REQUIRED) -message(STATUS "Found MPI: ${MPI_CXX_COMPILER} ${MPI_C_COMPILER} ${MPI_Fortran_COMPILER}") -include_directories(${MPI_CXX_INCLUDE_PATH}) - -list (APPEND Hydrogen_DIR ${LBANN_BUILD_DIR}) -message(STATUS "Hydrogen_DIR: ${Hydrogen_DIR}") - -include_directories(SYSTEM ${Hydrogen_INCLUDE_DIRS}) -link_directories(${Hydrogen_DIR}/lib) -set(Hydrogen_LIBS "${Hydrogen_LIBRARIES};-lHydrogen;-lpmrrr;-lopenblas;-lpthread") - -file(GLOB IMGPIPE_DEPEND_SRCS - lbann/utils/random.cpp - ${LBANN_DIR}/src//utils/file_utils.cpp - ${LBANN_DIR}/src/data_readers/image_utils.cpp - ${LBANN_DIR}/src/data_readers/cv_augmenter.cpp 
- ${LBANN_DIR}/src/data_readers/cv_colorizer.cpp - ${LBANN_DIR}/src/data_readers/cv_cropper.cpp - ${LBANN_DIR}/src/data_readers/cv_decolorizer.cpp - ${LBANN_DIR}/src/data_readers/cv_mean_extractor.cpp - ${LBANN_DIR}/src/data_readers/cv_normalizer.cpp - ${LBANN_DIR}/src/data_readers/cv_process.cpp - ${LBANN_DIR}/src/data_readers/cv_process_patches.cpp - ${LBANN_DIR}/src/data_readers/cv_resizer.cpp - ${LBANN_DIR}/src/data_readers/cv_subtractor.cpp - ${LBANN_DIR}/src/data_readers/cv_transform.cpp - ${LBANN_DIR}/src/data_readers/cv_utils.cpp - ${LBANN_DIR}/src/data_readers/patchworks/patchworks.cpp - ${LBANN_DIR}/src/data_readers/patchworks/patchworks_patch_descriptor.cpp - ${LBANN_DIR}/src/data_readers/patchworks/patchworks_ROI.cpp - ${LBANN_DIR}/src/data_readers/patchworks/patchworks_stats.cpp) - -link_directories(${LBANN_DIR}/lib64) -link_directories(${LBANN_DIR}/lib) -add_executable(${IMGPIPE_EXE} ${IMGPIPE_DEPEND_SRCS} ${IMGPIPE_SRCS}) -target_link_libraries(${IMGPIPE_EXE} ${OpenCV_LIBS} ${Hydrogen_LIBS} ${MPI_CXX_LIBRARIES} ${OpenMP_CXX_LIBRARIES} ${OpenMP_Fortran_LIBRARIES}) diff --git a/tests/test_img_pipeline/Mat.hpp b/tests/test_img_pipeline/Mat.hpp deleted file mode 120000 index 8a49f86123e..00000000000 --- a/tests/test_img_pipeline/Mat.hpp +++ /dev/null @@ -1 +0,0 @@ -../../tools/compute_mean/Mat.hpp \ No newline at end of file diff --git a/tests/test_img_pipeline/README.txt b/tests/test_img_pipeline/README.txt deleted file mode 100644 index df981490eee..00000000000 --- a/tests/test_img_pipeline/README.txt +++ /dev/null @@ -1,42 +0,0 @@ -check Elemental_DIR in CMakeList.txt -This requires OpenCV, Elemental, and MPI. cmake will attempt to find these -under system or LBANN build directories. -To set the LBANN build directory, set CLUSTER variable in CMakeList.txt - -Make sure if the compiler supports c++11, and the environment viriables, CC and CXX, are set. -e.g., -CC=gcc -CXX=g++ - -Then, use the sequence of following commands: - mkdir build - cd build - cmake .. - make - cd .. - -run it as -build/imgpipe image_filename w h r rw rh bsz a n ni - - The parameters w, h, c, rw and rh are for cropper - w: the final crop width of image - h: the final crop height of image - (w and h are dictated whether by cropping images to the size) - r: whether to randomize the crop position within the center region (0|1) - rw: The width of the center region with respect to w after resizig the raw image - rh: The height of the center region with respect to h after resizing the raw image - Raw image will be resized to an image of size rw x rh around the center, - which covers area of the original image as much as possible while preseving - the aspect ratio of object in the image - - bsz: The batch size for mean extractor - if 0, turns off te mean extractor - - a: whether to use augmenter (0|1) - - n: whether to use normalizer (0=none|1=channel-wise|2=pixel-wise) - - ni: The number of iterations. 
- must be greater than 0 - -e.g., build/imgpipe img.jpg 240 240 1 256 256 4 0 0 8 diff --git a/tests/test_img_pipeline/include b/tests/test_img_pipeline/include deleted file mode 120000 index 38e8790d9e4..00000000000 --- a/tests/test_img_pipeline/include +++ /dev/null @@ -1 +0,0 @@ -../../include/lbann/data_readers \ No newline at end of file diff --git a/tests/test_img_pipeline/lbann b/tests/test_img_pipeline/lbann deleted file mode 120000 index acaf439b382..00000000000 --- a/tests/test_img_pipeline/lbann +++ /dev/null @@ -1 +0,0 @@ -../../tools/compute_mean/lbann \ No newline at end of file diff --git a/tests/test_img_pipeline/lbann_config.hpp b/tests/test_img_pipeline/lbann_config.hpp deleted file mode 120000 index b596fc812e3..00000000000 --- a/tests/test_img_pipeline/lbann_config.hpp +++ /dev/null @@ -1 +0,0 @@ -../../tools/compute_mean/lbann_config.hpp \ No newline at end of file diff --git a/tests/test_img_pipeline/main.cpp b/tests/test_img_pipeline/main.cpp deleted file mode 100644 index de104321b98..00000000000 --- a/tests/test_img_pipeline/main.cpp +++ /dev/null @@ -1,350 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "lbann/data_readers/image_utils.hpp" -#include "lbann/data_readers/cv_process.hpp" -#include "lbann/utils/file_utils.hpp" - - -struct cropper_params { - bool m_is_set; - bool m_rand_center; - bool m_adaptive_interpolation; - std::pair m_crop_sz; - std::pair m_roi_sz; - - cropper_params(void) - : m_is_set(false), - m_rand_center(false), - m_adaptive_interpolation(false), - m_crop_sz(std::make_pair(0, 0)), - m_roi_sz(std::make_pair(0,0)) {} -}; - -struct resizer_params { - bool m_is_set; - unsigned int m_width; - unsigned int m_height; - bool m_adaptive_interpolation; - resizer_params(void) - : m_is_set(false), - m_width(0u), - m_height(0u), - m_adaptive_interpolation(false) {} -}; - -struct augmenter_params { - bool m_is_set; - bool m_hflip; - bool m_vflip; - float m_rot; - float m_hshift; - float m_vshift; - float m_shear; - - augmenter_params(void) - : m_is_set(false), - m_hflip(false), - m_vflip(false), - m_rot(0.0f), - m_hshift(0.0f), - m_vshift(0.0f), - m_shear(0.0f) {} -}; - -struct main_params { - enum normalizer_type {_NONE_,_CHANNEL_WISE_,_PIXEL_WISE_}; - unsigned int m_num_bytes; - bool m_enable_cropper; - bool m_enable_resizer; - bool m_enable_augmenter; - bool m_enable_colorizer; - bool m_enable_decolorizer; - bool m_enable_mean_extractor; - normalizer_type m_enable_normalizer; - unsigned int m_mean_batch_size; - unsigned int m_num_iter; - std::string m_mean_image_name; - - bool is_normalizer_off() const { return (m_enable_normalizer == _NONE_); } - bool is_channel_wise_normalizer() const { return (m_enable_normalizer == _CHANNEL_WISE_); } - bool is_pixel_wise_normalizer() const { return (m_enable_normalizer == _PIXEL_WISE_); } - - main_params(void) - : m_num_bytes(0u), - m_enable_cropper(true), - m_enable_resizer(false), - m_enable_augmenter(false), - m_enable_colorizer(false), - m_enable_decolorizer(false), - m_enable_mean_extractor(true), - m_enable_normalizer(_NONE_), - m_mean_batch_size(1024u), - m_num_iter(1u) {} -}; - -bool test_image_io(const std::string filename, const main_params& op, const cropper_params& rp, const resizer_params& sp, const augmenter_params& ap); - -void show_help(std::string name); - -//----------------------------------------------------------------------------- -int main(int argc, char *argv[]) { - - if (argc != 11) { - show_help(argv[0]); - return 0; - } - - std::string filename = 
argv[1]; - - main_params mp; - mp.m_enable_cropper = true; - // to test resizer manually swap m_enalbe_cropper/resizer - mp.m_enable_resizer = false; - mp.m_enable_augmenter = static_cast(atoi(argv[8])); - mp.m_enable_colorizer = true; - mp.m_enable_decolorizer = false; - mp.m_enable_normalizer = static_cast(atoi(argv[9])); - if (mp.is_pixel_wise_normalizer()) mp.m_mean_image_name = "mean.png"; - mp.m_mean_batch_size = atoi(argv[7]); - mp.m_enable_mean_extractor = (mp.m_mean_batch_size > 0); - mp.m_num_iter = atoi(argv[10]); - - cropper_params rp; - if (mp.m_enable_cropper) { - rp.m_is_set = true; - rp.m_crop_sz.first = atoi(argv[2]); - rp.m_crop_sz.second = atoi(argv[3]); - rp.m_rand_center = static_cast(atoi(argv[4])); - rp.m_roi_sz.first = atoi(argv[5]); - rp.m_roi_sz.second = atoi(argv[6]); - //rp.m_adaptive_interpolation = true; - } - - resizer_params sp; - if (mp.m_enable_resizer) { - sp.m_is_set = true; - sp.m_width = static_cast(atoi(argv[2])); - sp.m_height = static_cast(atoi(argv[3])); - //sp.m_adaptive_interpolation = true; - } - - augmenter_params ap; - if (mp.m_enable_augmenter) { - ap.m_is_set = true; - ap.m_rot = 0.1; - ap.m_shear = 0.2; - ap.m_vflip = true; - } - - // read write test with converting to/from a serialized buffer - bool ok = test_image_io(filename, mp, rp, sp, ap); - if (!ok) { - std::cout << "Test failed" << std::endl; - return 0; - } - std::cout << "Complete!" << std::endl; - - return 0; -} - -//----------------------------------------------------------------------------- -void show_help(std::string name) { - std::cout << "Usage: > " << name << " image_filename w h r rw rh bsz a n ni" << std::endl; - std::cout << std::endl; - std::cout << " The parameters w, h, c, rw and rh are for cropper" << std::endl; - std::cout << " w: the final crop width of image" << std::endl; - std::cout << " h: the final crop height of image" << std::endl; - std::cout << " (w and h are dictated whether by cropping images to the size)" << std::endl; - std::cout << " r: whether to randomize the crop position within the center region (0|1)" << std::endl; - std::cout << " rw: The width of the center region with respect to w after resizig the raw image" << std::endl; - std::cout << " rh: The height of the center region with respect to h after resizing the raw image" << std::endl; - std::cout << " Raw image will be resized to an image of size rw x rh around the center," << std::endl; - std::cout << " which covers area of the original image as much as possible while preseving" << std::endl; - std::cout << " the aspect ratio of object in the image" << std::endl; - std::cout << std::endl; - std::cout << " bsz: The batch size for mean extractor" << std::endl; - std::cout << " if 0, turns off te mean extractor" << std::endl; - std::cout << std::endl; - std::cout << " a: whether to use augmenter (0|1)" << std::endl; - std::cout << std::endl; - std::cout << " n: whether to use normalizer (0=none|1=channel-wise|2=pixel-wise)" << std::endl; - std::cout << std::endl; - std::cout << " ni: The number of iterations." 
<< std::endl; - std::cout << " must be greater than 0" << std::endl; -} - -void show_image_size(const int width, const int height, const int type) { - const int depth = CV_MAT_DEPTH(type); - const int NCh = CV_MAT_CN(type); - const int esz = CV_ELEM_SIZE(depth); - std::cout << "Image size : " << width << " x " << height << std::endl; - std::cout << "Number of channels : " << NCh << std::endl; - std::cout << "Size of the channel value type : " << esz << std::endl; - std::cout << "Total bytes : " << width *height *NCh *esz << std::endl; -} - -void write_file(const std::string filename, const std::vector& buf) { - std::ofstream file(filename, std::ios::out | std::ios::binary); - file.write((const char *) buf.data(), buf.size() * sizeof(unsigned char)); - file.close(); -} - -//----------------------------------------------------------------------------- -bool test_image_io(const std::string filename, - const main_params& mp, - const cropper_params& rp, - const resizer_params& sp, - const augmenter_params& ap) -{ - - int transform_idx = 0; - int mean_extractor_idx = -1; - unsigned int num_bytes = mp.m_num_bytes; // size of image in bytes - - lbann::cv_process pp; - { // Initialize the image processor - if (rp.m_is_set) { // If cropper parameters are given - // Setup a cropper - std::unique_ptr cropper(new(lbann::cv_cropper)); - cropper->set(rp.m_crop_sz.first, rp.m_crop_sz.second, rp.m_rand_center, rp.m_roi_sz, rp.m_adaptive_interpolation); - pp.add_transform(std::move(cropper)); - num_bytes = rp.m_crop_sz.first * rp.m_crop_sz.second * 3; - transform_idx ++; - } - - if (sp.m_is_set) { // If resizer parameters are given - // Setup a cropper - std::unique_ptr resizer(new(lbann::cv_resizer)); - resizer->set(sp.m_width, sp.m_height, rp.m_adaptive_interpolation); - pp.add_transform(std::move(resizer)); - num_bytes = sp.m_width * sp.m_height * 3; - transform_idx ++; - } - - if (ap.m_is_set) { // Set up an augmenter - std::unique_ptr augmenter(new(lbann::cv_augmenter)); - augmenter->set(ap.m_hflip, ap.m_vflip, ap.m_rot, ap.m_hshift, ap.m_vshift, ap.m_shear); - pp.add_transform(std::move(augmenter)); - transform_idx ++; - } - - if (mp.m_enable_colorizer) { // Set up a colorizer - std::unique_ptr colorizer(new(lbann::cv_colorizer)); - pp.add_transform(std::move(colorizer)); - transform_idx ++; - } - - if (mp.m_enable_decolorizer) { // Set up a colorizer - std::unique_ptr decolorizer(new(lbann::cv_decolorizer)); - pp.add_transform(std::move(decolorizer)); - transform_idx ++; - } - - if (mp.m_enable_mean_extractor) { // set up a mean extractor - mean_extractor_idx = transform_idx; - std::unique_ptr mean_extractor(new(lbann::cv_mean_extractor)); - if (rp.m_is_set) - mean_extractor->set(rp.m_crop_sz.first, rp.m_crop_sz.second, 3, mp.m_mean_batch_size); - else - mean_extractor->set(mp.m_mean_batch_size); - pp.add_transform(std::move(mean_extractor)); - transform_idx ++; - } - - if (!mp.is_normalizer_off()) { // Set up a normalizer - if (mp.is_channel_wise_normalizer()) { - std::unique_ptr normalizer(new(lbann::cv_normalizer)); - normalizer->z_score(true); - pp.add_normalizer(std::move(normalizer)); - } else { - std::unique_ptr normalizer(new(lbann::cv_subtractor)); -#if 0 - cv::Mat img_to_sub = cv::imread(mp.m_mean_image_name); - if (img_to_sub.empty()) { - std::cout << mp.m_mean_image_name << " does not exist" << std::endl; - return false; - } - normalizer->set_mean(img_to_sub); -#else - std::vector mean = {0.40625, 0.45703, 0.48047}; - normalizer->set_mean(mean); - std::vector stddev = {0.3, 0.5, 0.3}; - 
normalizer->set_stddev(stddev); -#endif - pp.add_normalizer(std::move(normalizer)); - } - transform_idx ++; - } - } - - // Load an image bytestream into memory - std::vector buf; - bool ok = lbann::load_file(filename, buf); - if (!ok) { - std::cout << "Failed to load" << std::endl; - return false; - } - - int width = 0; - int height = 0; - int type = 0; - - ::Mat Images; - ::Mat Image_v; // matrix view - Images.Resize(((num_bytes==0)? 1: num_bytes), 2); // minibatch - - size_t img_begin = 0; - size_t img_end = buf.size(); - for (unsigned int i=0; i < mp.m_num_iter; ++i) - { - // This has nothing to do with the image type but only to create view on a block of bytes - using InputBuf_T = lbann::cv_image_type; - // Construct a zero copying view to a portion of a preloaded data buffer - const cv::Mat inbuf(1, (img_end - img_begin), InputBuf_T::T(1), &(buf[img_begin])); - - if (num_bytes == 0) { - ok = lbann::image_utils::import_image(inbuf, width, height, type, pp, Images); - num_bytes = Images.Height(); - El::View(Image_v, Images, El::IR(0, num_bytes), El::IR(0, 1)); - } else { - El::View(Image_v, Images, El::IR(0, num_bytes), El::IR(0, 1)); - //ok = lbann::image_utils::import_image(buf, width, height, type, pp, Image_v); - ok = lbann::image_utils::import_image(inbuf, width, height, type, pp, Image_v); - } - if (!ok) { - std::cout << "Failed to import" << std::endl; - return false; - } - //if ((i%3 == 0u) && (mp.m_enable_mean_extractor)) { - // dynamic_cast(pp.get_transform(mean_extractor_idx))->reset(); - //} - } - - // Print out transforms - const unsigned int num_transforms = pp.get_num_transforms(); - const std::vector >& transforms = pp.get_transforms(); - - for(unsigned int i=0u; i < num_transforms; ++i) { - std::cout << std::endl << "------------ transform " << i << "-------------" << std::endl; - std::cout << *transforms[i] << std::endl; - } - - if (mp.m_enable_mean_extractor) { - // Extract the mean of images - cv::Mat mean_image; - mean_image = dynamic_cast(pp.get_transform(mean_extractor_idx))->extract(); - cv::imwrite("mean.png", mean_image); - } - - // Export the unnormalized image - const std::string ext = lbann::get_ext_name(filename); - std::vector outbuf; - ok = lbann::image_utils::export_image(ext, outbuf, width, height, type, pp, Image_v); - write_file("copy." + ext, outbuf); - return ok; -} diff --git a/tests/test_img_pipeline/src b/tests/test_img_pipeline/src deleted file mode 120000 index 7b97ad049e9..00000000000 --- a/tests/test_img_pipeline/src +++ /dev/null @@ -1 +0,0 @@ -../../src/data_readers \ No newline at end of file diff --git a/tests/test_patchworks/CMakeLists.txt b/tests/test_patchworks/CMakeLists.txt deleted file mode 100644 index 9beaf7aad71..00000000000 --- a/tests/test_patchworks/CMakeLists.txt +++ /dev/null @@ -1,81 +0,0 @@ -project(patchworks) -cmake_minimum_required(VERSION 3.8) -cmake_policy(SET CMP0015 NEW) - -set(COMPILER "gnu") -#set(CLUSTER "surface") -set(CLUSTER "catalyst") -set(LBANN_DIR ../..) 
-set(LBANN_BUILD_DIR ${LBANN_DIR}/build/${COMPILER}.Release.${CLUSTER}.llnl.gov/install) -include(${LBANN_DIR}/cmake/modules/SetupMPI.cmake) -include(${LBANN_DIR}/cmake/modules/SetupOpenMP.cmake) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -set(LBANN_INCLUDE_DIR ${LBANN_BUILD_DIR}/include) -include_directories(${LBANN_INCLUDE_DIR}) - -set(PATCHWORKS_EXE patchworks) -set(PATCHWORKS_SRCS main.cpp patchworks_image.cpp patchworks_utils.cpp) - - -set(WITH_OPENCL OFF) - -add_definitions(-Wall) -add_definitions(-O2) -add_definitions(-g) -add_definitions(-std=c++11) -add_definitions(-DLBANN_HAS_OPENCV) - - -#list(APPEND OpenCV_DIR /usr/local/tools/opencv-3.0.0) -#list(APPEND OpenCV_DIR /usr) -#find_package(OpenCV QUIET HINTS ${OpenCV_DIR}) -#message(STATUS "OpenCV_DIR: ${OpenCV_DIR}") - -if(NOT OpenCV_FOUND) - set(OpenCV_DIR ${LBANN_BUILD_DIR}) - set(OpenCV_LIBS "libopencv_highgui.so;libopencv_imgproc.so;libopencv_imgcodecs.so;libopencv_core.so") - set(OpenCV_INCLUDE_DIRS "${OpenCV_DIR}/include") - set(OpenCV_LIB_DIR "${OpenCV_DIR}/lib") - message(STATUS "OpenCV_DIR: ${OpenCV_DIR}") -endif() - -include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) -link_directories(${OpenCV_LIB_DIR}) - - -find_package(MPI REQUIRED) -message(STATUS "Found MPI: ${MPI_CXX_COMPILER} ${MPI_C_COMPILER} ${MPI_Fortran_COMPILER}") -include_directories(${MPI_CXX_INCLUDE_PATH}) - -list (APPEND Hydrogen_DIR ${LBANN_BUILD_DIR}) -message(STATUS "Hydrogen_DIR: ${Hydrogen_DIR}") - -include_directories(SYSTEM ${Hydrogen_INCLUDE_DIRS}) -link_directories(${Hydrogen_DIR}/lib) -set(Hydrogen_LIBS "${Hydrogen_LIBRARIES};-lHydrogen;-lpmrrr;-lopenblas;-lpthread") - -file(GLOB PATCHWORKS_DEPEND_SRCS - lbann/utils/random.cpp - ${LBANN_DIR}/src//utils/file_utils.cpp - ${LBANN_DIR}/src/data_readers/image_utils.cpp - ${LBANN_DIR}/src/data_readers/cv_augmenter.cpp - ${LBANN_DIR}/src/data_readers/cv_colorizer.cpp - ${LBANN_DIR}/src/data_readers/cv_cropper.cpp - ${LBANN_DIR}/src/data_readers/cv_decolorizer.cpp - ${LBANN_DIR}/src/data_readers/cv_mean_extractor.cpp - ${LBANN_DIR}/src/data_readers/cv_normalizer.cpp - ${LBANN_DIR}/src/data_readers/cv_process.cpp - ${LBANN_DIR}/src/data_readers/cv_process_patches.cpp - ${LBANN_DIR}/src/data_readers/cv_resizer.cpp - ${LBANN_DIR}/src/data_readers/cv_subtractor.cpp - ${LBANN_DIR}/src/data_readers/cv_transform.cpp - ${LBANN_DIR}/src/data_readers/cv_utils.cpp - ${LBANN_DIR}/src/data_readers/patchworks/patchworks.cpp - ${LBANN_DIR}/src/data_readers/patchworks/patchworks_patch_descriptor.cpp - ${LBANN_DIR}/src/data_readers/patchworks/patchworks_ROI.cpp - ${LBANN_DIR}/src/data_readers/patchworks/patchworks_stats.cpp) - -link_directories(${LBANN_DIR}/lib64) -link_directories(${LBANN_DIR}/lib) -add_executable(${PATCHWORKS_EXE} ${PATCHWORKS_DEPEND_SRCS} ${PATCHWORKS_SRCS}) -target_link_libraries(${PATCHWORKS_EXE} ${OpenCV_LIBS} ${Hydrogen_LIBS} ${MPI_CXX_LIBRARIES} ${OpenMP_CXX_LIBRARIES} ${OpenMP_Fortran_LIBRARIES}) diff --git a/tests/test_patchworks/Mat.hpp b/tests/test_patchworks/Mat.hpp deleted file mode 120000 index d3dc421f3e4..00000000000 --- a/tests/test_patchworks/Mat.hpp +++ /dev/null @@ -1 +0,0 @@ -../test_img_pipeline/Mat.hpp \ No newline at end of file diff --git a/tests/test_patchworks/README.txt b/tests/test_patchworks/README.txt deleted file mode 100644 index f5942a380d2..00000000000 --- a/tests/test_patchworks/README.txt +++ /dev/null @@ -1,21 +0,0 @@ -check Elemental_DIR in CMakeList.txt -This requires OpenCV, Elemental, and MPI. 
cmake will attempt to find these -under system or LBANN build directories. -To set the LBANN build directory, set CLUSTER variable in CMakeList.txt - -Make sure if the compiler supports c++11, and the environment viriables, CC and CXX, are set. -e.g., -CC=gcc -CXX=g++ - -Then, use the sequence of following commands: - mkdir build - cd build - cmake .. - make - cd .. - export OPENCV_OPENCL_DEVICE=:CPU - build/patchWorks imgfile 96 48 7 1 - - The options of the executable are for 96x96 patch size, 48 gap size, 7 jitter size, and - the centering mode to generate all 8 neighbors around the center patch. diff --git a/tests/test_patchworks/include b/tests/test_patchworks/include deleted file mode 120000 index 38e8790d9e4..00000000000 --- a/tests/test_patchworks/include +++ /dev/null @@ -1 +0,0 @@ -../../include/lbann/data_readers \ No newline at end of file diff --git a/tests/test_patchworks/lbann b/tests/test_patchworks/lbann deleted file mode 120000 index acaf439b382..00000000000 --- a/tests/test_patchworks/lbann +++ /dev/null @@ -1 +0,0 @@ -../../tools/compute_mean/lbann \ No newline at end of file diff --git a/tests/test_patchworks/lbann_config.hpp b/tests/test_patchworks/lbann_config.hpp deleted file mode 120000 index b596fc812e3..00000000000 --- a/tests/test_patchworks/lbann_config.hpp +++ /dev/null @@ -1 +0,0 @@ -../../tools/compute_mean/lbann_config.hpp \ No newline at end of file diff --git a/tests/test_patchworks/main.cpp b/tests/test_patchworks/main.cpp deleted file mode 100644 index c0c41d426c6..00000000000 --- a/tests/test_patchworks/main.cpp +++ /dev/null @@ -1,124 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "lbann/base.hpp" -#include "lbann/utils/random.hpp" -#include "lbann/data_readers/patchworks/patchworks.hpp" -#include "patchworks_image.hpp" -#include "lbann/utils/file_utils.hpp" -//#include - - -using namespace lbann::patchworks; - -bool test_patch(const int argc, char *argv[]); - -int main(int argc, char *argv[]) { - if (argc < 2) { - std::cout << "Usage: > " << argv[0] << " filename [patch_size [gap_size [jitter [ceteringMode [ca_mode]]]]]" << std::endl; - return 0; - } - - std::string filename = argv[1]; - - lbann::init_random(); - bool ok = test_patch(argc, argv); - if (!ok) { - std::cout << "failed to copy the image" << std::endl; - return 0; - } - std::cout << "Complete!" 
<< std::endl; - - return 0; -} - -bool test_patch(const int argc, char *argv[]) { - unsigned int patch_size = 96u; - unsigned int gap = 48u; - unsigned int jitter = 7u; - unsigned int mode_centering = 1u; - unsigned int mode_chromaberr = 0u; - std::string filename = argv[1]; - if (argc > 2) { - patch_size = std::atoi(argv[2]); - } - if (argc > 3) { - gap = std::atoi(argv[3]); - } - if (argc > 4) { - jitter = std::atoi(argv[4]); - } - if (argc > 5) { - mode_centering = std::atoi(argv[5]); - } - if (argc > 6) { - mode_chromaberr = std::atoi(argv[6]); - } - - if (patch_size == 0u) { - return false; - } - -#ifdef LBANN_HAS_OPENCV - // load input image - image *img = new image(filename); - if (img->empty()) { - std::cout << "failed to load the image " << filename << std::endl; - return false; - } - img->show_info(); - #if defined(HAVE_GTK) || defined(HAVE_CARBON) - img->display("original " + filename); - #endif - - bool ok = true; - patch_descriptor pi; - pi.set_size(patch_size, patch_size); - ok = pi.set_sample_image(static_cast(img->get_width()), - static_cast(img->get_height())); - if (!ok) { - std::cout << "failed to set patch sampling region" << std::endl; - } - pi.set_gap(gap); - pi.set_jitter(jitter); - pi.set_mode_centering(mode_centering); - pi.set_mode_chromatic_aberration(mode_chromaberr); - pi.set_file_ext("png"); - pi.define_patch_set(); - - std::vector patches; - ok = pi.extract_patches(img->get_image(), patches); - if (!ok) { - std::cout << "failed to extract patch" << std::endl; - } - for (size_t i=0u; i < patches.size(); ++i) { - std::stringstream sstr; - sstr << "patch." << i << ".png"; - image::write(sstr.str(), patches[i]); - } - std::cout << "the id of the last patch generated (label in case of paired patches): " - << pi.get_last_label()+1 << std::endl; - - std::cout << pi; - #if defined(HAVE_GTK) || defined(HAVE_CARBON) - img->draw_patches(pi); - img->display("patches of " + filename); - #endif - - std::string patched_filename = basename_with_no_extention(filename) - + ".patched." 
+ lbann::get_ext_name(filename); - ok = img->write(patched_filename); - if (!ok) { - std::cout << "failed to write patch map" << std::endl; - } - - delete img; - return true; -#else - return false; -#endif // LBANN_HAS_OPENCV -} diff --git a/tests/test_patchworks/patchworks_image.cpp b/tests/test_patchworks/patchworks_image.cpp deleted file mode 100644 index 78f9911e068..00000000000 --- a/tests/test_patchworks/patchworks_image.cpp +++ /dev/null @@ -1,198 +0,0 @@ -#include -#include -#include -#include -#include // sqrt -#include -#include "lbann/data_readers/patchworks/patchworks_stats.hpp" -#include "patchworks_image.hpp" - -#include -#if (!defined(CV_VERSION_EPOCH) && (CV_VERSION_MAJOR >= 3)) -#define DEFAULT_CV_WINDOW_KEEPRATIO cv::WINDOW_KEEPRATIO -#else -#define DEFAULT_CV_WINDOW_KEEPRATIO CV_WINDOW_KEEPRATIO -#endif - -namespace lbann { -namespace patchworks { - -std::string showDepth(const cv::Mat& mat) { - return showDepth(CV_MAT_DEPTH(mat.type())); -} - -std::string showDepth(const int depth) { - switch (depth) { - case CV_8U: - return "CV_8U"; - break; - case CV_8S: - return "CV_8S"; - break; - case CV_16U: - return "CV_16U"; - break; - case CV_16S: - return "CV_16S"; - break; - case CV_32S: - return "CV_32S"; - break; - case CV_32F: - return "CV_32F"; - break; - case CV_64F: - return "CV_64F"; - break; - default: - return "Unknown"; - break; - } - return "Unknown"; -} - -size_t image_data_amount(const cv::Mat& img) { - return static_cast(CV_ELEM_SIZE(img.depth())*CV_MAT_CN(img.type())*img.cols*img.rows); -} - -void show_cvMat_info(const int type) { - const int depth = CV_MAT_DEPTH(type); - std::cout << "showDepth(CV_MAT_DEPTH(img.type())) " << lbann::patchworks::showDepth(depth) << std::endl; - std::cout << "CV_ELEM_SIZE(img.depth()) " << CV_ELEM_SIZE(depth) << std::endl; - std::cout << "CV_ELEM_SIZE(CV_MAT_DEPTH(img.type())) " << CV_ELEM_SIZE(depth) << std::endl; - std::cout << "CV_MAT_CN(img.type()) " << CV_MAT_CN(type) << std::endl; -} - - -image::image(const std::string fname) - : m_screen_width(640), m_screen_height(480) { - detect_screen_resolution(); - if (fname != "") { - load(fname); - } -} - -image::~image(void) { - release(); -} - -void image::release(void) { - m_filename = ""; - if (m_img.data != NULL) { - m_img.release(); - } - m_img.data = NULL; -} - -bool image::load(const std::string fname) { - release(); - - m_img = cv::imread(fname, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH); - - if (m_img.data == NULL) { - return false; - } - - m_filename = fname; - return true; -} - - -std::ostream& image::show_info(const cv::Mat& img, const std::string title, std::ostream& os) { - if (img.data == NULL) { - return os; - } - - os << title << std::endl - << " Type: " << showDepth(img) << std::endl - << " nCh : " << img.channels() << std::endl - << " Size: " << img.size() << std::endl; - - std::vector stats; - get_channel_stats(img, stats); - - for (int ch = 0; ch < img.channels(); ++ch) { - os << "channel " << ch; - os << stats[ch] << std::endl; - } - - return os; -} - -std::ostream& image::show_info(const std::string title, std::ostream& os) const { - show_info(m_img, title, os); - os << " File: " << m_filename << std::endl; - return os; -} - -void image::detect_screen_resolution(void) { - std::vector > res; - unsigned int cnt = get_screen_resolution(res); - - if (cnt == 0u) { // fall back resolution - res.push_back(std::make_pair(640,480)); - } - - m_screen_width = res[0].first; - m_screen_height = res[0].second; -} - -void image::display(const std::string title) const { - if 
(m_img.data == NULL) { - return; // nothing to show - } - - cv::namedWindow(title, cv::WINDOW_NORMAL | DEFAULT_CV_WINDOW_KEEPRATIO); - cv::imshow(title, m_img); - - float zoomFactor = 1.0; - const double initialZoomOutRateW = static_cast(m_screen_width)/m_img.cols; - const double initialZoomOutRateH = static_cast(m_screen_height)/m_img.rows; - const double initialZoomOutRate = std::min(initialZoomOutRateW, initialZoomOutRateH); - - const int eW = static_cast(m_img.cols * initialZoomOutRate); - const int eH = static_cast(m_img.rows * initialZoomOutRate); - - const int m_screen_widthZ = std::min(static_cast(zoomFactor*eW), m_img.cols); - const int m_screen_heightZ = std::min(static_cast(zoomFactor*eH), m_img.rows); - - cv::resizeWindow(title, m_screen_widthZ, m_screen_heightZ); - m_window_title = title; - cv::waitKey(0); -} - -void image::draw_rectangle(cv::Mat& img, const ROI& r) { - const uint16_t chSet = std::numeric_limits::max(); - const cv::Scalar color(0, chSet, chSet); - const int thickness = 2; - const int lineType = 8; - - cv::rectangle(img, - cv::Point(r.left(), r.top()), - cv::Point(r.right(), r.bottom()), - color, thickness, lineType); -} - -void image::draw_rectangle(const ROI& r) { - draw_rectangle(m_img, r); -} - -void image::draw_patches(const patch_descriptor& pi) { - const std::vector& pos = pi.access_positions(); - for (size_t i=0u; i< pos.size(); ++i) { - draw_rectangle(m_img, pos[i]); - } -} - -bool image::write(const std::string out_filename, const cv::Mat& img_to_write) { - if ((out_filename == "") || (img_to_write.data == NULL)) { - return false; - //std::cout << "Failed to write an image file [" << out_filename << "]" << std::endl; - } - //std::cout << "writing an image file [" << out_filename << "]" << std::endl; - return cv::imwrite(out_filename, img_to_write); -} - - -} // end of namespace patchworks -} // end of namespace lbann diff --git a/tests/test_patchworks/patchworks_image.hpp b/tests/test_patchworks/patchworks_image.hpp deleted file mode 100644 index d2515e75888..00000000000 --- a/tests/test_patchworks/patchworks_image.hpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Main image data structure - * Author: Jae-Seung Yeom - */ - -#ifndef _PATCHWORKS_IMAGE_H_INCLUDED_ -#define _PATCHWORKS_IMAGE_H_INCLUDED_ - -#include -#include -#include -#include -#include "lbann/data_readers/patchworks/patchworks_common.hpp" -#include "lbann/data_readers/patchworks/patchworks_ROI.hpp" -#include "lbann/data_readers/patchworks/patchworks_patch_descriptor.hpp" -#include "patchworks_utils.hpp" - -#include -#if (!defined(CV_VERSION_EPOCH) && (CV_VERSION_MAJOR >= 3)) -#include -#define DEFAULT_CV_WINDOW_KEEPRATIO cv::WINDOW_KEEPRATIO -#else -#include -#define DEFAULT_CV_WINDOW_KEEPRATIO CV_WINDOW_KEEPRATIO -#endif - -namespace lbann { -namespace patchworks { - -class image { - public: - protected: - /// The image data (OpenCV Mat type) - cv::Mat m_img; - /// A name to show on an image window title bar - mutable std::string m_window_title; - /// The name of the image file - std::string m_filename; - - // current screen resolution - int m_screen_width; ///< screen width in the number of pixels - int m_screen_height; ///< screen height in the number of pixels - - /// Detect screen resolution to draw window that fits in the screen - virtual void detect_screen_resolution(void); - - public: - image(void) : m_window_title(""), m_filename("") { - detect_screen_resolution(); - } - image(const std::string fname); - virtual ~image(void); - - /// Check if an image data exists - virtual bool 
empty(void) const { - return (m_img.data == NULL); - } - /// Free the space used by image data - virtual void release(void); - /// Return the filename of this image - virtual std::string get_filename(void) const { - return m_filename; - } - /// Read an image file - virtual bool load(const std::string fname); - /// Display the image - virtual void display(const std::string title="") const; - /// Show information on a given image with a given title - static std::ostream& show_info(const cv::Mat& img, const std::string title = "image info", - std::ostream& os = std::cout); - /// Show information on the image with a given title - virtual std::ostream& show_info(const std::string title = "image info", - std::ostream& os = std::cout) const; - - /// Return the width (number of columns) of the image - virtual int get_width(void) const { - return m_img.cols; - } - /// Return the height (number of rows) of the image - virtual int get_height(void) const { - return m_img.rows; - } - - /// Return the number of channels of the images - virtual int get_num_channels(void) const { - return m_img.channels(); - } - /// Return the pixel depth of the image in OpenCV term - virtual int get_depth(void) const { - return m_img.depth(); - } - - /// Returns the access to the image data (OpenCV Mat type) - virtual cv::Mat& get_image(void) { - return m_img; - } - - /// Mark a retangular region on the image - static void draw_rectangle(cv::Mat& img, const ROI& r); - /// Mark a retangular region on the image - virtual void draw_rectangle(const ROI& r); - /// Mark patch regions on the image - virtual void draw_patches(const patch_descriptor& pi); - - /// Write an image into the file with the given file name - static bool write(const std::string outFileName, const cv::Mat& img_to_write); - /// Write the image into the file with the given name - virtual bool write(const std::string outFileName) const { - return write(outFileName, m_img); - } -}; - -std::string showDepth(const cv::Mat& mat); -std::string showDepth(const int depth); -size_t image_data_amount(const cv::Mat& mat); -void show_cvMat_info(const int type); - -} // end of namespace patchworks -} // end of namespace lbann -#endif // _PATCHWORKS_IMAGE_H_INCLUDED_ diff --git a/tests/test_patchworks/patchworks_utils.cpp b/tests/test_patchworks/patchworks_utils.cpp deleted file mode 100644 index 9b682b6821c..00000000000 --- a/tests/test_patchworks/patchworks_utils.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#include "patchworks_utils.hpp" -#include // popen -#include -#include -#include -#include // std::ostringstream - -unsigned int get_screen_resolution(std::vector >& res) { - std::string command = "xrandr | grep '*'"; - FILE *fpipe = (FILE *) popen(command.c_str(),"r"); - char line[256]; - unsigned int cnt = 0u; - res.clear(); - - while ( fgets( line, sizeof(line), fpipe) ) { - //printf("%s", line); - std::string rstr(line); - unsigned int posS=0u; - unsigned int posE=0u; - unsigned int posX=0u; - posX = rstr.find_first_of("xX", posX); - posS = rstr.find_first_of(" \t\r", posS); - posE = rstr.find_first_of(" \t\r", posS); - std::string widthStr = rstr.substr(posS+1, (posX-posS)); - std::string heightStr = rstr.substr(posX+1, (posE-posX)); - std::stringstream wss(widthStr); - std::stringstream hss(heightStr); - int w=0, h=0; - wss >> w; - hss >> h; - res.push_back(std::make_pair(w,h)); - cnt ++; - } - pclose(fpipe); - - return cnt; -} - -struct path_delimiter { - bool operator()( char ch ) const { - return ch == '/'; - } -}; - -bool split_path(const std::string& path, 
std::string& dir, std::string& name) {
-  std::string::const_iterator nb
-    = std::find_if( path.rbegin(), path.rend(), path_delimiter()).base();
-  dir = std::string(path.begin(), nb);
-  name = std::string(nb, path.end());
-  if (name.empty()) {
-    return false;
-  }
-
-  return true;
-}
-
-std::string name_with_no_extention(const std::string filename) {
-  size_t pos = filename.find_last_of('.');
-  if (pos == 0u) {
-    return filename;
-  }
-  return filename.substr(0, pos);
-}
-
-std::string get_file_extention(const std::string filename) {
-  size_t pos = filename.find_last_of('.');
-  if (pos == 0u) {
-    return "";
-  }
-  return filename.substr(pos+1, filename.size());
-}
-
-std::string basename_with_no_extention(const std::string filename) {
-  std::string imgdir;
-  std::string imgfile;
-  split_path(filename, imgdir, imgfile);
-  return name_with_no_extention(imgfile);
-}
diff --git a/tests/test_patchworks/patchworks_utils.hpp b/tests/test_patchworks/patchworks_utils.hpp
deleted file mode 100644
index 71c915b42ea..00000000000
--- a/tests/test_patchworks/patchworks_utils.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Utility routines
- * Author: Jae-Seung Yeom
- */
-#ifndef _PATCHWORKS_UTILS_H_INCLUDED_
-#define _PATCHWORKS_UTILS_H_INCLUDED_
-#include <string>
-#include <utility> // std::pair
-#include "lbann/data_readers/patchworks/patchworks_common.hpp"
-
-/// Obtain the screen resolution, which is useful to size an window
-unsigned int get_screen_resolution(std::vector<std::pair<int, int> >& res);
-
-/// Split a file path into the directory and the file name under it
-bool split_path(const std::string& path, std::string& dir, std::string& name);
-
-/// return the file name without extention
-std::string name_with_no_extention(const std::string filename);
-
-/// return the file extention
-std::string get_file_extention(const std::string filename);
-
-/// return the base file name (respective to its final directory) without extention
-std::string basename_with_no_extention(const std::string filename);
-
-#endif // _PATCHWORKS_UTILS_H_INCLUDED_
diff --git a/tests/test_patchworks/src b/tests/test_patchworks/src
deleted file mode 120000
index 7b97ad049e9..00000000000
--- a/tests/test_patchworks/src
+++ /dev/null
@@ -1 +0,0 @@
-../../src/data_readers
\ No newline at end of file
From 74720ac2d6af0b863badbf73c722b8f67d31ae0e Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Thu, 13 Jun 2019 10:15:49 -0700
Subject: [PATCH 071/634] committing prior to changing branches.
development; nothing to see --- .../lbann/data_readers/data_reader_image.hpp | 1 + .../lbann/data_store/data_store_conduit.hpp | 12 ++- src/data_readers/data_reader.cpp | 24 +++-- src/data_readers/data_reader_image.cpp | 15 ++- src/data_store/data_store_conduit.cpp | 98 ++++++++++++++----- src/utils/lbann_library.cpp | 2 +- 6 files changed, 113 insertions(+), 39 deletions(-) diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index 3c9095af07c..3ac80fcadb2 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -115,6 +115,7 @@ class image_data_reader : public generic_data_reader { int m_num_labels; ///< number of labels void load_conduit_node_from_file(int data_id, conduit::Node &node); + void load_conduit_node_from_file(int data_id, conduit::Node &node, std::vector &data); }; diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index e3a8284416a..2382a41b13e 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -303,10 +303,18 @@ protected : void allocate_shared_segment(int size); std::string m_image_base_dir; - std::vector m_my_files; - std::vector m_my_sizes; + std::vector m_my_files; + std::vector m_my_sizes; void load_files(); + + void *m_mem_seg = 0; + + //m_loaded_images[j] = true if the j'th image has been loaded + bool *m_loaded_images; + + std::vector m_image_filenames; + std::vector m_labels; }; } // namespace lbann diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index 71e7eb0d017..5f55c76ba22 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -710,7 +710,7 @@ double generic_data_reader::get_use_percent() const { void generic_data_reader::instantiate_data_store(const std::vector& local_list_sizes) { options *opts = options::get(); - if (! (opts->get_bool("use_data_store") || opts->get_bool("preload_data_store"))) { + if (! 
(opts->get_bool("use_data_store") || opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache"))) { if (m_data_store != nullptr) { delete m_data_store; m_data_store = nullptr; @@ -738,15 +738,19 @@ void generic_data_reader::instantiate_data_store(const std::vector& local_l // optionally preload the data store if (opts->get_bool("preload_data_store")) { - if(is_master()) { - std::cout << "Starting the preload" << std::endl; - } - if (local_list_sizes.size() != 0) { - m_data_store->build_preloaded_owner_map(local_list_sizes); - } - preload_data_store(); - if(is_master()) { - std::cout << "preload complete" << std::endl; + //TODO: future development: preloading when using store as local cache + if (!opts->get_bool("data_store_cache")) { + + if(is_master()) { + std::cout << "generic_data_reader::instantiate_data_store - Starting the preload" << std::endl; + } + if (local_list_sizes.size() != 0) { + m_data_store->build_preloaded_owner_map(local_list_sizes); + } + preload_data_store(); + if(is_master()) { + std::cout << "preload complete" << std::endl; + } } } diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index 337ffe896b0..926cd553222 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -168,7 +168,7 @@ void image_data_reader::load() { // is modified std::vector local_list_sizes; - if (opts->get_bool("preload_data_store")) { + if (opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache")) { int np = m_comm->get_procs_per_trainer(); int base_files_per_rank = m_image_list.size() / np; int extra = m_image_list.size() - (base_files_per_rank*np); @@ -243,6 +243,17 @@ std::vector image_data_reader::get_image_list_of_cu return ret; } + +void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node &node, std::vector &data) { + node.reset(); + const std::string filename = get_file_dir() + m_image_list[data_id].first; + int label = m_image_list[data_id].second; + read_raw_data(filename, data); + node[LBANN_DATA_ID_STR(data_id) + "/label"].set(label); + node[LBANN_DATA_ID_STR(data_id) + "/buffer"].set(data); + //node[LBANN_DATA_ID_STR(data_id) + "/buffer"].set_char_ptr(data.data(), data.size()); + node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"] = data.size(); +} void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node &node) { node.reset(); const std::string filename = get_file_dir() + m_image_list[data_id].first; @@ -252,7 +263,7 @@ void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node & read_raw_data(filename, data); node[LBANN_DATA_ID_STR(data_id) + "/label"].set(label); node[LBANN_DATA_ID_STR(data_id) + "/buffer"].set(data); - node[LBANN_DATA_ID_STR(data_id) + "/buffer"].set_char_ptr(data.data(), data.size()); + //node[LBANN_DATA_ID_STR(data_id) + "/buffer"].set_char_ptr(data.data(), data.size()); node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"] = data.size(); } diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index ee2e23a3aae..6a032304813 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -32,6 +32,8 @@ #include "lbann/utils/options.hpp" #include "lbann/utils/timer.hpp" #include +#include +#include namespace lbann { @@ -65,16 +67,12 @@ data_store_conduit::data_store_conduit( std::stringstream ss; ss << "debug_" << m_reader->get_role() << "." 
<< m_comm->get_rank_in_world(); m_output.open(ss.str().c_str()); + if (m_world_master) { + std::cout << "opened " << ss.str() << " for writing\n"; + } } m_is_local_cache = opts->get_bool("data_store_cache"); - if (m_is_local_cache && opts->get_bool("preload_data_store")) { - LBANN_ERROR("you cannot use both of these options: --data_store_cache --preload_data_store"); - } - - if (m_is_local_cache) { - get_image_sizes(); - } if (m_world_master) { if (m_is_local_cache) { @@ -91,6 +89,9 @@ data_store_conduit::~data_store_conduit() { if (m_output) { m_output.close(); } + if (m_is_local_cache && m_mem_seg) { + shmdt(m_mem_seg); + } } data_store_conduit::data_store_conduit(const data_store_conduit& rhs) { @@ -193,7 +194,6 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: } void data_store_conduit::setup(int mini_batch_size) { - if (m_world_master) { if (m_super_node) { std::cout << "data store mode: exchange_data via super nodes\n"; @@ -216,6 +216,10 @@ void data_store_conduit::setup(int mini_batch_size) { m_is_setup = true; + if (m_is_local_cache) { + preload_local_cache(); + } + if (m_world_master && !m_preload) { std::cout << "TIME for data_store_conduit setup: " << get_time() - tm1 << "\n"; } @@ -1023,55 +1027,55 @@ int data_store_conduit::get_image_offsets() { const std::string image_list_file = m_reader->get_data_filename(); m_image_base_dir = m_reader->get_file_dir(); FILE *fplist = fopen(image_list_file.c_str(), "rt"); - std::vector image_file_names; int imagelabel; while (!feof(fplist)) { char imagepath[512]; if (fscanf(fplist, "%s%d", imagepath, &imagelabel) <= 1) { break; } - image_file_names.emplace_back(imagepath); + m_image_filenames.emplace_back(imagepath); + m_labels.emplace_back(imagelabel); } fclose(fplist); // get sizes of files for which I'm responsible - for (size_t h=m_rank_in_trainer; hget_file_dir() + '/' + m_image_filenames[h]; std::ifstream in(fn.c_str()); if (!in) { LBANN_ERROR("failed to open " + fn + " for reading"); } in.seekg(0, std::ios::end); m_my_sizes.push_back(in.tellg()); - m_my_files.push_back(image_file_names[h]); + m_my_files.push_back(h); in.close(); } if (m_output) { m_output << "my image sizes:\n"; - for (size_t k=0; k counts(m_np_in_trainer); m_comm->all_gather(&my_count, 1, counts.data(), 1, m_comm->get_trainer_comm()); size_t g_count = std::accumulate(counts.begin(), counts.end(), 0); - if (g_count != image_file_names.size()) { - LBANN_ERROR("g_count != image_file_names.size()"); + if (g_count != m_image_filenames.size()) { + LBANN_ERROR("g_count != m_image_filenames.size()"); } - std::vector work(image_file_names.size()); + std::vector work(m_image_filenames.size()); std::vector disp(m_np_in_trainer); disp[0] = 0; for (size_t h=0; htrainer_all_gather(my_sizes, work, counts, disp); + m_comm->trainer_all_gather(m_my_sizes, work, counts, disp); // fill in m_image_offsets - m_image_offsets.resize(image_file_names.size()+1); + m_image_offsets.resize(m_image_filenames.size()+1); m_image_offsets[0] = 0; for (int rank = 0; rank < m_np_in_trainer; rank++) { size_t offset = disp[rank]; @@ -1082,12 +1086,18 @@ int data_store_conduit::get_image_offsets() { i += m_np_in_trainer; } } - segment_length = m_image_offsets.back(); + segment_length = m_image_offsets.back() + sizeof(bool)*m_image_offsets.size(); if (m_output) { + if (m_world_master) { + std::cout << "image offsets:\n"; + } m_output << "image offsets:\n"; for (size_t h=0; hget_rank_in_node(); + key_t key = ftok(",", 'x'); + int shm_id; if (node_id == 0) { + shm_id = 
shmget(key, size, (IPC_CREAT | 0666));
+    if (shm_id < 0) {
+      LBANN_ERROR("shm_id < 0; shmget() failed to create shared memory segment of " + std::to_string(size) + " bytes");
+    }
+    m_mem_seg = shmat(shm_id, NULL, 0);
+    if (m_mem_seg == (void *) -1) {
+      LBANN_ERROR("call to shmat() failed");
+    }
+    m_loaded_images = (bool*)((char*)m_mem_seg + m_image_offsets.back());
+    for (size_t j=0; j<m_image_filenames.size(); j++) {
+      m_loaded_images[j] = false;
+    }
+  }
+
+  m_comm->barrier(m_comm->get_node_comm());
+
+  if (node_id != 0) {
+    shm_id = shmget(key, size, 0666);
+    if (shm_id < 0) {
+      LBANN_ERROR("shm_id < 0; shmget() failed to get shared memory segment of " + std::to_string(size) + " bytes");
+    }
+    m_mem_seg = shmat(shm_id, NULL, 0);
+    if (m_mem_seg == (void *) -1) {
+      LBANN_ERROR("call to shmat() failed");
+    }
+  }
 }
 
 void data_store_conduit::preload_local_cache() {
   int segment_size = get_image_offsets();
-  allocate_shared_segment(segment_length);
+  allocate_shared_segment(segment_size);
   load_files();
 }
 
 void data_store_conduit::load_files() {
+  for (auto j : m_my_files) {
+    const std::string fn = m_image_base_dir + '/' + m_image_filenames[j];
+    std::ifstream in(fn, std::ios::in | std::ios::binary);
+    if (!in) {
+      LBANN_ERROR("failed to open " + fn + " for binary read");
+    }
+    char *c = (char*)m_mem_seg + m_image_offsets[j];
+    in.read(c, m_image_offsets[j+1] - m_image_offsets[j]);
+    in.close();
+  }
+MPI_Barrier(MPI_COMM_WORLD);
+if (m_world_master) std::cout << "all files loaded\n";
+MPI_Barrier(MPI_COMM_WORLD);
+exit(0);
 }
 
 } // namespace lbann
diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp
index 5c82d30c8b2..e347f6f2df9 100644
--- a/src/utils/lbann_library.cpp
+++ b/src/utils/lbann_library.cpp
@@ -191,7 +191,7 @@ std::unique_ptr<model> build_model_from_prototext(
     ret_model->allow_background_io_activity(false);
   }
 
-  if (opts->get_bool("use_data_store") || opts->get_bool("preload_data_store")) {
+  if (opts->get_bool("use_data_store") || opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache")) {
     if (master) {
      std::cout << "\nUSING DATA STORE!\n\n";
     }
From 260d2ce4508c739bb4e8f8bb842e767e1e7e3054 Mon Sep 17 00:00:00 2001
From: "Brian C. Van Essen"
Date: Thu, 13 Jun 2019 10:53:53 -0700
Subject: [PATCH 072/634] Updated the JAG unit test and added a copy of the
 metadata schema for JAG unit tests
---
 .../prototext/jag_100M_metadata.prototext     | 119 ++++++++++++++++++
 .../unit_tests/prototext/jag_reader.prototext |  64 +---------
 2 files changed, 120 insertions(+), 63 deletions(-)
 create mode 100644 bamboo/unit_tests/prototext/jag_100M_metadata.prototext

diff --git a/bamboo/unit_tests/prototext/jag_100M_metadata.prototext b/bamboo/unit_tests/prototext/jag_100M_metadata.prototext
new file mode 100644
index 00000000000..d76f3155959
--- /dev/null
+++ b/bamboo/unit_tests/prototext/jag_100M_metadata.prototext
@@ -0,0 +1,119 @@
+########################################################################
+# The JAG normalization values were computed over the 10M + 1MA + 1MB random
+# pulls from the 100M data set. The image normalization values were updated
+# on 1/30/2019 using the per-channel average of the pixel values
+# across all views.
+# They are valid for the directories:
+# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B)
+# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B)
+# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B
+# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B)
+########################################################################
+
+data_set_metadata {
+  schema {
+    split_jag_image_channels: true
+
+    # JAG_Image, JAG_Scalar, JAG_Input
+    independent: [ { pieces: [ JAG_Image, JAG_Scalar ] }, { pieces: [ JAG_Input ] } ]
+    dependent: [ { pieces: [ JAG_Input ] } ]
+
+    image_prefix: "/outputs/images/"
+
+    image_width: 64
+    image_height: 64
+    image_num_channels: 4
+
+    jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"]
+
+    scalar_prefix: "/outputs/scalars/"
+
+    # An empty list indicates that all keys are used.
+    # The commented-out variables are not on Jim's original list but are used in the numpy-based format.
+    jag_scalar_keys:
+      [ "BWx",
+        "BT",
+        "tMAXt",        # absent in Jim's list
+        "BWn",
+        "MAXpressure",
+        #"BAte",
+        #"MAXtion",
+        "tMAXpressure",
+        "BAt",          # absent in Jim's list
+        "Yn",
+        "Ye",
+        "Yx",
+        #"tMAXte",      # absent in Jim's list
+        #"BAtion",
+        #"MAXte",
+        #"tMAXtion",    # absent in Jim's list
+        "BTx",
+        "MAXt",         # absent in Jim's list
+        #"BTn",
+        "BApressure",
+        "tMINradius",
+        "MINradius"     # absent in Jim's list
+      ]
+
+    # When using all the keys without explicit selection, key filters can be used
+    # to explicitly exclude particular variables whose keys match a filter.
+    # 'jag_scalar_filters' and 'jag_input_filters' rely on exact key string matching.
+    # 'jag_scalar_prefix_filters' and 'jag_input_prefix_filters' define a filter as
+    # the pair of a prefix substring and a minimum key length.
+    # For example, with the setting below, any key that is at least 26 characters
+    # long and starts with the substring "image_(" is excluded.
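+    # Illustration (hypothetical key, for exposition only): a scalar key such as
+    # "image_(90.0, 78.0)/0.0/emi" is 26 characters long and starts with
+    # "image_(", so the prefix filter below would exclude it, while any key
+    # shorter than 26 characters is kept even if it begins with "image_(".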
+ + jag_scalar_prefix_filters: [ { key_prefix: "image_(" min_len: 26} ] + jag_scalar_filters: [ "iBT" ] + + input_prefix: "/inputs/" + + jag_input_keys: ["shape_model_initial_modes:(4,3)", + "betti_prl15_trans_u", + "betti_prl15_trans_v", + "shape_model_initial_modes:(2,1)", + "shape_model_initial_modes:(1,0)"]; + } + + normalization { + jag_scalar_normalization_params: [ + { scale: 7.610738e+00 bias: -4.075375e-01 }, #BWx + { scale: 1.459875e+00 bias: -3.427656e+00 }, #BT + { scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXt + { scale: 4.375123e+01 bias: -1.593477e+00 }, #BWn + { scale: 1.685576e-06 bias: -5.330971e-01 }, #MAXpressure + #{ scale: 2.636422e-01 bias: -9.762907e-01 }, #BAte + #{ scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXtion + { scale: 1.430615e+00 bias: -3.351173e+00 }, #tMAXpressure + { scale: 2.636422e-01 bias: -9.762907e-01 }, #BAt + { scale: 7.154074e-18 bias: -1.864709e-02 }, #Yn + { scale: 3.166824e-03 bias: -1.864709e-02 }, #Ye + { scale: 2.102178e-02 bias: -3.071955e-01 }, #Yx + #{ scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXte + #{ scale: 2.636422e-01 bias: -9.762907e-01 }, #BAtion + #{ scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXte + #{ scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXtion + { scale: 1.346439e+00 bias: -3.118446e+00 }, #BTx + { scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXt + #{ scale: 1.459875e+00 bias: -3.427656e+00 }, #BTn + { scale: 2.061877e-06 bias: -5.213394e-01 }, #BApressure + { scale: 1.392544e+00 bias: -3.239921e+00 }, #tMINradius + { scale: 6.266253e-02 bias: -1.384504e+00 } #MINradius + ] + + jag_input_normalization_params: [ + { scale: 1.666672e+00 bias: 5.000000e-01 }, #shape_model_initial_modes:(4,3) + { scale: 1.000002e+00 bias: -1.603483e-07 }, #betti_prl15_trans_u + { scale: 1.000001e+00 bias: -1.406672e-06 }, #betti_prl15_trans_v + { scale: 1.666675e+00 bias: 4.999992e-01 }, #shape_model_initial_modes:(2,1) + { scale: 1.666669e+00 bias: 5.000008e-01 } #shape_model_initial_modes:(1,0) + ] + + jag_image_normalization_params: [ + { scale: 2.9258502e+01 bias: 0.0e+00 }, # avg = 0.0341781 + { scale: 8.5826596e+02 bias: 0.0e+00 }, # avg = 0.00116514 + { scale: 1.0004872e+05 bias: 0.0e+00 }, # avg = 9.99513e-06 + { scale: 4.8072070e+06 bias: 0.0e+00 } # avg = 2.08021e-07 + ] + } +} diff --git a/bamboo/unit_tests/prototext/jag_reader.prototext b/bamboo/unit_tests/prototext/jag_reader.prototext index 443809ca8e4..6c5dc722528 100644 --- a/bamboo/unit_tests/prototext/jag_reader.prototext +++ b/bamboo/unit_tests/prototext/jag_reader.prototext @@ -27,37 +27,6 @@ data_reader { disable_labels: true num_labels: 5 - - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 64 - raw_height: 64 - raw_num_channels: 4 - - normalizer { - disable: true - scale: false - subtract_mean: false - unit_variance: false - z_score: true - } - - subtractor { - disable: true - } - - cropper { - disable: true - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - } } reader { @@ -71,42 +40,11 @@ data_reader { index_list_per_model: false validation_percent: 0 - absolute_sample_count: 0 + absolute_sample_count: 0 percent_of_data_to_use: 0.005 disable_responses: true disable_labels: true num_labels: 5 - - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 64 - raw_height: 64 - raw_num_channels: 4 - - normalizer { - disable: true - scale: false - subtract_mean: false - unit_variance: false - z_score: true - } - - subtractor { - disable: true - } - - 
cropper { - disable: true - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - } } } From acb478b6c4b8741889eab8c5cfd932aead9fc820 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Thu, 13 Jun 2019 11:07:11 -0700 Subject: [PATCH 073/634] Fixing error when running model zoo scripts within model_zoo dir. (#1080) --- model_zoo/vision/alexnet.py | 4 ++-- model_zoo/vision/lenet.py | 2 +- model_zoo/vision/resnet.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/model_zoo/vision/alexnet.py b/model_zoo/vision/alexnet.py index 54fbdbafa5f..a9319ba9c29 100755 --- a/model_zoo/vision/alexnet.py +++ b/model_zoo/vision/alexnet.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 import argparse -from os.path import dirname, join +from os.path import abspath, dirname, join import google.protobuf.text_format as txtf import lbann import lbann.models @@ -8,7 +8,7 @@ import lbann.contrib.args # Default data reader -model_zoo_dir = dirname(dirname(__file__)) +model_zoo_dir = dirname(dirname(abspath(__file__))) data_reader_prototext = join(model_zoo_dir, 'data_readers', 'data_reader_imagenet.prototext') diff --git a/model_zoo/vision/lenet.py b/model_zoo/vision/lenet.py index 30eb060d798..fd90928819c 100755 --- a/model_zoo/vision/lenet.py +++ b/model_zoo/vision/lenet.py @@ -86,7 +86,7 @@ opt = lbann.SGD(learn_rate=0.01, momentum=0.9) # Load data reader from prototext -model_zoo_dir = os.path.dirname(os.path.dirname(__file__)) +model_zoo_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) data_reader_file = os.path.join(model_zoo_dir, 'data_readers', 'data_reader_mnist.prototext') diff --git a/model_zoo/vision/resnet.py b/model_zoo/vision/resnet.py index e8f21c89924..463cb019482 100755 --- a/model_zoo/vision/resnet.py +++ b/model_zoo/vision/resnet.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 import argparse -from os.path import dirname, join +from os.path import abspath, dirname, join import google.protobuf.text_format as txtf import lbann import lbann.models @@ -10,7 +10,7 @@ import lbann.contrib.models.wide_resnet # Default data reader -model_zoo_dir = dirname(dirname(__file__)) +model_zoo_dir = dirname(dirname(abspath(__file__))) data_reader_prototext = join(model_zoo_dir, 'data_readers', 'data_reader_imagenet.prototext') From ed1f3108526fc98147296439bdc1d39533b20ade Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 3 Jun 2019 15:04:49 -0700 Subject: [PATCH 074/634] Clean up tests --- bamboo/common_python/test_tools.py | 113 ++++++++++++++---- bamboo/common_python/tools.py | 50 +++----- bamboo/compiler_tests/build_script.sh | 7 +- .../compiler_tests/build_script_specific.sh | 20 ++-- bamboo/compiler_tests/test_compiler.py | 69 +++-------- bamboo/integration_tests/common_code.py | 2 +- bamboo/integration_tests/conftest.py | 8 +- .../test_integration_autoencoders.py | 10 +- .../test_integration_debug.py | 52 ++++---- .../test_integration_performance.py | 27 ++--- bamboo/unit_tests/conftest.py | 4 + .../test_unit_check_proto_models.py | 8 +- bamboo/unit_tests/test_unit_checkpoint.py | 11 +- bamboo/unit_tests/test_unit_layer_clamp.py | 8 +- .../unit_tests/test_unit_layer_covariance.py | 8 +- bamboo/unit_tests/test_unit_layer_elu.py | 8 +- bamboo/unit_tests/test_unit_layer_identity.py | 8 +- bamboo/unit_tests/test_unit_layer_l1_norm.py | 8 +- bamboo/unit_tests/test_unit_layer_l2_norm2.py | 9 +- .../unit_tests/test_unit_layer_leaky_relu.py | 8 +- .../unit_tests/test_unit_layer_log_sigmoid.py | 8 +- .../unit_tests/test_unit_layer_log_softmax.py | 8 +- 
.../test_unit_layer_mean_absolute_error.py | 8 +- bamboo/unit_tests/test_unit_layer_relu.py | 8 +- bamboo/unit_tests/test_unit_layer_selu.py | 8 +- bamboo/unit_tests/test_unit_layer_sigmoid.py | 8 +- bamboo/unit_tests/test_unit_layer_softmax.py | 8 +- bamboo/unit_tests/test_unit_layer_softplus.py | 8 +- bamboo/unit_tests/test_unit_layer_softsign.py | 12 +- .../test_unit_layer_squared_difference.py | 8 +- .../unit_tests/test_unit_layer_tessellate.py | 8 +- bamboo/unit_tests/test_unit_layer_variance.py | 8 +- bamboo/unit_tests/test_unit_lbann2_reload.py | 8 +- .../unit_tests/test_unit_lbann_invocation.py | 16 +-- .../unit_tests/test_unit_mnist_conv_graph.py | 8 +- .../test_unit_mnist_ridge_regression.py | 8 +- .../test_unit_mnist_softmax_classifier.py | 8 +- docs/continuous_integration.rst | 4 - 38 files changed, 236 insertions(+), 354 deletions(-) diff --git a/bamboo/common_python/test_tools.py b/bamboo/common_python/test_tools.py index 6cafbb39bd6..c787b9976c1 100644 --- a/bamboo/common_python/test_tools.py +++ b/bamboo/common_python/test_tools.py @@ -4,158 +4,204 @@ # This test isn't in a directory to be run from Bamboo # Run locally with python -m pytest -s +d = dict( + executable='exe', + num_nodes=20, + partition='pdebug', + time_limit=30, + num_processes=40, + dir_name='dir', + data_filedir_default='lscratchh/filedir', + data_reader_name='mnist', + data_reader_percent=0.10, + exit_after_setup=True, + mini_batch_size=15, + model_folder='models/folder', + model_name='lenet', + num_epochs=7, + optimizer_name='adagrad', + processes_per_model=10, + output_file_name='output_file', + error_file_name='error_file', + check_executable_existence=False) + + def test_command_catalyst(): - actual = tools.get_command(cluster='catalyst', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + actual = tools.get_command(cluster='catalyst', **d) + expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' assert actual == expected + def test_command_pascal(): - actual = tools.get_command(cluster='pascal', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, 
output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + actual = tools.get_command(cluster='pascal', **d) + expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --mpibind=off --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' assert actual == expected -def test_command_quartz(): - actual = tools.get_command(cluster='quartz', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --ntasks=40 exe --data_filedir=lscratchh/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' - assert actual == expected -def test_command_surface(): - actual = tools.get_command(cluster='surface', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) - expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' - assert actual == expected - def test_command_ray(): - actual = tools.get_command(cluster='ray', executable='exe', num_nodes=20, partition='pdebug', time_limit=30, num_processes=40, dir_name='dir', data_filedir_default='lscratchh/filedir', data_reader_name='mnist', data_reader_percent=0.10, exit_after_setup=True, mini_batch_size=15, model_folder='models/folder', model_name='lenet', num_epochs=7, optimizer_name='adagrad', processes_per_model=10, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) + actual = tools.get_command(cluster='ray', **d) expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R 
"span[ptile=2]" -W 30 mpirun -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' assert actual == expected # Test error cases ############################################################ + def test_blacklisted_substrings(): try: tools.get_command('ray', 'exe', partition=';', optimizer_path='--model=new_model', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid character(s): ; contains ; , --model=new_model contains --' assert actual == expected + def test_unsupported_cluster(): try: - tools.get_command('quartz', 'exe', check_executable_existence=False) + tools.get_command('q', 'exe', check_executable_existence=False) + assert False except Exception as e: actual = str(e) - expected = 'Unsupported Cluster: quartz' + expected = 'Unsupported Cluster: q' assert actual == expected + def test_bad_model_1(): try: tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_name='name', model_path='path', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected + def test_bad_model_2(): try: tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_path='path', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected + def test_bad_model_3(): try: tools.get_command('ray', 'exe', dir_name='dir', model_name='name', model_path='path', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_path is set but so is at least one of model folder and model_name' assert actual == expected + def test_bad_model_4(): try: tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_folder set but not model_name.' assert actual == expected + def test_bad_model_5(): try: tools.get_command('ray', 'exe', dir_name='dir', model_name='name', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: model_name set but not model_folder.' assert actual == expected + def test_bad_data_reader(): try: tools.get_command('catalyst', 'exe', dir_name='dir', data_reader_name='name', data_reader_path='path', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_path is set but so is data_reader_name , data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' 
assert actual == expected + def test_bad_optimizer(): try: tools.get_command('ray', 'exe', dir_name='dir', optimizer_name='name', optimizer_path='path', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: optimizer_path is set but so is optimizer_name' assert actual == expected + def test_bad_dir_name_1(): try: tools.get_command('ray', 'exe', dir_name='dir', check_executable_existence=False) + assert False except Exception as e: - actual = str(e) - expected = 'Invalid Usage: dir_name set but none of model_folder, model_name, data_reader_name, optimizer_name are.' + actual = str(e) + expected = 'Invalid Usage: dir_name set but none of model_folder, model_name, data_reader_name, optimizer_name are.' assert actual == expected + def test_bad_dir_name_2(): try: tools.get_command('ray', 'exe', model_folder='folder', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' assert actual == expected + def test_bad_dir_name_3(): try: tools.get_command('ray', 'exe', model_name='name', check_executable_existence=False) + assert False except Exception as e: - actual = str(e) - expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' + actual = str(e) + expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' assert actual == expected + def test_bad_dir_name_4(): try: tools.get_command('catalyst', 'exe', data_reader_name='name', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is. , data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected + def test_bad_dir_name_5(): try: tools.get_command('ray', 'exe', optimizer_name='name', check_executable_existence=False) + assert False except Exception as e: - actual = str(e) - expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' + actual = str(e) + expected = 'Invalid Usage: dir_name is not set but at least one of model_folder, model_name, data_reader_name, optimizer_name is.' 
assert actual == expected + def test_bad_data_filedir_1(): try: tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filedir_train_default='a', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_2(): try: tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filename_train_default='b', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' @@ -166,31 +212,38 @@ def test_bad_data_filedir_3(): try: tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filedir_test_default='c', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_4(): try: tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filename_test_default='d', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_5(): try: tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_train_default='e', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_6(): try: tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_train_default='f', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' @@ -200,46 +253,57 @@ def test_bad_data_filedir_6(): def test_bad_data_filedir_7(): try: tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_test_default='g', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_8(): try: tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_test_default='h', check_executable_existence=False) + 
assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_fildir_default set but so is at least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default]' assert actual == expected + def test_bad_data_filedir_9(): try: tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected + def test_bad_data_filedir_10(): try: tools.get_command('ray', 'exe', data_reader_path='path', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_reader_name or data_reader_path is set but not data_filedir_default. If a data reader is provided, the default filedir must be set. This allows for determining what the filedir should be on each cluster. Alternatively, some or all of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] can be set.' assert actual == expected + def test_bad_data_filedir_11(): try: tools.get_command('ray', 'exe', data_filedir_default='filedir', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: data_filedir_default set but neither data_reader_name or data_reader_path are.' - assert actual == expected + assert actual == expected + def test_bad_data_filedir_12(): try: tools.get_command('ray', 'exe', data_filedir_train_default='a', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' @@ -249,6 +313,7 @@ def test_bad_data_filedir_12(): def test_bad_data_filedir_13(): try: tools.get_command('ray', 'exe', data_filename_train_default='b', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' @@ -258,6 +323,7 @@ def test_bad_data_filedir_13(): def test_bad_data_filedir_14(): try: tools.get_command('ray', 'exe', data_filedir_test_default='c', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' 
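
A note on the pattern above, separate from the patch itself: each `assert False` added in these hunks guards against a vacuous pass. Without it, a `tools.get_command` call that unexpectedly succeeds would skip the `except` block, the message comparison would never run, and the test would pass while checking nothing. Below is a minimal sketch of the same guard written with `pytest.raises` (assuming the suite's usual `sys.path` setup so that `import tools` resolves; the test name is illustrative and not part of the patch):

    import pytest
    import tools  # bamboo/common_python/tools.py

    def test_bad_optimizer_raises():
        # pytest.raises fails the test automatically when no exception is
        # thrown, and `match` regex-searches the exception text, replacing
        # the manual try / assert False / except / compare sequence.
        with pytest.raises(Exception,
                           match='optimizer_path is set but so is optimizer_name'):
            tools.get_command('ray', 'exe', dir_name='dir',
                              optimizer_name='name', optimizer_path='path',
                              check_executable_existence=False)
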
@@ -267,6 +333,7 @@ def test_bad_data_filedir_14(): def test_bad_data_filedir_15(): try: tools.get_command('ray', 'exe', data_filename_test_default='e', check_executable_existence=False) + assert False except Exception as e: actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 044cf6add02..24c429b6bdd 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -57,7 +57,7 @@ def get_command(cluster, # Never give lbannusr an allocation for over 12 hours though. strict_time_limit = 60*6 # 6 hours. - if time_limit > strict_time_limit: + if (time_limit is None) or (time_limit > strict_time_limit): time_limit = strict_time_limit # Check executable existence @@ -65,7 +65,7 @@ def get_command(cluster, process_executable_existence(executable, skip_no_exe) # Determine scheduler - if cluster in ['catalyst', 'pascal', 'quartz', 'surface']: + if cluster in ['catalyst', 'pascal']: scheduler = 'slurm' elif cluster == 'ray': scheduler = 'lsf' @@ -90,8 +90,8 @@ def get_command(cluster, # maxnodes. option_num_nodes = ' --nodes=%d' % num_nodes if partition is not None: - # Surface does not have pdebug, so switch to pbatch - if (cluster in ['surface', 'pascal']) and \ + # If cluster doesn't have pdebug switch to pbatch. + if (cluster in ['pascal']) and \ (partition == 'pdebug'): partition = 'pbatch' # --partition => Request a specific partition for the resource @@ -249,27 +249,19 @@ def get_command(cluster, # Determine data file paths # If there is no regex match, then re.sub keeps the original string if data_filedir_default is not None: - if cluster in ['catalyst', 'pascal', 'surface']: + if cluster in ['catalyst', 'pascal',]: # option_data_filedir = data_filedir_default # lscratchh, presumably pass # No need to pass in a parameter - elif cluster == 'quartz': - option_data_filedir = ' --data_filedir=%s' % re.sub( - '[a-z]scratch[a-z]', 'lscratchh', data_filedir_default) elif cluster == 'ray': option_data_filedir = ' --data_filedir=%s' % re.sub( '[a-z]scratch[a-z]', 'gscratchr', data_filedir_default) elif None not in data_file_parameters: - if cluster in ['catalyst', 'pascal', 'surface']: + if cluster in ['catalyst', 'pascal']: # option_data_filedir_train = data_filedir_train_default # option_data_filename_train = data_filename_train_default # option_data_filedir_test = data_filedir_test_default # option_data_filename_train = data_filename_test_default pass # No need to pass in a parameter - elif cluster == 'quartz': - option_data_filedir_train = ' --data_filedir_train=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filedir_train_default) - option_data_filename_train = ' --data_filename_train=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filename_train_default) - option_data_filedir_test = ' --data_filedir_test=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filedir_test_default) - option_data_filename_train = ' --data_filename_test=%s' % re.sub('[a-z]scratch[a-z]', 'lscratchh', data_filename_test_default) elif cluster == 'ray': option_data_filedir_train = ' --data_filedir_train=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filedir_train_default) option_data_filename_train = ' --data_filename_train=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filename_train_default) @@ -303,7 +295,7 @@ def 
get_command(cluster, lbann_errors.append( ('data_filedir_default set but neither data_reader_name' ' or data_reader_path are.')) - elif filter(lambda x: x is not None, data_file_parameters) != []: + elif list(filter(lambda x: x is not None, data_file_parameters)) != []: # If the list of non-None data_file parameters is not empty lbann_errors.append( ('At least one of [data_filedir_train_default, data_filename' @@ -369,14 +361,12 @@ def get_spack_exes(default_dirname, cluster): exes = {} exes['clang4'] = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - exes['gcc4'] = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (default_dirname, cluster) exes['gcc7'] = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - exes['intel18'] = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) + exes['intel19'] = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) exes['clang4_debug'] = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) - exes['gcc4_debug'] = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_debug/build/model_zoo/lbann' % (default_dirname, cluster) exes['gcc7_debug'] = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) - exes['intel18_debug'] = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) + exes['intel19_debug'] = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) return exes @@ -388,34 +378,28 @@ def get_default_exes(default_dirname, cluster): exes['clang4'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) if not os.path.exists(exes['gcc7']): exes['gcc7'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if not os.path.exists(exes['intel18']): - exes['intel18'] = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['intel19']): + exes['intel19'] = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) if not os.path.exists(exes['clang4_debug']): exes['clang4_debug'] = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) if not os.path.exists(exes['gcc7_debug']): exes['gcc7_debug'] = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if not os.path.exists(exes['intel18_debug']): - exes['intel18_debug'] = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['intel19_debug']): + exes['intel19_debug'] = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) default_exes = {} default_exes['default'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if cluster in ['catalyst', 'quartz', 'pascal']: - # x86_cpu - catalyst, quartz + if cluster in ['catalyst', 'pascal']: + # x86_cpu - catalyst # x86_gpu_pascal - pascal default_exes['clang4'] = exes['clang4'] - default_exes['gcc4'] = exes['gcc4'] default_exes['gcc7'] = exes['gcc7'] - default_exes['intel18'] = exes['intel18'] + default_exes['intel19'] = exes['intel19'] default_exes['clang4_debug'] = exes['clang4_debug'] - default_exes['gcc4_debug'] = 
exes['gcc4_debug'] default_exes['gcc7_debug'] = exes['gcc7_debug'] - default_exes['intel18_debug'] = exes['intel18_debug'] - elif cluster in ['surface']: - # x86_gpu - surface - default_exes['gcc4'] = exes['gcc4'] - default_exes['gcc4_debug'] = exes['gcc4_debug'] + default_exes['intel19_debug'] = exes['intel19_debug'] print('default_exes={d}'.format(d=default_exes)) return default_exes diff --git a/bamboo/compiler_tests/build_script.sh b/bamboo/compiler_tests/build_script.sh index 07a19172f26..1ccf4efd12d 100755 --- a/bamboo/compiler_tests/build_script.sh +++ b/bamboo/compiler_tests/build_script.sh @@ -1,7 +1,4 @@ -CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') -if [ "${CLUSTER}" != 'surface' ]; then - source /usr/share/lmod/lmod/init/bash - source /etc/profile.d/00-modulepath.sh -fi +source /usr/share/lmod/lmod/init/bash +source /etc/profile.d/00-modulepath.sh LBANN_DIR=$(git rev-parse --show-toplevel) ${LBANN_DIR}/scripts/build_lbann_lc.sh --with-conduit diff --git a/bamboo/compiler_tests/build_script_specific.sh b/bamboo/compiler_tests/build_script_specific.sh index 975d58ac4a1..92925ceae1c 100755 --- a/bamboo/compiler_tests/build_script_specific.sh +++ b/bamboo/compiler_tests/build_script_specific.sh @@ -2,10 +2,8 @@ set -e CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') LBANN_DIR=$(git rev-parse --show-toplevel) DEBUG='' -if [ "${CLUSTER}" != 'surface' ]; then - source /usr/share/lmod/lmod/init/bash - source /etc/profile.d/00-modulepath.sh -fi +source /usr/share/lmod/lmod/init/bash +source /etc/profile.d/00-modulepath.sh while :; do case ${1} in @@ -37,17 +35,13 @@ if [ "${COMPILER}" == 'clang4' ]; then ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler clang ${DEBUG} --reconfigure --with-conduit fi -if [ "${COMPILER}" == 'intel18' ]; then - module load intel/18.0.0 - ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler intel ${DEBUG} --reconfigure --with-conduit -fi - -if [ "${COMPILER}" == 'gcc4' ]; then - module load gcc/4.9.3 - ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler gnu ${DEBUG} --reconfigure --with-conduit -fi if [ "${COMPILER}" == 'gcc7' ]; then module load gcc/7.1.0 ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler gnu ${DEBUG} --reconfigure --with-conduit fi + +if [ "${COMPILER}" == 'intel19' ]; then + module load intel/19.0.0 + ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler intel ${DEBUG} --reconfigure --with-conduit +fi diff --git a/bamboo/compiler_tests/test_compiler.py b/bamboo/compiler_tests/test_compiler.py index 5682d11f3af..8e08dbd7881 100644 --- a/bamboo/compiler_tests/test_compiler.py +++ b/bamboo/compiler_tests/test_compiler.py @@ -50,26 +50,6 @@ def test_compiler_clang4_debug(cluster, dirname): assert os.path.exists(path) -def test_compiler_gcc4_release(cluster, dirname): - try: - skeleton_gcc4(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc4', False) - path = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_rel/build/model_zoo/lbann' % (dirname, cluster) - assert os.path.exists(path) - - -def test_compiler_gcc4_debug(cluster, dirname): - try: - skeleton_gcc4(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc4', True) - path = '%s/bamboo/compiler_tests/builds/%s_gcc-4.9.3_debug/build/model_zoo/lbann' % (dirname, cluster) - assert os.path.exists(path) - - def test_compiler_gcc7_release(cluster, dirname): try: skeleton_gcc7(cluster, dirname, False) @@ -94,32 +74,32 @@ def test_compiler_gcc7_debug(cluster, dirname): assert 
os.path.exists(path) -def test_compiler_intel18_release(cluster, dirname): +def test_compiler_intel19_release(cluster, dirname): try: - skeleton_intel18(cluster, dirname, False) + skeleton_intel19(cluster, dirname, False) except AssertionError as e: print(e) - build_script(cluster, dirname, 'intel18', False) - path = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) + build_script(cluster, dirname, 'intel19', False) + path = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def test_compiler_intel18_debug(cluster, dirname): +def test_compiler_intel19_debug(cluster, dirname): try: - skeleton_intel18(cluster, dirname, True) + skeleton_intel19(cluster, dirname, True) except AssertionError as e: print(e) - build_script(cluster, dirname, 'intel18', True) - path = '%s/bamboo/compiler_tests/builds/%s_intel-18.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) + build_script(cluster, dirname, 'intel19', True) + path = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) def skeleton_clang4(cluster, dir_name, debug, should_log=False): - if cluster in ['catalyst', 'quartz']: + if cluster in ['catalyst']: spack_skeleton(dir_name, 'clang@4.0.0', 'mvapich2@2.2', debug, should_log) build_skeleton(dir_name, 'clang@4.0.0', debug, should_log) else: @@ -128,23 +108,8 @@ def skeleton_clang4(cluster, dir_name, debug, should_log=False): pytest.skip(e) -def skeleton_gcc4(cluster, dir_name, debug, should_log=False): - if cluster in ['quartz']: # Taking out 'catalyst' - mpi = 'mvapich2@2.2' - elif cluster in ['surface']: # Taking out 'pascal' - mpi = 'mvapich2@2.2+cuda' - elif cluster == 'ray': - mpi = 'spectrum-mpi@2018.04.27' - else: - e = 'skeleton_gcc4: Unsupported Cluster %s' % cluster - print('Skip - ' + e) - pytest.skip(e) - spack_skeleton(dir_name, 'gcc@4.9.3', mpi, debug, should_log) - build_skeleton(dir_name, 'gcc@4.9.3', debug, should_log) - - def skeleton_gcc7(cluster, dir_name, debug, should_log=False): - if cluster in ['catalyst', 'quartz']: + if cluster in ['catalyst']: spack_skeleton(dir_name, 'gcc@7.1.0', 'mvapich2@2.2', debug, should_log) build_skeleton(dir_name, 'gcc@7.1.0', debug, should_log) else: @@ -153,12 +118,12 @@ def skeleton_gcc7(cluster, dir_name, debug, should_log=False): pytest.skip(e) -def skeleton_intel18(cluster, dir_name, debug, should_log=False): - if cluster in ['quartz']: # Taking out 'catalyst' - spack_skeleton(dir_name, 'intel@18.0.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'intel@18.0.0', debug, should_log) +def skeleton_intel19(cluster, dir_name, debug, should_log=False): + if cluster in []: # ['catalyst']: + spack_skeleton(dir_name, 'intel@19.0.0', 'mvapich2@2.2', debug, should_log) + build_skeleton(dir_name, 'intel@19.0.0', debug, should_log) else: - e = 'skeleton_intel18: Unsupported Cluster %s' % cluster + e = 'skeleton_intel19: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) @@ -203,15 +168,13 @@ def build_skeleton(dir_name, compiler, debug, should_log): # For reference: # Commenting out for now. These additions to path name will likely return # one day, so I am not removing them entirely. 
- # x86_64 <=> catalyst, pascal, quartz, surface + # x86_64 <=> catalyst, pascal # ppc64le <=> ray #architecture = subprocess.check_output('uname -m'.split()).strip() #if cluster == 'ray': # architecture += '_gpu_cuda-9.2.64_cudnn-7.0' #elif cluster == 'pascal': # architecture += '_gpu_cuda-9.1.85_cudnn-7.1' - #elif cluster == 'surface': - # architecture += '_gpu' os.chdir('%s/bamboo/compiler_tests/builds/%s_%s_%s/build' % (dir_name, cluster, compiler, build_type)) command = 'make -j all > %s 2> %s' % (output_file_name, error_file_name) return_code = os.system(command) diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index 915289adedd..939de8295eb 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -26,7 +26,7 @@ def get_command(cluster, dir_name, model_folder, model_name, executable, error_file_name=error_file_name) elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']: if (model_name == 'lenet_mnist') and \ - (compiler_name in ['clang4', 'intel18']): + (compiler_name in ['clang4', 'intel19']): partition = 'pbatch' time_limit = 600 else: diff --git a/bamboo/integration_tests/conftest.py b/bamboo/integration_tests/conftest.py index 97d34bf9055..d71b3987918 100644 --- a/bamboo/integration_tests/conftest.py +++ b/bamboo/integration_tests/conftest.py @@ -13,12 +13,12 @@ def pytest_addoption(parser): parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. Default the current cluster') + parser.addoption('--debug_build', action='store_true', default=False, + help='--debug_build specifies that debug tests should be run, even without doing a --weekly build. Default False') parser.addoption('--dirname', action='store', default=default_dirname, help='--dirname= to specify the top-level directory. Default directory of build_lbann_lc executable') parser.addoption('--exes', action='store', default=default_exes, help='--exes={compiler_name: path}') - parser.addoption('--log', action='store', default=0, - help='--log=1 to keep trimmed accuracy files. Default (--log=0) removes files') parser.addoption('--run', action='store_true', default=False, help='--run specifies that a test normally ignored should be run. 
Default False') parser.addoption('--weekly', action='store_true', default=False, @@ -33,8 +33,8 @@ def cluster(request): @pytest.fixture -def debug(request): - return request.config.getoption('--debug') +def debug_build(request): + return request.config.getoption('--debug_build') @pytest.fixture diff --git a/bamboo/integration_tests/test_integration_autoencoders.py b/bamboo/integration_tests/test_integration_autoencoders.py index 5f021ce6f53..9e9e325a902 100644 --- a/bamboo/integration_tests/test_integration_autoencoders.py +++ b/bamboo/integration_tests/test_integration_autoencoders.py @@ -53,7 +53,7 @@ def run_tests(actual_objective_functions, model_name, dir_name, cluster, def skeleton_autoencoder_imagenet(cluster, dir_name, executables, compiler_name, weekly): - if cluster in ['surface', 'pascal']: + if cluster in ['pascal']: e = 'skeleton_autoencoder_imagenet: does not run on GPU' print('Skip - ' + e) pytest.skip(e) @@ -79,17 +79,13 @@ def test_integration_autoencoder_imagenet_clang4(cluster, dirname, exes, skeleton_autoencoder_imagenet(cluster, dirname, exes, 'clang4', weekly) -def test_integration_autoencoder_imagenet_gcc4(cluster, dirname, exes, weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc4', weekly) - - def test_integration_autoencoder_imagenet_gcc7(cluster, dirname, exes, weekly): skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc7', weekly) -def test_integration_autoencoder_imagenet_intel18(cluster, dirname, exes, +def test_integration_autoencoder_imagenet_intel19(cluster, dirname, exes, weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'intel18', weekly) + skeleton_autoencoder_imagenet(cluster, dirname, exes, 'intel19', weekly) # Run with python -m pytest -s test_integration_autoencoder.py -k 'test_integration_autoencoder_imagenet_exe' --exe= diff --git a/bamboo/integration_tests/test_integration_debug.py b/bamboo/integration_tests/test_integration_debug.py index c205dffb24c..be26995d0c4 100644 --- a/bamboo/integration_tests/test_integration_debug.py +++ b/bamboo/integration_tests/test_integration_debug.py @@ -6,10 +6,10 @@ def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, - debug, should_log=False): - # If weekly or debug are true, then run the test. - if (not weekly) and (not debug): - e = 'skeleton_mnist_debug: Not doing weekly or debug testing' + debug_build, should_log=False): + # If weekly or debug_build are true, then run the test. + if not (weekly or debug_build): + e = 'skeleton_mnist_debug: Not doing weekly or debug_build testing' print('Skip - ' + e) pytest.skip(e) if compiler_name not in executables: @@ -26,15 +26,15 @@ def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, data_reader_name='mnist', model_folder='models/' + model_name, model_name=model_name, num_epochs=5, optimizer_name='adagrad', output_file_name=output_file_name, error_file_name=error_file_name) - output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name) + output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) assert output_value == 0 def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, - debug, should_log=False): - # If weekly or debug are true, then run the test. - if (not weekly) and (not debug): - e = 'skeleton_cifar_debug: Not doing weekly or debug testing' + debug_build, should_log=False): + # If weekly or debug_build are true, then run the test. 
+ if not (weekly or debug_build): + e = 'skeleton_cifar_debug: Not doing weekly or debug_build testing' print('Skip - ' + e) pytest.skip(e) if cluster == 'ray': @@ -56,40 +56,32 @@ def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, data_reader_name='cifar10', data_reader_percent=0.01, model_folder='models/' + model_name, model_name='conv_' + model_name, num_epochs=5, optimizer_name='adagrad', output_file_name=output_file_name, error_file_name=error_file_name) - output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name) + output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) assert output_value == 0 -def test_integration_mnist_clang4_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug) +def test_integration_mnist_clang4_debug(cluster, dirname, exes, weekly, debug_build): + skeleton_mnist_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug_build) -def test_integration_cifar_clang4_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug) +def test_integration_cifar_clang4_debug(cluster, dirname, exes, weekly, debug_build): + skeleton_cifar_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug_build) -def test_integration_mnist_gcc4_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug) +def test_integration_mnist_gcc7_debug(cluster, dirname, exes, weekly, debug_build): + skeleton_mnist_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug_build) -def test_integration_cifar_gcc4_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug) +def test_integration_cifar_gcc7_debug(cluster, dirname, exes, weekly, debug_build): + skeleton_cifar_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug_build) -def test_integration_mnist_gcc7_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug) +def test_integration_mnist_intel19_debug(cluster, dirname, exes, weekly, debug_build): + skeleton_mnist_debug(cluster, dirname, exes, 'intel19_debug', weekly, debug_build) -def test_integration_cifar_gcc7_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug) - - -def test_integration_mnist_intel18_debug(cluster, dirname, exes, weekly, debug): - skeleton_mnist_debug(cluster, dirname, exes, 'intel18_debug', weekly, debug) - - -def test_integration_cifar_intel18_debug(cluster, dirname, exes, weekly, debug): - skeleton_cifar_debug(cluster, dirname, exes, 'intel18_debug', weekly, debug) +def test_integration_cifar_intel19_debug(cluster, dirname, exes, weekly, debug_build): + skeleton_cifar_debug(cluster, dirname, exes, 'intel19_debug', weekly, debug_build) # Run with python -m pytest -s test_integration_debug.py -k 'test_integration_mnist_exe' --exe= diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index 82413d93046..e9e74f7a80d 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -158,7 +158,7 @@ def skeleton_performance_full_alexnet(cluster, dir_name, executables, should_log = True output_file_name = 
'%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - if cluster in ['catalyst', 'surface']: + if cluster in ['catalyst']: command = 'salloc --nodes 128 %s/bamboo/integration_tests/%s.sh > %s 2> %s' % (dir_name, model_name, output_file_name, error_file_name) elif cluster in ['pascal', 'ray']: e = 'skeleton_performance_full_alexnet: Pascal, Ray are unsupported for skeleton_performance_full_alexnet' @@ -188,19 +188,6 @@ def test_integration_performance_full_alexnet_clang4(cluster, dirname, exes, run) -def test_integration_performance_lenet_mnist_gcc4(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'gcc4') - - -def test_integration_performance_alexnet_gcc4(cluster, dirname, exes, weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'gcc4', weekly) - - -def test_integration_performance_full_alexnet_gcc4(cluster, dirname, exes, - weekly, run): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc4', weekly, run) - - def test_integration_performance_lenet_mnist_gcc7(cluster, dirname, exes): skeleton_performance_lenet_mnist(cluster, dirname, exes, 'gcc7') @@ -214,18 +201,18 @@ def test_integration_performance_full_alexnet_gcc7(cluster, dirname, exes, skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc7', weekly, run) -def test_integration_performance_lenet_mnist_intel18(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'intel18') +def test_integration_performance_lenet_mnist_intel19(cluster, dirname, exes): + skeleton_performance_lenet_mnist(cluster, dirname, exes, 'intel19') -def test_integration_performance_alexnet_intel18(cluster, dirname, exes, +def test_integration_performance_alexnet_intel19(cluster, dirname, exes, weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'intel18', weekly) + skeleton_performance_alexnet(cluster, dirname, exes, 'intel19', weekly) -def test_integration_performance_full_alexnet_intel18(cluster, dirname, exes, +def test_integration_performance_full_alexnet_intel19(cluster, dirname, exes, weekly, run): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'intel18', weekly, + skeleton_performance_full_alexnet(cluster, dirname, exes, 'intel19', weekly, run) diff --git a/bamboo/unit_tests/conftest.py b/bamboo/unit_tests/conftest.py index eda975da95a..ef6b449b246 100644 --- a/bamboo/unit_tests/conftest.py +++ b/bamboo/unit_tests/conftest.py @@ -19,18 +19,22 @@ def pytest_addoption(parser): # For local testing only parser.addoption('--exe', action='store', help='--exe=') + @pytest.fixture def cluster(request): return request.config.getoption('--cluster') + @pytest.fixture def dirname(request): return request.config.getoption('--dirname') + @pytest.fixture def exes(request): return request.config.getoption('--exes') + @pytest.fixture def exe(request): return request.config.getoption('--exe') diff --git a/bamboo/unit_tests/test_unit_check_proto_models.py b/bamboo/unit_tests/test_unit_check_proto_models.py index 353fca3143a..7b497f1143d 100644 --- a/bamboo/unit_tests/test_unit_check_proto_models.py +++ b/bamboo/unit_tests/test_unit_check_proto_models.py @@ -122,16 +122,12 @@ def test_unit_models_clang4(cluster, dirname, exes): skeleton_models(cluster, dirname, exes, 'clang4') -def test_unit_models_gcc4(cluster, dirname, exes): - skeleton_models(cluster, dirname, exes, 'gcc4') - - def 
test_unit_models_gcc7(cluster, dirname, exes): skeleton_models(cluster, exes, dirname, 'gcc7') -def test_unit_models_intel18(cluster, dirname, exes): - skeleton_models(cluster, dirname, exes, 'intel18') +def test_unit_models_intel19(cluster, dirname, exes): + skeleton_models(cluster, dirname, exes, 'intel19') # Run with python -m pytest -s test_unit_check_proto_models.py -k 'test_unit_models_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index 25ea6614e3b..4824c32d104 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -128,19 +128,14 @@ def test_unit_checkpoint_lenet_clang4(cluster, exes, dirname): skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'clang4') -def test_unit_checkpoint_lenet_gcc4(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc4') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc4') - - def test_unit_checkpoint_lenet_gcc7(cluster, exes, dirname): skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc7') skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc7') -def test_unit_checkpoint_lenet_intel18(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'intel18') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'intel18') +def test_unit_checkpoint_lenet_intel19(cluster, exes, dirname): + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'intel19') + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py b/bamboo/unit_tests/test_unit_layer_clamp.py index 8cd7d579374..67bb1be15de 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -27,16 +27,12 @@ def test_unit_layer_clamp_clang4(cluster, exes, dirname): skeleton_layer_clamp(cluster, exes, dirname, 'clang4') -def test_unit_layer_clamp_gcc4_check(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_clamp_gcc7(cluster, exes, dirname): skeleton_layer_clamp(cluster, exes, dirname, 'gcc7') -def test_unit_layer_clamp_intel18(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'intel18') +def test_unit_layer_clamp_intel19(cluster, exes, dirname): + skeleton_layer_clamp(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_layer_clamp.py -k 'test_unit_layer_clamp_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py index e72bca4fb51..b836208e68e 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -27,16 +27,12 @@ def test_unit_layer_covariance_clang4(cluster, exes, dirname): skeleton_layer_covariance(cluster, exes, dirname, 'clang4') -def test_unit_layer_covariance_gcc4_check(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_covariance_gcc7(cluster, exes, dirname): skeleton_layer_covariance(cluster, exes, dirname, 'gcc7') -def test_unit_layer_covariance_intel18(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'intel18') +def test_unit_layer_covariance_intel19(cluster, exes, dirname): + skeleton_layer_covariance(cluster, exes, dirname, 'intel19') 
# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_covariance_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index 66b10d1fc5b..f96fcafdd73 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -27,16 +27,12 @@ def test_unit_layer_elu_clang4(cluster, exes, dirname): skeleton_layer_elu(cluster, exes, dirname, 'clang4') -def test_unit_layer_elu_gcc4_check(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_elu_gcc7(cluster, exes, dirname): skeleton_layer_elu(cluster, exes, dirname, 'gcc7') -def test_unit_layer_elu_intel18(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'intel18') +def test_unit_layer_elu_intel19(cluster, exes, dirname): + skeleton_layer_elu(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_layer_elu.py -k 'test_unit_layer_elu_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py index 86568e946d5..72bc91af09e 100644 --- a/bamboo/unit_tests/test_unit_layer_identity.py +++ b/bamboo/unit_tests/test_unit_layer_identity.py @@ -27,16 +27,12 @@ def test_unit_layer_identity_clang4(cluster, exes, dirname): skeleton_layer_identity(cluster, exes, dirname, 'clang4') -def test_unit_layer_identity_gcc4_check(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_identity_gcc7(cluster, exes, dirname): skeleton_layer_identity(cluster, exes, dirname, 'gcc7') -def test_unit_layer_identity_intel18(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'intel18') +def test_unit_layer_identity_intel19(cluster, exes, dirname): + skeleton_layer_identity(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_layer_identity.py -k 'test_unit_layer_identity_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index 9abcc2652ce..adebb726417 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -27,16 +27,12 @@ def test_unit_layer_l1_norm_clang4(cluster, exes, dirname): skeleton_layer_l1_norm(cluster, exes, dirname, 'clang4') -def test_unit_layer_l1_norm_gcc4_check(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_l1_norm_gcc7(cluster, exes, dirname): skeleton_layer_l1_norm(cluster, exes, dirname, 'gcc7') -def test_unit_layer_l1_norm_intel18(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'intel18') +def test_unit_layer_l1_norm_intel19(cluster, exes, dirname): + skeleton_layer_l1_norm(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l1_norm_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py b/bamboo/unit_tests/test_unit_layer_l2_norm2.py index cdbad231498..9670b6ef7bc 100644 --- a/bamboo/unit_tests/test_unit_layer_l2_norm2.py +++ b/bamboo/unit_tests/test_unit_layer_l2_norm2.py @@ -26,17 +26,12 @@ def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name): def test_unit_layer_l2_norm2_clang4(cluster, exes, dirname): skeleton_layer_l2_norm2(cluster, exes, dirname, 'clang4') - -def test_unit_layer_l2_norm2_gcc4_check(cluster, exes, dirname): - skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc4') - - def 
test_unit_layer_l2_norm2_gcc7(cluster, exes, dirname): skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc7') -def test_unit_layer_l2_norm2_intel18(cluster, exes, dirname): - skeleton_layer_l2_norm2(cluster, exes, dirname, 'intel18') +def test_unit_layer_l2_norm2_intel19(cluster, exes, dirname): + skeleton_layer_l2_norm2(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l2_norm2_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index 6c90b34ce78..a1a9b020a2b 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -27,16 +27,12 @@ def test_unit_layer_leaky_relu_clang4(cluster, exes, dirname): skeleton_layer_leaky_relu(cluster, exes, dirname, 'clang4') -def test_unit_layer_leaky_relu_gcc4_check(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_leaky_relu_gcc7(cluster, exes, dirname): skeleton_layer_leaky_relu(cluster, exes, dirname, 'gcc7') -def test_unit_layer_leaky_relu_intel18(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'intel18') +def test_unit_layer_leaky_relu_intel19(cluster, exes, dirname): + skeleton_layer_leaky_relu(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_leaky_relu_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index 9a47d55754d..f4b4634cc6e 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -27,16 +27,12 @@ def test_unit_layer_log_sigmoid_clang4(cluster, exes, dirname): skeleton_layer_log_sigmoid(cluster, exes, dirname, 'clang4') -def test_unit_layer_log_sigmoid_gcc4_check(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_log_sigmoid_gcc7(cluster, exes, dirname): skeleton_layer_log_sigmoid(cluster, exes, dirname, 'gcc7') -def test_unit_layer_log_sigmoid_intel18(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'intel18') +def test_unit_layer_log_sigmoid_intel19(cluster, exes, dirname): + skeleton_layer_log_sigmoid(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_layer_log_sigmoid.py -k 'test_unit_layer_log_sigmoid_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 85a20790d31..0345180165f 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -27,16 +27,12 @@ def test_unit_layer_log_softmax_clang4(cluster, exes, dirname): skeleton_layer_log_softmax(cluster, exes, dirname, 'clang4') -def test_unit_layer_log_softmax_gcc4_check(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_log_softmax_gcc7(cluster, exes, dirname): skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc7') -def test_unit_layer_log_softmax_intel18(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'intel18') +def test_unit_layer_log_softmax_intel19(cluster, exes, dirname): + skeleton_layer_log_softmax(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_log_softmax_exe' --exe= diff --git 
a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index c21544ed295..0a623c48dcc 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -27,16 +27,12 @@ def test_unit_layer_mean_absolute_error_clang4(cluster, exes, dirname): skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'clang4') -def test_unit_layer_mean_absolute_error_gcc4_check(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_mean_absolute_error_gcc7(cluster, exes, dirname): skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'gcc7') -def test_unit_layer_mean_absolute_error_intel18(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'intel18') +def test_unit_layer_mean_absolute_error_intel19(cluster, exes, dirname): + skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_mean_absolute_error_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index c904cce301f..0e3da6ecbe9 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -27,16 +27,12 @@ def test_unit_layer_relu_clang4(cluster, exes, dirname): skeleton_layer_relu(cluster, exes, dirname, 'clang4') -def test_unit_layer_relu_gcc4_check(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_relu_gcc7(cluster, exes, dirname): skeleton_layer_relu(cluster, exes, dirname, 'gcc7') -def test_unit_layer_relu_intel18(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'intel18') +def test_unit_layer_relu_intel19(cluster, exes, dirname): + skeleton_layer_relu(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_layer_relu.py -k 'test_unit_layer_relu_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index b32f8c9eb71..eb044bf6d30 100644 --- a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -27,16 +27,12 @@ def test_unit_layer_selu_clang4(cluster, exes, dirname): skeleton_layer_selu(cluster, exes, dirname, 'clang4') -def test_unit_layer_selu_gcc4_check(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_selu_gcc7(cluster, exes, dirname): skeleton_layer_selu(cluster, exes, dirname, 'gcc7') -def test_unit_layer_selu_intel18(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'intel18') +def test_unit_layer_selu_intel19(cluster, exes, dirname): + skeleton_layer_selu(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_layer_selu.py -k 'test_unit_layer_selu_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index 268526b7644..fab0d235be8 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -27,16 +27,12 @@ def test_unit_layer_sigmoid_clang4(cluster, exes, dirname): skeleton_layer_sigmoid(cluster, exes, dirname, 'clang4') -def test_unit_layer_sigmoid_gcc4_check(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_sigmoid_gcc7(cluster, exes, dirname): 
    skeleton_layer_sigmoid(cluster, exes, dirname, 'gcc7')
 
 
-def test_unit_layer_sigmoid_intel18(cluster, exes, dirname):
-    skeleton_layer_sigmoid(cluster, exes, dirname, 'intel18')
+def test_unit_layer_sigmoid_intel19(cluster, exes, dirname):
+    skeleton_layer_sigmoid(cluster, exes, dirname, 'intel19')
 
 
 # Run with python -m pytest -s test_unit_layer_sigmoid.py -k 'test_unit_layer_sigmoid_exe' --exe=
diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py
index dd4c3add193..44f78a154a4 100644
--- a/bamboo/unit_tests/test_unit_layer_softmax.py
+++ b/bamboo/unit_tests/test_unit_layer_softmax.py
@@ -27,16 +27,12 @@ def test_unit_layer_softmax_clang4(cluster, exes, dirname):
     skeleton_layer_softmax(cluster, exes, dirname, 'clang4')
 
 
-def test_unit_layer_softmax_gcc4_check(cluster, exes, dirname):
-    skeleton_layer_softmax(cluster, exes, dirname, 'gcc4')
-
-
 def test_unit_layer_softmax_gcc7(cluster, exes, dirname):
     skeleton_layer_softmax(cluster, exes, dirname, 'gcc7')
 
 
-def test_unit_layer_softmax_intel18(cluster, exes, dirname):
-    skeleton_layer_softmax(cluster, exes, dirname, 'intel18')
+def test_unit_layer_softmax_intel19(cluster, exes, dirname):
+    skeleton_layer_softmax(cluster, exes, dirname, 'intel19')
 
 
 # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_softmax_exe' --exe=
diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py
index 0c017c6f93e..5f0d013df9d 100644
--- a/bamboo/unit_tests/test_unit_layer_softplus.py
+++ b/bamboo/unit_tests/test_unit_layer_softplus.py
@@ -27,16 +27,12 @@ def test_unit_layer_softplus_clang4(cluster, exes, dirname):
     skeleton_layer_softplus(cluster, exes, dirname, 'clang4')
 
 
-def test_unit_layer_softplus_gcc4_check(cluster, exes, dirname):
-    skeleton_layer_softplus(cluster, exes, dirname, 'gcc4')
-
-
 def test_unit_layer_softplus_gcc7(cluster, exes, dirname):
     skeleton_layer_softplus(cluster, exes, dirname, 'gcc7')
 
 
-def test_unit_layer_softplus_intel18(cluster, exes, dirname):
-    skeleton_layer_softplus(cluster, exes, dirname, 'intel18')
+def test_unit_layer_softplus_intel19(cluster, exes, dirname):
+    skeleton_layer_softplus(cluster, exes, dirname, 'intel19')
 
 
 # Run with python -m pytest -s test_unit_layer_softplus.py -k 'test_unit_layer_softplus_exe' --exe=
diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py
index a7bed251425..c0bea317b76 100644
--- a/bamboo/unit_tests/test_unit_layer_softsign.py
+++ b/bamboo/unit_tests/test_unit_layer_softsign.py
@@ -27,16 +27,12 @@ def test_unit_layer_softsign_clang4(cluster, exes, dirname):
     skeleton_layer_softsign(cluster, exes, dirname, 'clang4')
 
 
-def test_unit_layer_softsign_gcc4_check(cluster, exes, dirname):
-    skeleton_layer_softsign(cluster, exes, dirname, 'gcc4')
-
-
 def test_unit_layer_softsign_gcc7(cluster, exes, dirname):
     skeleton_layer_softsign(cluster, exes, dirname, 'gcc7')
 
 
-def test_unit_layer_softsign_intel18(cluster, exes, dirname):
-    skeleton_layer_softsign(cluster, exes, dirname, 'intel18')
+def test_unit_layer_softsign_intel19(cluster, exes, dirname):
+    skeleton_layer_softsign(cluster, exes, dirname, 'intel19')
 
 
 # Run with python -m pytest -s test_unit_layer_softsign.py -k 'test_unit_layer_softsign_exe' --exe=
diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py 
b/bamboo/unit_tests/test_unit_layer_squared_difference.py index a05bbcc5082..2e9cc3d198f 100644 --- a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -27,16 +27,12 @@ def test_unit_layer_squared_difference_clang4(cluster, exes, dirname): skeleton_layer_squared_difference(cluster, exes, dirname, 'clang4') -def test_unit_layer_squared_difference_gcc4_check(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_squared_difference_gcc7(cluster, exes, dirname): skeleton_layer_squared_difference(cluster, exes, dirname, 'gcc7') -def test_unit_layer_squared_difference_intel18(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'intel18') +def test_unit_layer_squared_difference_intel19(cluster, exes, dirname): + skeleton_layer_squared_difference(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_layer_squared_difference.py -k 'test_unit_layer_squared_difference_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index 575bd894f89..7619ee3b5e3 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -27,16 +27,12 @@ def test_unit_layer_tessellate_clang4(cluster, exes, dirname): skeleton_layer_tessellate(cluster, exes, dirname, 'clang4') -def test_unit_layer_tessellate_gcc4_check(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_tessellate_gcc7(cluster, exes, dirname): skeleton_layer_tessellate(cluster, exes, dirname, 'gcc7') -def test_unit_layer_tessellate_intel18(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'intel18') +def test_unit_layer_tessellate_intel19(cluster, exes, dirname): + skeleton_layer_tessellate(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_layer_tessellate.py -k 'test_unit_layer_tessellate_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index 0db001567d5..e8422c3a70e 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -27,16 +27,12 @@ def test_unit_layer_variance_clang4(cluster, exes, dirname): skeleton_layer_variance(cluster, exes, dirname, 'clang4') -def test_unit_layer_variance_gcc4_check(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'gcc4') - - def test_unit_layer_variance_gcc7(cluster, exes, dirname): skeleton_layer_variance(cluster, exes, dirname, 'gcc7') -def test_unit_layer_variance_intel18(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'intel18') +def test_unit_layer_variance_intel19(cluster, exes, dirname): + skeleton_layer_variance(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_variance_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index 4b8491e248f..d48b28873ce 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -124,18 +124,14 @@ def test_unit_lbann2_reload_clang4(cluster, exes, dirname): skeleton_lbann2_reload(cluster, exes, dirname, 'clang4') -def test_unit_lbann2_reload_gcc4(cluster, exes, dirname): - skeleton_lbann2_reload(cluster, exes, dirname, 
'gcc4') - - def test_unit_lbann2_reload_gcc7(cluster, exes, dirname): if cluster in ['catalyst', 'pascal']: # STILL ERRORS pytest.skip('FIXME') skeleton_lbann2_reload(cluster, exes, dirname, 'gcc7') -def test_unit_lbann2_reload_intel18(cluster, exes, dirname): - skeleton_lbann2_reload(cluster, exes, dirname, 'intel18') +def test_unit_lbann2_reload_intel19(cluster, exes, dirname): + skeleton_lbann2_reload(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_lbann2_reload.py -k 'test_unit_lbann2_reload_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_lbann_invocation.py b/bamboo/unit_tests/test_unit_lbann_invocation.py index a002db49be4..55299c26f9d 100644 --- a/bamboo/unit_tests/test_unit_lbann_invocation.py +++ b/bamboo/unit_tests/test_unit_lbann_invocation.py @@ -4,7 +4,7 @@ import os, sys def test_unit_no_params_bad(cluster, exes): - exe = exes['gcc4'] + exe = exes['gcc7'] sys.stderr.write('TESTING: run lbann with no params; lbann should throw exception\n') command = tools.get_command( cluster=cluster, executable=exe, exit_after_setup=True) @@ -13,7 +13,7 @@ def test_unit_no_params_bad(cluster, exes): def test_unit_one_model_bad(cluster, exes): - exe = exes['gcc4'] + exe = exes['gcc7'] sys.stderr.write('TESTING: run lbann with no optimizer or reader; lbann should throw exception\n') model_path = 'prototext/model_mnist_simple_1.prototext' command = tools.get_command( @@ -24,7 +24,7 @@ def test_unit_one_model_bad(cluster, exes): def test_unit_two_models_bad(cluster, exes): - exe = exes['gcc4'] + exe = exes['gcc7'] sys.stderr.write('TESTING: run lbann with two models but no optimizer or reader; lbann should throw exception\n') model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' command = tools.get_command( @@ -35,7 +35,7 @@ def test_unit_two_models_bad(cluster, exes): def test_unit_two_models_bad2(cluster, exes): - exe = exes['gcc4'] + exe = exes['gcc7'] sys.stderr.write('TESTING: run lbann with two models with missing {; lbann should throw exception\n') model_path='prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' command = tools.get_command( @@ -46,7 +46,7 @@ def test_unit_two_models_bad2(cluster, exes): def test_unit_missing_optimizer(cluster, exes): - exe = exes['gcc4'] + exe = exes['gcc7'] sys.stderr.write('TESTING: run lbann with two models, reader, but no optimizer; lbann should throw exception\n') model_path='{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' data_reader_path='prototext/data_reader_mnist.prototext' @@ -59,7 +59,7 @@ def test_unit_missing_optimizer(cluster, exes): def test_unit_missing_reader(cluster, exes): - exe = exes['gcc4'] + exe = exes['gcc7'] sys.stderr.write('TESTING: run lbann with two models, reader, but no reader; lbann should throw exception\n') model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' optimizer_path = 'prototext/opt_sgd.prototext' @@ -71,7 +71,7 @@ def test_unit_missing_reader(cluster, exes): def test_unit_bad_params(cluster, exes): - exe = exes['gcc4'] + exe = exes['gcc7'] sys.stderr.write('TESTING: run lbann with ill-formed param (missing -) lbann should throw exception\n') (command_allocate, command_run, _, _) = tools.get_command(cluster=cluster, executable=exe, return_tuple=True) return_code = os.system('%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext 
--model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} --optimizer=prototext/opt_sgd.prototext' % (command_allocate, command_run, exe)) @@ -79,7 +79,7 @@ def test_unit_bad_params(cluster, exes): def test_unit_should_work(cluster, exes): - exe = exes['gcc4'] + exe = exes['gcc7'] sys.stderr.write('TESTING: run lbann with two models, reader, and optimizer; lbann should NOT throw exception\n') model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' data_reader_path = 'prototext/data_reader_mnist.prototext' diff --git a/bamboo/unit_tests/test_unit_mnist_conv_graph.py b/bamboo/unit_tests/test_unit_mnist_conv_graph.py index 65a7bd54ad0..829fffdff2d 100644 --- a/bamboo/unit_tests/test_unit_mnist_conv_graph.py +++ b/bamboo/unit_tests/test_unit_mnist_conv_graph.py @@ -34,16 +34,12 @@ def test_unit_mnist_conv_graph_clang4(cluster, exes, dirname): skeleton_mnist_conv_graph(cluster, exes, dirname, 'clang4') -def test_unit_mnist_conv_graph_gcc4(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc4') - - def test_unit_mnist_conv_graph_gcc7(cluster, exes, dirname): skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc7') -def test_unit_mnist_conv_graph_intel18(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'intel18') +def test_unit_mnist_conv_graph_intel19(cluster, exes, dirname): + skeleton_mnist_conv_graph(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_conv_graph.py -k 'test_unit_mnist_conv_graph_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py index 0d4d3994837..5b27b342cb4 100644 --- a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py +++ b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py @@ -28,16 +28,12 @@ def test_unit_mnist_ridge_regression_clang4(cluster, exes, dirname): skeleton_mnist_ridge_regression(cluster, exes, dirname, 'clang4') -def test_unit_mnist_ridge_regression_gcc4(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc4') - - def test_unit_mnist_ridge_regression_gcc7(cluster, exes, dirname): skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc7') -def test_unit_mnist_ridge_regression_intel18(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'intel18') +def test_unit_mnist_ridge_regression_intel19(cluster, exes, dirname): + skeleton_mnist_ridge_regression(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_mnist_ridge_regression_exe' --exe= diff --git a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py index 8718c0e5802..1c2c2353100 100644 --- a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py +++ b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py @@ -28,16 +28,12 @@ def test_unit_mnist_softmax_classifier_clang4(cluster, exes, dirname): skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'clang4') -def test_unit_mnist_softmax_classifier_gcc4(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc4') - - def test_unit_mnist_softmax_classifier_gcc7(cluster, exes, dirname): skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc7') -def test_unit_mnist_softmax_classifier_intel18(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'intel18') 
+def test_unit_mnist_softmax_classifier_intel19(cluster, exes, dirname): + skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'intel19') # Run with python -m pytest -s test_unit_softmax_classifier.py -k 'test_unit_mnist_softmax_classifier_exe' --exe= diff --git a/docs/continuous_integration.rst b/docs/continuous_integration.rst index 363b9aa8f9c..dc12100fedb 100644 --- a/docs/continuous_integration.rst +++ b/docs/continuous_integration.rst @@ -155,12 +155,8 @@ Bamboo agent properties are used to specify requirements for each job. +--------------------------------+-------------+--------------+----------+------------------+---------------------+ | Pascal Agents (x86_gpu_pascal) | lbannusr | x86_64 | pascal | pascal | chaos_6_x86_64_ib | +--------------------------------+-------------+--------------+----------+------------------+---------------------+ -| Quartz Agents (x86_cpu) | lbannusr | x86_64 | quartz | none | toss_3_x86_64_ib | -+--------------------------------+-------------+--------------+----------+------------------+---------------------+ | Ray Agents (ppc64le_gpu) | lbannusr | ppc64_le | ray | pascal | blueos_3_ppc64le_ib | +--------------------------------+-------------+--------------+----------+------------------+---------------------+ -| Surface Agents (x86_gpu) | lbannusr | x86_64 | surface | kepler | chaos_5_x86_64_ib | -+--------------------------------+-------------+--------------+----------+------------------+---------------------+ Currently, "agent_owner", "architecture", and "gpu_architecture" are used to determine agents to run a job. From 9d2ee76d08216290fbb8f7f687bd3f384937c5e3 Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Thu, 13 Jun 2019 18:20:11 -0700 Subject: [PATCH 075/634] Updated the JAG data reader and repack transform layer to use DataType format internally. The JAG data reader will now cast the images from ch_t to DataType during ingestion. 
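As a minimal illustration of the cast-on-ingestion pattern (assuming, for the sketch only, ch_t = double and DataType = float; the real types come from the JAG reader and LBANN's build configuration), constructing a std::vector<DataType> from a ch_t pointer range converts each element on the way in, so no explicit cast loop is needed:

#include <cstddef>
#include <vector>

using ch_t = double;     // stand-in for the conduit channel type (assumption)
using DataType = float;  // stand-in for LBANN's compute type (assumption)

// The iterator-range constructor converts element-wise from ch_t to
// DataType, mirroring image_ptrs.emplace_back(emi_data, emi_data + num_vals)
// in the patch below.
std::vector<DataType> ingest_image(const ch_t* emi_data, std::size_t num_vals) {
  return std::vector<DataType>(emi_data, emi_data + num_vals);
}
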
--- include/lbann/data_readers/data_reader_jag_conduit.hpp | 2 +- src/data_readers/data_reader_jag_conduit.cpp | 7 ++++--- src/transforms/repack_HWC_to_CHW_layout.cpp | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index 71b03a883de..c9f46c349db 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -343,7 +343,7 @@ class data_reader_jag_conduit : public generic_data_reader { bool has_conduit_path(const size_t i, const std::string& key) const; /// Obtain image data - std::vector< std::vector > get_image_data(const size_t i, conduit::Node& sample) const; + std::vector< std::vector > get_image_data(const size_t i, conduit::Node& sample) const; bool data_store_active() const { bool flag = generic_data_reader::data_store_active(); diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index a1942a138b5..3e91bcdc247 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -1184,9 +1184,9 @@ bool data_reader_jag_conduit::check_non_numeric(const std::string key) { } -std::vector< std::vector > +std::vector< std::vector > data_reader_jag_conduit::get_image_data(const size_t sample_id, conduit::Node& sample) const { - std::vector< std::vector > image_ptrs; + std::vector< std::vector > image_ptrs; image_ptrs.reserve(m_emi_image_keys.size()); for (const auto& emi_tag : m_emi_image_keys) { @@ -1208,6 +1208,7 @@ data_reader_jag_conduit::get_image_data(const size_t sample_id, conduit::Node& s conduit_ch_t emi = sample[conduit_obj].value(); const size_t num_vals = emi.number_of_elements(); const ch_t* emi_data = sample[conduit_obj].value(); + // Note that data will be cast from ch_t to DataType format image_ptrs.emplace_back(emi_data, emi_data + num_vals); } @@ -1325,7 +1326,7 @@ bool data_reader_jag_conduit::fetch(CPUMat& X, int data_id, conduit::Node& sampl const size_t image_size = get_linearized_image_size(); const std::vector sizes(num_images, image_size); std::vector X_v = create_datum_views(X, sizes, mb_idx); - std::vector< std::vector > img_data(get_image_data(data_id, sample)); + std::vector< std::vector > img_data(get_image_data(data_id, sample)); if (img_data.size() != num_images) { _THROW_LBANN_EXCEPTION2_(_CN_, "fetch() : the number of images is not as expected ", \ diff --git a/src/transforms/repack_HWC_to_CHW_layout.cpp b/src/transforms/repack_HWC_to_CHW_layout.cpp index 113f20076c6..6eeabe9d8bd 100644 --- a/src/transforms/repack_HWC_to_CHW_layout.cpp +++ b/src/transforms/repack_HWC_to_CHW_layout.cpp @@ -38,7 +38,7 @@ void repack_HWC_to_CHW_layout::apply(utils::type_erased_matrix& data, std::vecto void repack_HWC_to_CHW_layout::apply(utils::type_erased_matrix& data, CPUMat& out, std::vector& dims) { - CPUMat &src = data.template get(); + CPUMat &src = data.template get(); if (!src.Contiguous()) { LBANN_ERROR("RepackHWCtoCHWLayout does not support non-contiguous src."); } From 2fb3fa304fe9c344a41c9c51119d191b107d3643 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Fri, 14 Jun 2019 10:36:10 -0700 Subject: [PATCH 076/634] ongoing development --- .../lbann/data_store/data_store_conduit.hpp | 23 +- src/data_store/data_store_conduit.cpp | 212 +++++++++++------- 2 files changed, 139 insertions(+), 96 deletions(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 2382a41b13e..7dfec6c4429 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -294,27 +294,24 @@ protected : bool m_have_sample_sizes; /// fills in m_image_offsets; returns the segment size (which is the - /// sum of the file sizes) - int get_image_offsets(); + /// sum of the file sizes). Currently only used for imagenet + void get_image_sizes(std::unordered_map &sizes, std::vector> &indices); /// offset at which the raw image will be stored in a shared memory segment - std::vector m_image_offsets; + std::unordered_map m_image_offsets; + void compute_image_offsets(std::unordered_map &sizes, std::vector> &indices); - void allocate_shared_segment(int size); + void allocate_shared_segment(std::unordered_map &sizes, std::vector> &indices); - std::string m_image_base_dir; - std::vector m_my_files; - std::vector m_my_sizes; + void read_files(std::vector &work, std::unordered_map &sizes, std::vector &indices); - void load_files(); + void build_conduit_nodes(std::unordered_map &sizes); - void *m_mem_seg = 0; + void exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices); - //m_loaded_images[j] = true if the j'th image has been loaded - bool *m_loaded_images; + void fillin_shared_images(const std::vector &images, const std::unordered_map &image_sizes, const std::vector &indices); - std::vector m_image_filenames; - std::vector m_labels; + void *m_mem_seg = 0; }; } // namespace lbann diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 6a032304813..d7c5946acfb 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -28,6 +28,7 @@ #include "lbann/data_store/data_store_conduit.hpp" #include "lbann/data_readers/data_reader_jag_conduit.hpp" +#include "lbann/data_readers/data_reader_image.hpp" #include "lbann/utils/exception.hpp" #include "lbann/utils/options.hpp" #include "lbann/utils/timer.hpp" @@ -1014,98 +1015,80 @@ void data_store_conduit::set_preload() { m_preload = true; } -int data_store_conduit::get_image_offsets() { - int segment_length = 0; - options *opts = options::get(); +void data_store_conduit::get_image_sizes(std::unordered_map &file_sizes, std::vector> &indices) { /// this block fires if image sizes have been precomputed - if (opts->has_string("image_sizes_filename")) { + if (options::get()->has_string("image_sizes_filename")) { LBANN_ERROR("not yet implemented"); + //TODO dah - implement, if this becomes a bottleneck (but I don't think it will) } else { // get list of image file names - const std::string image_list_file = m_reader->get_data_filename(); - m_image_base_dir = m_reader->get_file_dir(); - FILE *fplist = fopen(image_list_file.c_str(), "rt"); - int imagelabel; - while (!feof(fplist)) { - char imagepath[512]; - if (fscanf(fplist, "%s%d", imagepath, &imagelabel) <= 1) { - break; - } - m_image_filenames.emplace_back(imagepath); - m_labels.emplace_back(imagelabel); + image_data_reader *image_reader = dynamic_cast(m_reader); + if (image_reader == nullptr) { + LBANN_ERROR("data_reader_image *image_reader = dynamic_cast(m_reader) 
failed"); } - fclose(fplist); + const std::vector &image_list = image_reader->get_image_list(); // get sizes of files for which I'm responsible - for (size_t h=m_rank_in_trainer; hget_file_dir() + '/' + m_image_filenames[h]; + std::vector my_image_sizes; + for (size_t h=m_rank_in_trainer; hget_file_dir() + '/' + image_list[h].first; std::ifstream in(fn.c_str()); if (!in) { LBANN_ERROR("failed to open " + fn + " for reading"); } in.seekg(0, std::ios::end); - m_my_sizes.push_back(in.tellg()); - m_my_files.push_back(h); + my_image_sizes.push_back(h); + my_image_sizes.push_back(in.tellg()); in.close(); } + int my_count = my_image_sizes.size(); - if (m_output) { - m_output << "my image sizes:\n"; - for (size_t k=0; k counts(m_np_in_trainer); m_comm->all_gather(&my_count, 1, counts.data(), 1, m_comm->get_trainer_comm()); - size_t g_count = std::accumulate(counts.begin(), counts.end(), 0); - if (g_count != m_image_filenames.size()) { - LBANN_ERROR("g_count != m_image_filenames.size()"); - } - std::vector work(m_image_filenames.size()); - std::vector disp(m_np_in_trainer); + + std::vector work(image_list.size()*2); + std::vector disp(m_np_in_trainer + 1); disp[0] = 0; - for (size_t h=0; htrainer_all_gather(m_my_sizes, work, counts, disp); - - // fill in m_image_offsets - m_image_offsets.resize(m_image_filenames.size()+1); - m_image_offsets[0] = 0; - for (int rank = 0; rank < m_np_in_trainer; rank++) { - size_t offset = disp[rank]; - size_t count = counts[rank]; - size_t i = rank; - for (size_t j=offset; jtrainer_all_gather(my_image_sizes, work, counts, disp); + indices.resize(m_np_in_trainer); + for (int h=0; h &sizes, std::vector> &indices) { + int offset = 0; + for (size_t p=0; p &sizes, std::vector> &indices) { + int size = 0; + for (auto &&t : sizes) { + size += t.second; + } + int node_id = m_comm->get_rank_in_node(); key_t key = ftok(",", 'x'); int shm_id; @@ -1118,10 +1101,6 @@ void data_store_conduit::allocate_shared_segment(int size) { if (*(int*)m_mem_seg == -1) { LBANN_ERROR("m_mem_seg == -1; call to shmat() failed"); } - m_loaded_images = (bool*)((char*)m_mem_seg + m_image_offsets.back()); - for (size_t j=0; jbarrier(m_comm->get_node_comm()); @@ -1139,27 +1118,94 @@ void data_store_conduit::allocate_shared_segment(int size) { } void data_store_conduit::preload_local_cache() { - int segment_size = get_image_offsets(); - allocate_shared_segment(segment_size); - load_files(); + std::unordered_map file_sizes; + std::vector> indices; + get_image_sizes(file_sizes, indices); + + //debug block; will go away + if (m_world_master) { + for (int h=0; h &idx = indices[h]; + std::cout << "P_"< work; + read_files(work, file_sizes, indices[m_rank_in_trainer]); + + allocate_shared_segment(file_sizes, indices); + compute_image_offsets(file_sizes, indices); + exchange_images(work, file_sizes, indices); + build_conduit_nodes(file_sizes); } -void data_store_conduit::load_files() { - for (auto j : m_my_files) { - const std::string fn = m_image_base_dir + '/' + m_image_filenames[j]; +void data_store_conduit::read_files(std::vector &work, std::unordered_map &sizes, std::vector &indices) { + int n = 0; + for (auto t : indices) { + n += sizes[t]; + } + work.resize(n); + + image_data_reader *image_reader = dynamic_cast(m_reader); + const std::vector &image_list = image_reader->get_image_list(); + int offset = 0; + for (auto h : indices) { + int s = sizes[h]; + const std::string fn = m_reader->get_file_dir() + '/' + image_list[h].first; std::ifstream in(fn, std::ios::in | std::ios::binary); - if (!in) { - 
LBANN_ERROR("failed to open " + fn + " for binary read"); - } - char *c = (char*)m_mem_seg + m_image_offsets[j]; - in.read(c, m_image_offsets[j+1] - m_image_offsets[j]); + in.read(work.data()+offset, s); in.close(); + offset += s; + } +} + +void data_store_conduit::build_conduit_nodes(std::unordered_map &sizes) { + image_data_reader *image_reader = dynamic_cast(m_reader); + const std::vector &image_list = image_reader->get_image_list(); + for (size_t idx=0; idx &images, const std::unordered_map &image_sizes, const std::vector &indices) { +} + +void data_store_conduit::exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices) { + std::vector work2; + int node_rank = m_comm->get_rank_in_node(); + for (int p=0; ptrainer_broadcast(p, work.data(), work.size()); + if (node_rank == 0) { + fillin_shared_images(work, image_sizes, indices[p]); + } + } else { + int sz = 0; + for (auto idx : indices[p]) { + sz += image_sizes[idx]; + } + work2.resize(sz); + m_comm->trainer_broadcast(p, work2.data(), work.size()); + if (node_rank == 0) { + fillin_shared_images(work2, image_sizes, indices[p]); + } + } + } +} + + } // namespace lbann From e4ab4ab0c4364f3dce1ee28c0a803a672fb135f6 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Fri, 14 Jun 2019 14:41:37 -0700 Subject: [PATCH 077/634] replacing POSIX for System V for shared memory --- .../lbann/data_store/data_store_conduit.hpp | 2 + src/data_store/data_store_conduit.cpp | 39 +++++++++++-------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 7dfec6c4429..0dc20c2d9fc 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -312,6 +312,8 @@ protected : void fillin_shared_images(const std::vector &images, const std::unordered_map &image_sizes, const std::vector &indices); void *m_mem_seg = 0; + + const std::string m_seg_name = "our_town"; }; } // namespace lbann diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index d7c5946acfb..7106b5e430b 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -33,8 +33,9 @@ #include "lbann/utils/options.hpp" #include "lbann/utils/timer.hpp" #include -#include -#include +#include +#include +#include namespace lbann { @@ -91,7 +92,7 @@ data_store_conduit::~data_store_conduit() { m_output.close(); } if (m_is_local_cache && m_mem_seg) { - shmdt(m_mem_seg); + shm_unlink(m_seg_name.c_str()); } } @@ -1090,29 +1091,35 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map &si } int node_id = m_comm->get_rank_in_node(); - key_t key = ftok(",", 'x'); - int shm_id; if (node_id == 0) { - shm_id = shmget(key, size, (IPC_CREAT | 0666)); - if (shm_id < 0) { - LBANN_ERROR("shm_id < 0; shmget() failed to create shared memory segment of " + std::to_string(size) + " bytes"); + int shm_fd = shm_open(m_seg_name.c_str(), O_CREAT | O_RDWR, 0666); + if (shm_fd == -1) { + LBANN_ERROR("shm_open failed"); } - m_mem_seg = shmat(shm_id, NULL, 0); + int v = ftruncate(shm_fd, size); + if (v != 0) { + LBANN_ERROR("ftruncate failed"); + } + m_mem_seg = mmap(0, size, PROT_WRITE, MAP_SHARED, shm_fd, 0); if (*(int*)m_mem_seg == -1) { - LBANN_ERROR("m_mem_seg == -1; call to shmat() failed"); + LBANN_ERROR("mmap failed"); } - } + } m_comm->barrier(m_comm->get_node_comm()); if (node_id != 0) { - shm_id = shmget(key, size, 0666); - if (shm_id < 0) { 
- LBANN_ERROR("shm_id < 0; shmget() failed to create shared memory segment of " + std::to_string(size) + " bytes"); + int shm_fd = shm_open(m_seg_name.c_str(), O_RDONLY, 0666); + if (shm_fd == -1) { + LBANN_ERROR("shm_open failed"); + } + int v = ftruncate(shm_fd, size); + if (v != 0) { + LBANN_ERROR("ftruncate failed"); } - m_mem_seg = shmat(shm_id, NULL, 0); + m_mem_seg = mmap(0, size, PROT_READ, MAP_SHARED, shm_fd, 0); if (*(int*)m_mem_seg == -1) { - LBANN_ERROR("m_mem_seg == -1; call to shmat() failed"); + LBANN_ERROR("mmap failed"); } } } From 523f7dd950d459b5bd2101be0ed5a55c6034837e Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Tue, 30 Apr 2019 20:22:57 -0700 Subject: [PATCH 078/634] Update/clean up CIFAR-10 data reader. Now also supports CIFAR-100. Can fully use the new preprocessing pipeline. Now uses original data instead of custom repackaged data. --- .../data_readers/data_reader_cifar10.hpp | 23 +++- .../data_reader_cifar10.prototext | 28 ++-- src/data_readers/data_reader_cifar10.cpp | 122 +++++++++++------- 3 files changed, 114 insertions(+), 59 deletions(-) diff --git a/include/lbann/data_readers/data_reader_cifar10.hpp b/include/lbann/data_readers/data_reader_cifar10.hpp index 7c72975bf98..a0c7ae61257 100644 --- a/include/lbann/data_readers/data_reader_cifar10.hpp +++ b/include/lbann/data_readers/data_reader_cifar10.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_data_reader_cifar10 .hpp .cpp - generic_data_reader class for CIFAR10 dataset +// data_reader_cifar10 .hpp .cpp - Data reader for CIFAR-10/100 //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_DATA_READER_CIFAR10_HPP @@ -33,14 +33,23 @@ namespace lbann { +/** + * A data reader for the CIFAR-10/100 datasets. + * + * This requires the binary distributions of the datasets, which + * must retain their original filenames. + * CIFAR-10 vs -100 is inferred by the number of labels set. + * @note This does not store the coarse labels from CIFAR-100. + * + * See: + * https://www.cs.toronto.edu/~kriz/cifar.html + */ class cifar10_reader : public image_data_reader { public: - /// constructor cifar10_reader(bool shuffle = true); cifar10_reader(const cifar10_reader&) = default; cifar10_reader& operator=(const cifar10_reader&) = default; - /// destructor ~cifar10_reader() override; cifar10_reader* copy() const override { return new cifar10_reader(*this); } @@ -58,7 +67,13 @@ class cifar10_reader : public image_data_reader { bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; private: - std::vector > m_data; + /** + * Loaded image data. + * This will be stored in "OpenCV" format for ease of preprocessing. + */ + std::vector> m_images; + /** Loaded label information. 
*/ + std::vector m_labels; }; } // namespace lbann diff --git a/model_zoo/data_readers/data_reader_cifar10.prototext b/model_zoo/data_readers/data_reader_cifar10.prototext index 0984212cd2a..566e47d80b1 100644 --- a/model_zoo/data_readers/data_reader_cifar10.prototext +++ b/model_zoo/data_readers/data_reader_cifar10.prototext @@ -3,14 +3,20 @@ data_reader { name: "cifar10" role: "train" shuffle: true - data_filename: "/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin" - label_filename: "/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin" + data_filedir: "/p/lscratchh/brainusr/datasets/cifar10-bin/" validation_percent: 0.1 absolute_sample_count: 0 percent_of_data_to_use: 1.0 + transforms { - scale { - scale: 0.003921568627 # 1/255 + horizontal_flip { + p: 0.5 + } + } + transforms { + normalize_to_lbann_layout { + means: "0.44653 0.48216 0.4914" + stddevs: "0.26159 0.24349 0.24703" } } } @@ -18,13 +24,19 @@ data_reader { name: "cifar10" role: "test" shuffle: true - data_filename: "/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin" - label_filename: "/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin" + data_filedir: "/p/lscratchh/brainusr/datasets/cifar10-bin/" absolute_sample_count: 0 percent_of_data_to_use: 1.0 + + transforms { + horizontal_flip { + p: 0.5 + } + } transforms { - scale { - scale: 0.003921568627 # 1/255 + normalize_to_lbann_layout { + means: "0.44653 0.48216 0.4914" + stddevs: "0.26159 0.24349 0.24703" } } } diff --git a/src/data_readers/data_reader_cifar10.cpp b/src/data_readers/data_reader_cifar10.cpp index a9f8127b5d6..fd8535a7c79 100644 --- a/src/data_readers/data_reader_cifar10.cpp +++ b/src/data_readers/data_reader_cifar10.cpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_data_reader_cifar10 .hpp .cpp - generic_data_reader class for CIFAR10 dataset +// data_reader_cifar10 .hpp .cpp - Data reader for CIFAR-10/100 //////////////////////////////////////////////////////////////////////////////// #include "lbann/data_readers/data_reader_cifar10.hpp" @@ -46,68 +46,96 @@ void cifar10_reader::set_defaults() { } void cifar10_reader::load() { - //open data file - std::string image_dir = get_file_dir(); - std::string filename = get_data_filename(); - std::string path = image_dir + "/" + filename; - std::ifstream in(path, std::ios::binary); - if (!in.good()) { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: failed to open " + path + " for reading"); - } + // These are all specified by the CIFAR10/100 description. 
+ constexpr size_t num_channels = 3; + constexpr size_t channel_size = 32*32; + constexpr size_t image_size = num_channels*channel_size; + constexpr size_t cifar10_label_size = 1; + constexpr size_t cifar100_label_size = 2; - //get number of images, with error checking - int len = get_linearized_data_size() + 1; //should be 3073 - in.seekg(0, in.end); - std::streampos fs = in.tellg(); - in.seekg(0, in.beg); - if (fs % len != 0) { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: fs % len != 0; fs: " + std::to_string(fs) + " len: " + - std::to_string(len)); + if (m_num_labels != 10 && m_num_labels != 100) { + LBANN_ERROR("Unsupported number of labels for CIFAR10/100."); } - //reserve space for string images - int num_images = fs / len; - m_data.resize(num_images); - for (auto & h : m_data) { - h.resize(len); - } + const bool cifar100 = m_num_labels == 100; - //read in the images; each image is 1 byte, which is the - //label (0-9), and 3072 pixels - for (auto & h : m_data) { - in.read((char *)&(h[0]), len); + std::string path = get_file_dir(); + // These filenames are specified by the CIFAR-10/100 dataset description. + std::vector filenames; + size_t images_per_file = 10000; + if (this->get_role() == "train") { + if (cifar100) { + filenames = {"train.bin"}; + images_per_file = 50000; + } else { + filenames = { + "data_batch_1.bin", + "data_batch_2.bin", + "data_batch_3.bin", + "data_batch_4.bin", + "data_batch_5.bin" + }; + } + } else if (this->get_role() == "test") { + if (cifar100) { + filenames = {"test.bin"}; + } else { + filenames = {"test_batch.bin"}; + } + } else { + LBANN_ERROR("Unsupported training mode for CIFAR loading."); } - in.close(); - m_shuffled_indices.resize(m_data.size()); - for (size_t n = 0; n < m_data.size(); n++) { - m_shuffled_indices[n] = n; + for (const auto& filename : filenames) { + std::ifstream f(path + "/" + filename, + std::ios::in | std::ios::binary); + if (!f.good()) { + LBANN_ERROR("Could not open " + path + "/" + filename); + } + // Temporary buffer to hold an image. + std::vector buf(image_size + (cifar100 ? + cifar100_label_size : + cifar10_label_size), 0); + for (size_t i = 0; i < images_per_file; ++i) { + f.read(reinterpret_cast(buf.data()), buf.size()); + if (static_cast(f.gcount()) != buf.size()) { + LBANN_ERROR("Could not read from " + path + "/" + filename); + } + // CIFAR-10 has only one label; for CIFAR-100, the second byte is the + // fine label. + m_labels.push_back(buf[cifar100 ? 1 : 0]); + // Convert to OpenCV layout. + std::vector image(image_size); + for (size_t channel = 0; channel < num_channels; ++channel) { + const size_t src_start = channel*channel_size; + for (size_t j = 0; j < channel_size; ++j) { + image[j*num_channels + channel] = buf[src_start + j]; + } + } + m_images.push_back(std::move(image)); + } + f.close(); } + m_shuffled_indices.clear(); + m_shuffled_indices.resize(m_images.size()); + std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); select_subset_of_data(); } bool cifar10_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) { - for (size_t p = 1; p dims = { - static_cast(m_image_num_channels), - static_cast(m_image_height), - static_cast(m_image_width)}; - m_transform_pipeline.apply(pixel_col, dims); + // Copy to a matrix so we can do data augmentation. + // Sizes per CIFAR-10/100 dataset description. 
+ El::Matrix image(3*32*32, 1); + std::vector dims = {size_t(3), size_t(32), size_t(32)}; + std::copy_n(m_images[data_id].data(), 3*32*32, image.Buffer()); + auto X_v = X(El::IR(0, X.Height()), El::IR(mb_idx, mb_idx + 1)); + m_transform_pipeline.apply(image, X_v, dims); return true; } bool cifar10_reader::fetch_label(CPUMat& Y, int data_id, int mb_idx) { - auto label = (int)m_data[data_id][0]; - Y.Set(label, mb_idx, 1); + Y.Set(m_labels[data_id], mb_idx, 1); return true; } From c2bd02a4e9116bdcd5504890666350f27d92aeb1 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Wed, 15 May 2019 19:00:32 -0700 Subject: [PATCH 079/634] Add transforms for color jittering. Includes adjust brightness/contrast/saturation and a random color jitter. --- .../lbann/transforms/vision/CMakeLists.txt | 4 + .../transforms/vision/adjust_brightness.hpp | 62 ++++++++++ .../transforms/vision/adjust_contrast.hpp | 66 +++++++++++ .../transforms/vision/adjust_saturation.hpp | 67 +++++++++++ .../lbann/transforms/vision/color_jitter.hpp | 106 ++++++++++++++++++ src/proto/factories/transform_factory.cpp | 21 +++- src/proto/lbann.proto | 25 +++++ src/transforms/vision/CMakeLists.txt | 4 + src/transforms/vision/adjust_brightness.cpp | 49 ++++++++ src/transforms/vision/adjust_contrast.cpp | 84 ++++++++++++++ src/transforms/vision/adjust_saturation.cpp | 72 ++++++++++++ src/transforms/vision/color_jitter.cpp | 85 ++++++++++++++ 12 files changed, 644 insertions(+), 1 deletion(-) create mode 100644 include/lbann/transforms/vision/adjust_brightness.hpp create mode 100644 include/lbann/transforms/vision/adjust_contrast.hpp create mode 100644 include/lbann/transforms/vision/adjust_saturation.hpp create mode 100644 include/lbann/transforms/vision/color_jitter.hpp create mode 100644 src/transforms/vision/adjust_brightness.cpp create mode 100644 src/transforms/vision/adjust_contrast.cpp create mode 100644 src/transforms/vision/adjust_saturation.cpp create mode 100644 src/transforms/vision/color_jitter.cpp diff --git a/include/lbann/transforms/vision/CMakeLists.txt b/include/lbann/transforms/vision/CMakeLists.txt index fa6ee2aff49..4a22f176f0b 100644 --- a/include/lbann/transforms/vision/CMakeLists.txt +++ b/include/lbann/transforms/vision/CMakeLists.txt @@ -1,7 +1,11 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS + adjust_brightness.hpp + adjust_contrast.hpp + adjust_saturation.hpp center_crop.hpp colorize.hpp + color_jitter.hpp grayscale.hpp horizontal_flip.hpp normalize_to_lbann_layout.hpp diff --git a/include/lbann/transforms/vision/adjust_brightness.hpp b/include/lbann/transforms/vision/adjust_brightness.hpp new file mode 100644 index 00000000000..6482793f8a7 --- /dev/null +++ b/include/lbann/transforms/vision/adjust_brightness.hpp @@ -0,0 +1,62 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_ADJUST_BRIGHTNESS_HPP_INCLUDED +#define LBANN_TRANSFORMS_ADJUST_BRIGHTNESS_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** Adjust the brightness of an image. */ +class adjust_brightness : public transform { +public: + /** + * Adjust brightness with given factor. + * @param factor A non-negative factor. 0 gives a black image, 1 the original. + */ + adjust_brightness(float factor) : transform(), m_factor(factor) { + if (factor < 0.0f) { + LBANN_ERROR("Brightness factor must be non-negative."); + } + } + + transform* copy() const override { return new adjust_brightness(*this); } + + std::string get_type() const override { return "adjust_brightness"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Factor to adjust brightness by. */ + float m_factor; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_ADJUST_BRIGHTNESS_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/adjust_contrast.hpp b/include/lbann/transforms/vision/adjust_contrast.hpp new file mode 100644 index 00000000000..2e6e32f12c6 --- /dev/null +++ b/include/lbann/transforms/vision/adjust_contrast.hpp @@ -0,0 +1,66 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_ADJUST_CONTRAST_HPP_INCLUDED +#define LBANN_TRANSFORMS_ADJUST_CONTRAST_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** + * Adjust the contrast of an image. + * This operates similarly to the contrast control on a television. + */ +class adjust_contrast : public transform { +public: + /** + * Adjust contrast with given factor. + * @param factor A non-negative factor. 0 gives a solid grey image, + * 1 the original. 
+ */ + adjust_contrast(float factor) : transform(), m_factor(factor) { + if (factor < 0.0f) { + LBANN_ERROR("Contrast factor must be non-negative."); + } + } + + transform* copy() const override { return new adjust_contrast(*this); } + + std::string get_type() const override { return "adjust_contrast"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Factor to adjust contrast by. */ + float m_factor; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_ADJUST_CONTRAST_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/adjust_saturation.hpp b/include/lbann/transforms/vision/adjust_saturation.hpp new file mode 100644 index 00000000000..d1d6aff7692 --- /dev/null +++ b/include/lbann/transforms/vision/adjust_saturation.hpp @@ -0,0 +1,67 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_ADJUST_SATURATION_HPP_INCLUDED +#define LBANN_TRANSFORMS_ADJUST_SATURATION_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** + * Adjust the saturation of an image. + * This operates similarly to the controls on a color television + * (as opposed to a direct adjustment of saturation). + */ +class adjust_saturation : public transform { +public: + /** + * Adjust saturation with given factor. + * @param factor A non-negative factor. 0 gives a grayscale image, + * 1 the original. + */ + adjust_saturation(float factor) : transform(), m_factor(factor) { + if (factor < 0.0f) { + LBANN_ERROR("Saturation factor must be non-negative."); + } + } + + transform* copy() const override { return new adjust_saturation(*this); } + + std::string get_type() const override { return "adjust_saturation"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Factor to adjust saturation by. 
*/ + float m_factor; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_ADJUST_SATURATION_HPP_INCLUDED diff --git a/include/lbann/transforms/vision/color_jitter.hpp b/include/lbann/transforms/vision/color_jitter.hpp new file mode 100644 index 00000000000..c16d2eaba98 --- /dev/null +++ b/include/lbann/transforms/vision/color_jitter.hpp @@ -0,0 +1,106 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_COLOR_JITTER_HPP_INCLUDED +#define LBANN_TRANSFORMS_COLOR_JITTER_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** + * Randomly change brightness, contrast, and saturation. + * This randomly adjusts brightness, contrast, and saturation, in a random + * order. + */ +class color_jitter : public transform { +public: + /** + * Randomly adjust brightness, contrast, and saturation within given ranges. + * Set both min and max to 0 to disable that adjustment. + * @param min_brightness_factor Minimum brightness adjustment (>= 0). + * @param max_brightness_factor Maximum brightness adjustment. + * @param min_contrast_factor Minimum contrast adjustment (>= 0). + * @param max_contrast_factor Maximum contrast adjustment. + * @param min_saturation_factor Minimum saturation adjustment (>= 0). + * @param max_saturation_factor Maximum saturation adjustment. 
+ */ + color_jitter(float min_brightness_factor, float max_brightness_factor, + float min_contrast_factor, float max_contrast_factor, + float min_saturation_factor, float max_saturation_factor) : + transform(), + m_min_brightness_factor(min_brightness_factor), + m_max_brightness_factor(max_brightness_factor), + m_min_contrast_factor(min_contrast_factor), + m_max_contrast_factor(max_contrast_factor), + m_min_saturation_factor(min_saturation_factor), + m_max_saturation_factor(max_saturation_factor) { + if (min_brightness_factor < 0.0f || + max_brightness_factor < min_brightness_factor) { + LBANN_ERROR("Min/max brightness factors out of range: " + + std::to_string(min_brightness_factor) + " " + + std::to_string(max_brightness_factor)); + } + if (min_contrast_factor < 0.0f || + max_contrast_factor < min_contrast_factor) { + LBANN_ERROR("Min/max contrast factors out of range: " + + std::to_string(min_contrast_factor) + " " + + std::to_string(max_contrast_factor)); + } + if (min_saturation_factor < 0.0f || + max_saturation_factor < min_saturation_factor) { + LBANN_ERROR("Min/max saturation factors out of range: " + + std::to_string(min_saturation_factor) + " " + + std::to_string(max_saturation_factor)); + } + } + + transform* copy() const override { return new color_jitter(*this); } + + std::string get_type() const override { return "color_jitter"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Minimum brightness factor. */ + float m_min_brightness_factor; + /** Maximum brightness factor. */ + float m_max_brightness_factor; + /** Minimum contrast factor. */ + float m_min_contrast_factor; + /** Maximum contrast factor. */ + float m_max_contrast_factor; + /** Minimum saturation factor. */ + float m_min_saturation_factor; + /** Maximum saturation factor. 
*/ + float m_max_saturation_factor; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_COLOR_JITTER_HPP_INCLUDED diff --git a/src/proto/factories/transform_factory.cpp b/src/proto/factories/transform_factory.cpp index 7940e743545..1794abae8c8 100644 --- a/src/proto/factories/transform_factory.cpp +++ b/src/proto/factories/transform_factory.cpp @@ -28,9 +28,13 @@ #include "lbann/transforms/normalize.hpp" #include "lbann/transforms/sample_normalize.hpp" #include "lbann/transforms/scale.hpp" +#include "lbann/transforms/vision/adjust_brightness.hpp" +#include "lbann/transforms/vision/adjust_contrast.hpp" +#include "lbann/transforms/vision/adjust_saturation.hpp" #include "lbann/transforms/vision/center_crop.hpp" -#include "lbann/transforms/vision/grayscale.hpp" #include "lbann/transforms/vision/colorize.hpp" +#include "lbann/transforms/vision/color_jitter.hpp" +#include "lbann/transforms/vision/grayscale.hpp" #include "lbann/transforms/vision/horizontal_flip.hpp" #include "lbann/transforms/vision/normalize_to_lbann_layout.hpp" #include "lbann/transforms/vision/random_affine.hpp" @@ -114,6 +118,21 @@ std::unique_ptr construct_transform( } else if (trans.has_vertical_flip()) { return make_unique( trans.vertical_flip().p()); + } else if (trans.has_adjust_brightness()) { + return make_unique( + trans.adjust_brightness().factor()); + } else if (trans.has_adjust_contrast()) { + return make_unique( + trans.adjust_contrast().factor()); + } else if (trans.has_adjust_saturation()) { + return make_unique( + trans.adjust_saturation().factor()); + } else if (trans.has_color_jitter()) { + auto& pb_trans = trans.color_jitter(); + return make_unique( + pb_trans.min_brightness_factor(), pb_trans.max_brightness_factor(), + pb_trans.min_contrast_factor(), pb_trans.max_contrast_factor(), + pb_trans.min_saturation_factor(), pb_trans.max_saturation_factor()); } LBANN_ERROR("Unknown transform"); diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index ecdff306ef8..4f4672e3727 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -101,6 +101,18 @@ message Transform { } // Transforms that apply to images. + // Adjust the brightness of an image. + message AdjustBrightness { + float factor = 1; + } + // Adjust the contrast of an image. + message AdjustContrast { + float factor = 1; + } + // Adjust the saturation of an image. + message AdjustSaturation { + float factor = 1; + } // Crop of size height x width from the center. message CenterCrop { uint64 height = 1; @@ -108,6 +120,15 @@ message Transform { } // Convert to color. message Colorize {} + // Randomly jitter brightness/contrast/saturation. + message ColorJitter { + float min_brightness_factor = 1; + float max_brightness_factor = 2; + float min_contrast_factor = 3; + float max_contrast_factor = 4; + float min_saturation_factor = 5; + float max_saturation_factor = 6; + } // Convert to grayscale. message Grayscale {} // Horizontal flip with probability p. 
@@ -190,6 +211,10 @@ message Transform { ResizedCenterCrop resized_center_crop = 110; ToLBANNLayout to_lbann_layout = 111; VerticalFlip vertical_flip = 112; + AdjustBrightness adjust_brightness = 113; + AdjustContrast adjust_contrast = 114; + AdjustSaturation adjust_saturation = 115; + ColorJitter color_jitter = 116; } } diff --git a/src/transforms/vision/CMakeLists.txt b/src/transforms/vision/CMakeLists.txt index b2f0781eaa1..c986ba1e149 100644 --- a/src/transforms/vision/CMakeLists.txt +++ b/src/transforms/vision/CMakeLists.txt @@ -1,7 +1,11 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES + adjust_brightness.cpp + adjust_contrast.cpp + adjust_saturation.cpp center_crop.cpp colorize.cpp + color_jitter.cpp grayscale.cpp horizontal_flip.cpp normalize_to_lbann_layout.cpp diff --git a/src/transforms/vision/adjust_brightness.cpp b/src/transforms/vision/adjust_brightness.cpp new file mode 100644 index 00000000000..ce8d0b98c85 --- /dev/null +++ b/src/transforms/vision/adjust_brightness.cpp @@ -0,0 +1,49 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/transforms/vision/adjust_brightness.hpp" +#include "lbann/utils/opencv.hpp" + +namespace lbann { +namespace transform { + +void adjust_brightness::apply(utils::type_erased_matrix& data, std::vector& dims) { + // Adjusting the brightness is simply scaling by a constant value + // taking care to saturate. + cv::Mat src = utils::get_opencv_mat(data, dims); + if (!src.isContinuous()) { + // This should not occur, but just in case. + LBANN_ERROR("Do not support non-contiguous OpenCV matrices."); + } + uint8_t* __restrict__ src_buf = src.ptr(); + const size_t size = utils::get_linearized_size(dims); + for (size_t i = 0; i < size; ++i) { + src_buf[i] = cv::saturate_cast(src_buf[i]*m_factor); + } +} + +} // namespace transform +} // namespace lbann diff --git a/src/transforms/vision/adjust_contrast.cpp b/src/transforms/vision/adjust_contrast.cpp new file mode 100644 index 00000000000..5af3fce227b --- /dev/null +++ b/src/transforms/vision/adjust_contrast.cpp @@ -0,0 +1,84 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. 
+// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "lbann/transforms/vision/adjust_contrast.hpp" +#include "lbann/utils/opencv.hpp" + +namespace lbann { +namespace transform { + +void adjust_contrast::apply(utils::type_erased_matrix& data, std::vector& dims) { + // To adjust contrast, we essentially add the mean of the grayscale version + // of the image, scaled by (1 - m_factor) to each pixel. + cv::Mat src = utils::get_opencv_mat(data, dims); + if (!src.isContinuous()) { + // This should not occur, but just in case. + LBANN_ERROR("Do not support non-contiguous OpenCV matrices."); + } + // Get the grayscale version and compute its mean value. + // If need be, we could do this computation in-place by manually computing + // the grayscale value of each pixel. + uint8_t gray_mean = 0.0; + if (dims[0] == 1) { + // Already grayscale, just compute the mean. + uint64_t sum = 0; + const size_t size = utils::get_linearized_size(dims); + const uint8_t* __restrict__ gray_buf = src.ptr(); + for (size_t i = 0; i < size; ++i) { + sum += gray_buf[i]; + } + gray_mean = static_cast( + std::round(static_cast(sum) / static_cast(size))); + } else { + std::vector gray_dims = {1, dims[1], dims[2]}; + const size_t size = utils::get_linearized_size(gray_dims); + auto gray_real = El::Matrix(size, 1); + cv::Mat gray = utils::get_opencv_mat(gray_real, gray_dims); + cv::cvtColor(src, gray, cv::COLOR_BGR2GRAY); + const uint8_t* __restrict__ gray_buf = gray.ptr(); + // We sum integers, so accumulate into an integer. + // This should be large enough to avoid overflow, provided we have less than + // 2^56 pixels or so. + uint64_t sum = 0; + for (size_t i = 0; i < size; ++i) { + sum += gray_buf[i]; + } + gray_mean = static_cast( + std::round(static_cast(sum) / static_cast(size))); + } + // Mix the gray mean with the original image. + uint8_t* __restrict__ src_buf = src.ptr(); + const float one_minus_factor = 1.0f - m_factor; + const size_t size = utils::get_linearized_size(dims); + for (size_t i = 0; i < size; ++i) { + src_buf[i] = cv::saturate_cast( + src_buf[i]*m_factor + gray_mean*one_minus_factor); + } +} + +} // namespace transform +} // namespace lbann diff --git a/src/transforms/vision/adjust_saturation.cpp b/src/transforms/vision/adjust_saturation.cpp new file mode 100644 index 00000000000..e5224422059 --- /dev/null +++ b/src/transforms/vision/adjust_saturation.cpp @@ -0,0 +1,72 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) 
listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "lbann/transforms/vision/adjust_saturation.hpp" +#include "lbann/utils/opencv.hpp" + +namespace lbann { +namespace transform { + +void adjust_saturation::apply(utils::type_erased_matrix& data, std::vector& dims) { + // To adjust contrast, we essentially blend between the grayscale and + // original image based on the given factor. + cv::Mat src = utils::get_opencv_mat(data, dims); + if (!src.isContinuous()) { + // This should not occur, but just in case. + LBANN_ERROR("Do not support non-contiguous OpenCV matrices."); + } + if (dims[0] == 1) { + // Already grayscale, nothing to do. + } else { + // Handle RGB. + // Get the grayscaled image. + // If need be, we could do this computation in-place by manually computing + // the grayscale value of each pixel. + std::vector gray_dims = {1, dims[1], dims[2]}; + const size_t gray_size = utils::get_linearized_size(gray_dims); + auto gray_real = El::Matrix(gray_size, 1); + cv::Mat gray = utils::get_opencv_mat(gray_real, gray_dims); + cv::cvtColor(src, gray, cv::COLOR_BGR2GRAY); + const uint8_t* __restrict__ gray_buf = gray.ptr(); + // Mix the grayscale image with the original. + uint8_t* __restrict__ src_buf = src.ptr(); + const float one_minus_factor = 1.0f - m_factor; + for (size_t i = 0; i < gray_size; ++i) { + // Handle the three channels, in OpenCV format. + const size_t src_base = 3*i; + src_buf[src_base] = cv::saturate_cast( + src_buf[src_base]*m_factor + gray_buf[i]*one_minus_factor); + src_buf[src_base+1] = cv::saturate_cast( + src_buf[src_base+1]*m_factor + gray_buf[i]*one_minus_factor); + src_buf[src_base+2] = cv::saturate_cast( + src_buf[src_base+2]*m_factor + gray_buf[i]*one_minus_factor); + } + } +} + +} // namespace transform +} // namespace lbann diff --git a/src/transforms/vision/color_jitter.cpp b/src/transforms/vision/color_jitter.cpp new file mode 100644 index 00000000000..eff25df6f4b --- /dev/null +++ b/src/transforms/vision/color_jitter.cpp @@ -0,0 +1,85 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////
+
+#include <algorithm>
+#include "lbann/transforms/vision/color_jitter.hpp"
+#include "lbann/transforms/vision/adjust_brightness.hpp"
+#include "lbann/transforms/vision/adjust_contrast.hpp"
+#include "lbann/transforms/vision/adjust_saturation.hpp"
+#include "lbann/utils/random.hpp"
+#include "lbann/utils/opencv.hpp"
+
+namespace lbann {
+namespace transform {
+
+void color_jitter::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
+  fast_rng_gen& gen = get_fast_generator();
+  // Determine the order to apply transforms.
+  // Unused transforms will be skipped.
+  // 1 == brightness, 2 == contrast, 3 == saturation.
+  std::vector<int> transform_order = {1, 2, 3};
+  std::shuffle(transform_order.begin(), transform_order.end(), gen);
+  // Now apply the random adjustments.
+  for (const auto& t : transform_order) {
+    switch (t) {
+    case 1:
+      // Brightness.
+      if (!(m_min_brightness_factor == 0.0f &&
+            m_min_brightness_factor == m_max_brightness_factor)) {
+        std::uniform_real_distribution<float> dist(
+          m_min_brightness_factor, m_max_brightness_factor);
+        adjust_brightness trans = adjust_brightness(dist(gen));
+        trans.apply(data, dims);
+      }
+      break;
+    case 2:
+      // Contrast.
+      if (!(m_min_contrast_factor == 0.0f &&
+            m_min_contrast_factor == m_max_contrast_factor)) {
+        std::uniform_real_distribution<float> dist(
+          m_min_contrast_factor, m_max_contrast_factor);
+        adjust_contrast trans = adjust_contrast(dist(gen));
+        trans.apply(data, dims);
+      }
+      break;
+    case 3:
+      // Saturation.
+      if (!(m_min_saturation_factor == 0.0f &&
+            m_min_saturation_factor == m_max_saturation_factor)) {
+        std::uniform_real_distribution<float> dist(
+          m_min_saturation_factor, m_max_saturation_factor);
+        adjust_saturation trans = adjust_saturation(dist(gen));
+        trans.apply(data, dims);
+      }
+      break;
+    default:
+      LBANN_ERROR("Unexpected transform number");
+    }
+  }
+}
+
+} // namespace transform
+} // namespace lbann
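Taken together, color_jitter shuffles the order of the three adjustments and draws each factor uniformly from its [min, max] range, skipping any adjustment whose range is (0, 0). A standalone sketch of that control flow (std::mt19937 stands in for LBANN's fast_rng_gen, and the factor ranges are made-up values):

#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  std::mt19937 gen(42);  // stand-in for LBANN's fast_rng_gen
  // 1 == brightness, 2 == contrast, 3 == saturation, as in the patch.
  std::vector<int> order = {1, 2, 3};
  std::shuffle(order.begin(), order.end(), gen);
  // Hypothetical ranges; a (0, 0) range means "skip this adjustment".
  const float ranges[4][2] = {{0, 0}, {0.8f, 1.2f}, {0.5f, 1.5f}, {0, 0}};
  const char* names[4] = {"", "brightness", "contrast", "saturation"};
  for (int t : order) {
    const float lo = ranges[t][0], hi = ranges[t][1];
    if (lo == 0.0f && lo == hi) {
      continue;  // this adjustment is unused
    }
    std::uniform_real_distribution<float> dist(lo, hi);
    // The real code constructs the adjust_* transform with this factor
    // and applies it to the image in place.
    std::printf("apply %s with factor %.3f\n", names[t], dist(gen));
  }
  return 0;
}

Randomizing the order matters because the three adjustments do not commute: contrast, for instance, depends on the grayscale mean, which a prior brightness change shifts.

From 046fd3fd97dfcb8a1c35a251177ac80d22995f77 Mon Sep 17 00:00:00 2001
From: Nikoli Dryden
Date: Sat, 15 Jun 2019 15:02:22 -0700
Subject: [PATCH 080/634] Use real ImageNet validation set for validation instead of random subset.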
---
 model_zoo/data_readers/data_reader_imagenet.prototext | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/model_zoo/data_readers/data_reader_imagenet.prototext b/model_zoo/data_readers/data_reader_imagenet.prototext
index f021a7bc70d..652d213b3ae 100644
--- a/model_zoo/data_readers/data_reader_imagenet.prototext
+++ b/model_zoo/data_readers/data_reader_imagenet.prototext
@@ -6,7 +6,7 @@ data_reader {
     data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/"
     data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels/train.txt"
     label_filename: ""
-    validation_percent: 0.01
+    validation_percent: 0.0
     absolute_sample_count: 0
     percent_of_data_to_use: 1.0
     num_labels: 1000
@@ -35,7 +35,7 @@ data_reader {
 
   reader {
     name: "imagenet"
-    role: "test"
+    role: "validate"
     shuffle: true
     data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/"
     data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels/val.txt"

From 42df729ec0de8e8ac2e3eb50e1df1fb37188ff40 Mon Sep 17 00:00:00 2001
From: Jae-Seung Yeom
Date: Mon, 17 Jun 2019 04:22:14 -0700
Subject: [PATCH 081/634] Templated sample list (#919)

This PR splits the JAG-specific sample list into a set of classes that can be
generalized to other data types. Specifically, it removes both JAG- and
Conduit-specific details from the base class in the hierarchy. (A minimal
sketch of the resulting hierarchy follows the change list below.)

* The base class, sample_list, being the most generic, can be applied to
  imagenet and has no component referencing conduit. In fact, it does not
  manage any open file handle. It is intended for data sets that have a
  single sample per file.
* sample_list_open_files inherits the base class and is itself an abstract
  class. It is designed to handle data that requires tracking file handles
  during execution. One reason for this class is data sets that have multiple
  samples per file. It currently has two derived classes: sample_list_hdf5
  and sample_list_conduit_io_handle.
* sample_list_hdf5 implements a concrete class that can read JAG data from
  HDF5 files into the internal conduit format.
* sample_list_conduit_io_handle uses a more abstract interface to access the
  JAG data in an HDF5 file.

----

* change the file name of sample_list_jag.hpp and sample_list_jag_impl.hpp
  to sample_list.hpp and sample_list_impl.hpp
* class name change from sample_list_jag to sample_list
* convert sample_list class to use template parameters
* change the literal assignment to use the correct type of rvalue
* remove sample_file_id_t from the template parameter list
* add to_sample_name_t(string) function for native numeric types
* add file_handle_t to the template parameter list, and move member method
  implementations from the header to the implementation file.
* preparation for a general file handle type:
  - add file_handle_t to the template parameter list of sample_list.
  - remove hdf5 and conduit from member function names and local variable
    names.
  - make member functions that call conduit interfaces virtual.
  - make member functions that modify file handles virtual.
  - move member function implementations from the header to the
    implementation file.
* separate hdf5 specifics from the generic sample_list into a derived class
* wrap only the minimal portions of the code as virtual methods, and put the
  rest back into the base class. As a result, the accessors to the private
  members are no longer needed and removed.
* move the inclusion of headers relevant to conduit and hdf5 from the base
  sample_list to sample_list_hdf5
* remove file_handle_t from the template parameter list of sample_list_hdf5
  and make it inherit sample_list
* update copy_members such that it avoids copying data that will be cleared
  without being used; update the destructor
* added the general base class for the sample list, with which no open file
  handle is managed.
* added the intermediate sample list class with open file handle management
* make the methods that are not fully defined in the intermediate class pure
  virtual
* make the base class of sample_list, depending on the template parameter for
  the sample name type, automatically assign sample names when that type is
  size_t (or any integral type for c++17) or string
* generalize uninitialized_sample_name() a bit more for c++17
* fix write_header to print out the correct number of samples; add a virtual
  method to return the number of sample files
* fix write_header() to print out the correct number of excluded samples
* Fixed the error handling function.
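The hierarchy described above can be summarized with a minimal sketch. This is illustrative only: member lists are elided, fake_hid_t and fake_io_handle stand in for HDF5's hid_t and conduit::relay::io::IOHandle*, and the exact template parameter lists in the real headers may differ.

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Base class: the most generic list; one sample per file, and no open
// file handles to manage.
template <typename sample_name_t>
class sample_list {
public:
  using sample_file_id_t = std::size_t;
  using sample_t = std::pair<sample_file_id_t, sample_name_t>;
  virtual ~sample_list() = default;
  virtual std::size_t size() const { return m_sample_list.size(); }
protected:
  std::vector<sample_t> m_sample_list;
};

// Intermediate abstract class: tracks open file handles, for data sets
// with multiple samples per file.
template <typename sample_name_t, typename file_handle_t>
class sample_list_open_files : public sample_list<sample_name_t> {
public:
  virtual file_handle_t open_file_handle_for_read(const std::string& path) = 0;
  virtual void close_file_handle(file_handle_t& h) = 0;
};

// Concrete class: JAG data read from HDF5 files into the internal
// conduit format. (fake_hid_t stands in for HDF5's hid_t.)
using fake_hid_t = long;
template <typename sample_name_t>
class sample_list_hdf5
  : public sample_list_open_files<sample_name_t, fake_hid_t> {
public:
  fake_hid_t open_file_handle_for_read(const std::string&) override { return 1; }
  void close_file_handle(fake_hid_t& h) override { h = 0; }
};

// Concrete class: the same data accessed through conduit's more abstract
// IO handle interface. (fake_io_handle* stands in for IOHandle*.)
struct fake_io_handle {};
template <typename sample_name_t>
class sample_list_conduit_io_handle
  : public sample_list_open_files<sample_name_t, fake_io_handle*> {
public:
  fake_io_handle* open_file_handle_for_read(const std::string&) override {
    return new fake_io_handle{};
  }
  void close_file_handle(fake_io_handle*& h) override { delete h; h = nullptr; }
};

// e.g., the JAG data reader below instantiates sample_list_hdf5<std::string>.

---
 .../data_readers/data_reader_jag_conduit.hpp  |  26 +-
 include/lbann/data_readers/sample_list.hpp    | 158 ++++
 .../sample_list_conduit_io_handle.hpp         |  95 +++
 .../lbann/data_readers/sample_list_hdf5.hpp   |  91 +++
 .../lbann/data_readers/sample_list_impl.hpp   | 653 +++++++++++++++++
 .../lbann/data_readers/sample_list_jag.hpp    | 321 --------
 .../data_readers/sample_list_jag_impl.hpp     | 683 ------------------
 .../data_readers/sample_list_open_files.hpp   | 147 ++++
 .../sample_list_open_files_impl.hpp           | 682 +++++++++++++++++
 src/data_readers/data_reader_jag_conduit.cpp  | 143 ++--
 10 files changed, 1921 insertions(+), 1078 deletions(-)
 create mode 100644 include/lbann/data_readers/sample_list.hpp
 create mode 100644 include/lbann/data_readers/sample_list_conduit_io_handle.hpp
 create mode 100644 include/lbann/data_readers/sample_list_hdf5.hpp
 create mode 100644 include/lbann/data_readers/sample_list_impl.hpp
 delete mode 100644 include/lbann/data_readers/sample_list_jag.hpp
 delete mode 100644 include/lbann/data_readers/sample_list_jag_impl.hpp
 create mode 100644 include/lbann/data_readers/sample_list_open_files.hpp
 create mode 100644 include/lbann/data_readers/sample_list_open_files_impl.hpp

diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp
index c9f46c349db..2e09b555574 100644
--- a/include/lbann/data_readers/data_reader_jag_conduit.hpp
+++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp
@@ -37,8 +37,13 @@
 #include 
 #include 
 #include 
-#include "lbann/data_readers/sample_list_jag.hpp"
-#include 
+
+//#define _USE_IO_HANDLE_
+#ifdef _USE_IO_HANDLE_
+#include "lbann/data_readers/sample_list_conduit_io_handle.hpp"
+#else
+#include "lbann/data_readers/sample_list_hdf5.hpp"
+#endif
 
 namespace lbann {
 
@@ -56,8 +61,16 @@ class data_reader_jag_conduit : public generic_data_reader {
   /// Type for the pair of the key string of a sample and the handle of the file that contains it
   using sample_locator_t = std::pair;
   using sample_map_t = std::vector; ///< valid sample map type
-  using sample_t = sample_list_jag::sample_t;
-  using sample_file_id_t = sample_list_jag::sample_file_id_t;
+  using sample_name_t = std::string;
+#ifdef _USE_IO_HANDLE_
+  using sample_list_t = sample_list_conduit_io_handle<sample_name_t>;
+#else
+  using sample_list_t = sample_list_hdf5<sample_name_t>;
+#endif
+  using file_handle_t = sample_list_t::file_handle_t;
+  using sample_file_id_t = sample_list_t::sample_file_id_t;
+  using sample_t = 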
std::pair; + //using sample_t = sample_list_t::sample_t; /// linear transform on X defined as: first * X + second => X' using linear_transform_t = std::pair; @@ -331,6 +344,9 @@ class data_reader_jag_conduit : public generic_data_reader { */ static bool check_non_numeric(const std::string key); + bool has_path(const file_handle_t& h, const std::string& path) const; + void read_node(const file_handle_t& h, const std::string& path, conduit::Node& n) const; + /// Allow const access to the conduit data structure static const conduit::Node& get_conduit_node(const conduit::Node& n_base, const std::string key); /** Load the conduit node with the data of the sample i identified by key @@ -445,7 +461,7 @@ class data_reader_jag_conduit : public generic_data_reader { std::vector m_input_normalization_params; typedef std::pair conduit_sample; - sample_list_jag m_sample_list; + sample_list_t m_sample_list; bool m_list_per_trainer; bool m_list_per_model; }; diff --git a/include/lbann/data_readers/sample_list.hpp b/include/lbann/data_readers/sample_list.hpp new file mode 100644 index 00000000000..45fbdb39e92 --- /dev/null +++ b/include/lbann/data_readers/sample_list.hpp @@ -0,0 +1,158 @@ +#ifndef __SAMPLE_LIST_HPP__ +#define __SAMPLE_LIST_HPP__ + +#include +#include +#include +#include + +#include "lbann/comm.hpp" + +#include "lbann/utils/file_utils.hpp" +#include +#include +#include +#include +#include +#include + +namespace lbann { + +static const std::string sample_exclusion_list = "CONDUIT_HDF5_EXCLUSION"; +static const std::string sample_inclusion_list = "CONDUIT_HDF5_INCLUSION"; + +struct sample_list_header { + bool m_is_exclusive; + /// Number of included samples + size_t m_included_sample_count; + /// Number of excluded samples + size_t m_excluded_sample_count; + size_t m_num_files; + std::string m_file_dir; + std::string m_sample_list_filename; + + sample_list_header(); + + bool is_exclusive() const; + size_t get_sample_count() const; + size_t get_num_files() const; + const std::string& get_sample_list_filename() const; + const std::string& get_file_dir() const; + template void serialize( Archive & ar ) { + ar(m_is_exclusive, m_included_sample_count, m_excluded_sample_count, m_num_files, m_file_dir, m_sample_list_filename); + } +}; + +template +class sample_list { + public: + /// The type for the index assigned to each sample file + using sample_file_id_t = std::size_t; + /** To describe a sample as the id of the file to which it belongs. + * Each file contains only one sample. 
*/ + using sample_t = std::template pair; + /// Type for the list of samples + using samples_t = std::template vector< sample_t >; + /// Mapping of the file index to the filename + using file_id_stats_v_t = std::vector< std::string >; + + sample_list(); + virtual ~sample_list(); + sample_list(const sample_list& rhs); + sample_list& operator=(const sample_list& rhs); + sample_list& copy(const sample_list& rhs); + + void copy_members(const sample_list& rhs); + + /// Load a sample list file + void load(const std::string& samplelist_file, size_t stride=1, size_t offset=0); + + /// Load the header of a sample list file + sample_list_header load_header(const std::string& samplelist_file) const; + + /// Restore a sample list from a serialized string + void load_from_string(const std::string& samplelist); + + /// Tells how many samples in the list + virtual size_t size() const; + + /// Tells how many sample files are there + virtual size_t get_num_files() const; + + /// Tells if the internal list is empty + bool empty() const; + + /// Serialize to and from an archive using the cereal library + template void serialize( Archive & ar ); + + /// Serialize sample list + virtual bool to_string(std::string& sstr) const; + + /// Write the sample list + void write(const std::string filename) const; + + /// Allow read-only access to the internal list data + const samples_t& get_list() const; + + /// Allow the read-only access to the list header + const sample_list_header& get_header() const; + + /// Allow read-only access to the metadata of the idx-th sample in the list + const sample_t& operator[](size_t idx) const; + + virtual const std::string& get_samples_filename(sample_file_id_t id) const; + + const std::string& get_samples_dirname() const; + + void all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); + template size_t all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm); + virtual void all_gather_packed_lists(lbann_comm& comm); + + protected: + + /// Reads a header line from the sample list given as a stream, and use the info string for error message + std::string read_header_line(std::istream& ifs, const std::string& filename, const std::string& info) const; + + /// Reads the header of a sample list + sample_list_header read_header(std::istream& istrm, const std::string& filename) const; + + /// read the body of a sample list, which is the list of sample files, where each file contains a single sample. + virtual void read_sample_list(std::istream& istrm, size_t stride=1, size_t offset=0); + + /// Assign names to samples when there is only one sample per file without a name. 
+ virtual void assign_samples_name(); + + /// Reads a sample list and populates the internal list + size_t get_samples_per_file(std::istream& istrm, const std::string& filename, size_t stride=1, size_t offset=0); + + /// Add the header info to the given string + void write_header(std::string& sstr, size_t num_files) const; + + /// Get the number of total/included/excluded samples + virtual void get_num_samples(size_t& total, size_t& included, size_t& excluded) const; + + virtual void set_samples_filename(sample_file_id_t id, const std::string& filename); + + protected: + /// header info of sample list + sample_list_header m_header; + + private: + /// List of all samples with a file identifier and sample name for each sample + samples_t m_sample_list; + + /// Maps sample's file id to file names, file descriptors, and use counts + file_id_stats_v_t m_file_id_stats_map; + +}; + +void handle_mpi_error(int ierr); + +template +inline T uninitialized_sample_name(); + +} // end of namespace + +#include "sample_list_impl.hpp" + +#endif // __SAMPLE_LIST_HPP__ diff --git a/include/lbann/data_readers/sample_list_conduit_io_handle.hpp b/include/lbann/data_readers/sample_list_conduit_io_handle.hpp new file mode 100644 index 00000000000..ff9b59ed7f5 --- /dev/null +++ b/include/lbann/data_readers/sample_list_conduit_io_handle.hpp @@ -0,0 +1,95 @@ +#ifndef __SAMPLE_LIST_CONDUIT_IO_HANDLE_HPP__ +#define __SAMPLE_LIST_CONDUIT_IO_HANDLE_HPP__ + +#include "sample_list_open_files.hpp" +#include "conduit/conduit.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_io_handle.hpp" + +namespace lbann { + +template +class sample_list_conduit_io_handle : public sample_list_open_files { + public: + using file_handle_t = conduit::relay::io::IOHandle*; + using typename sample_list_open_files::sample_file_id_t; + using typename sample_list_open_files::sample_t; + using typename sample_list_open_files::samples_t; + using typename sample_list_open_files::file_id_stats_t; + using typename sample_list_open_files::file_id_stats_v_t; + using typename sample_list_open_files::fd_use_map_t; + + sample_list_conduit_io_handle(); + ~sample_list_conduit_io_handle() override; + + bool is_file_handle_valid(const file_handle_t& h) const override; + + protected: + void obtain_sample_names(file_handle_t& h, std::vector& sample_names) const override; + file_handle_t open_file_handle_for_read(const std::string& path) override; + void close_file_handle(file_handle_t& h) override; + void clear_file_handle(file_handle_t& h) override; +}; + + +template +inline sample_list_conduit_io_handle::sample_list_conduit_io_handle() +: sample_list_open_files() {} + +template +inline sample_list_conduit_io_handle::~sample_list_conduit_io_handle() { + // Close the existing open files + for(auto& f : this->m_file_id_stats_map) { + file_handle_t& h = std::get<1>(f); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(f).clear(); + } + this->m_file_id_stats_map.clear(); +} + +template +inline void sample_list_conduit_io_handle +::obtain_sample_names(sample_list_conduit_io_handle::file_handle_t& h, std::vector& sample_names) const { + sample_names.clear(); + if (h != nullptr) { + h->list_child_names("/", sample_names); + } +} + +template +inline bool sample_list_conduit_io_handle +::is_file_handle_valid(const sample_list_conduit_io_handle::file_handle_t& h) const { + return ((h != nullptr) && (h->is_open())); +} + +template +inline typename sample_list_conduit_io_handle::file_handle_t sample_list_conduit_io_handle 
+::open_file_handle_for_read(const std::string& file_path) { + file_handle_t h = new conduit::relay::io::IOHandle; + h->open(file_path, "hdf5"); + return h; +} + +template +inline void sample_list_conduit_io_handle +::close_file_handle(file_handle_t& h) { + if(is_file_handle_valid(h)) { + h->close(); + } +} + +template <> +inline conduit::relay::io::IOHandle* uninitialized_file_handle() { + return nullptr; +} + +template +inline void sample_list_conduit_io_handle +::clear_file_handle(sample_list_conduit_io_handle::file_handle_t& h) { + h = uninitialized_file_handle(); +} + +} // end of namespace lbann + +#endif // __SAMPLE_LIST_CONDUIT_IO_HANDLE_HPP__ diff --git a/include/lbann/data_readers/sample_list_hdf5.hpp b/include/lbann/data_readers/sample_list_hdf5.hpp new file mode 100644 index 00000000000..f9181594076 --- /dev/null +++ b/include/lbann/data_readers/sample_list_hdf5.hpp @@ -0,0 +1,91 @@ +#ifndef __SAMPLE_LIST_HDF5_HPP__ +#define __SAMPLE_LIST_HDF5_HPP__ + +#include "sample_list_open_files.hpp" +#include "hdf5.h" +#include "conduit/conduit.hpp" +#include "conduit/conduit_relay.hpp" +#include "conduit/conduit_relay_io_hdf5.hpp" + +namespace lbann { + +template +class sample_list_hdf5 : public sample_list_open_files { + public: + using file_handle_t = hid_t; + using typename sample_list_open_files::sample_file_id_t; + using typename sample_list_open_files::sample_t; + using typename sample_list_open_files::samples_t; + using typename sample_list_open_files::file_id_stats_t; + using typename sample_list_open_files::file_id_stats_v_t; + using typename sample_list_open_files::fd_use_map_t; + + sample_list_hdf5(); + ~sample_list_hdf5() override; + + bool is_file_handle_valid(const hid_t& h) const override; + + protected: + void obtain_sample_names(hid_t& h, std::vector& sample_names) const override; + hid_t open_file_handle_for_read(const std::string& path) override; + void close_file_handle(hid_t& h) override; + void clear_file_handle(hid_t& h) override; +}; + + +template +inline sample_list_hdf5::sample_list_hdf5() +: sample_list_open_files() {} + +template +inline sample_list_hdf5::~sample_list_hdf5() { + // Close the existing open files + for(auto& f : this->m_file_id_stats_map) { + file_handle_t& h = std::get<1>(f); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(f).clear(); + } + this->m_file_id_stats_map.clear(); +} + +template +inline void sample_list_hdf5 +::obtain_sample_names(hid_t& h, std::vector& sample_names) const { + conduit::relay::io::hdf5_group_list_child_names(h, "/", sample_names); +} + +template +inline bool sample_list_hdf5 +::is_file_handle_valid(const hid_t& h) const { + return (h > static_cast(0)); +} + +template +inline hid_t sample_list_hdf5< sample_name_t> +::open_file_handle_for_read(const std::string& file_path) { + return conduit::relay::io::hdf5_open_file_for_read(file_path); +} + +template +inline void sample_list_hdf5 +::close_file_handle(hid_t& h) { + if(is_file_handle_valid(h)) { + conduit::relay::io::hdf5_close_file(h); + } +} + +template <> +inline hid_t uninitialized_file_handle() { + return static_cast(0); +} + +template +inline void sample_list_hdf5 +::clear_file_handle(hid_t& h) { + h = uninitialized_file_handle(); +} + +} // end of namespace lbann + +#endif // __SAMPLE_LIST_HDF5_HPP__ diff --git a/include/lbann/data_readers/sample_list_impl.hpp b/include/lbann/data_readers/sample_list_impl.hpp new file mode 100644 index 00000000000..c131bf814ac --- /dev/null +++ b/include/lbann/data_readers/sample_list_impl.hpp @@ -0,0 +1,653 @@ 
+#include +#include +#include +#include +#include +#include +#include +#include +#include "lbann/utils/exception.hpp" +#include "lbann/utils/file_utils.hpp" +#include +#include +#include +#include + +#include +#include +#include + +namespace lbann { + +template +inline std::string to_string(const T val) { + return std::to_string(val); +} + +template<> +inline std::string to_string(const std::string val) { + return val; +} + +template +inline auto to_sample_name_t(const std::string& sn_str) -> decltype (sample_name_t()){ + LBANN_ERROR(std::string{} + " :: string conversion is not implement for the sample_name_t"); + return sample_name_t(); +} + +template<> inline int to_sample_name_t(const std::string& sn_str) { + return std::stoi(sn_str); +} + +template<> inline long to_sample_name_t(const std::string& sn_str) { + return std::stol(sn_str); +} + +template<> inline unsigned long to_sample_name_t(const std::string& sn_str) { + return std::stoul(sn_str); +} + +template<> inline long long to_sample_name_t(const std::string& sn_str) { + return std::stoll(sn_str); +} + +template<> inline unsigned long long to_sample_name_t(const std::string& sn_str) { + return std::stoull(sn_str); +} + +template<> inline float to_sample_name_t(const std::string& sn_str) { + return std::stof(sn_str); +} + +template<> inline double to_sample_name_t(const std::string& sn_str) { + return std::stod(sn_str); +} + +template<> inline long double to_sample_name_t(const std::string& sn_str) { + return std::stold(sn_str); +} + +template<> inline std::string to_sample_name_t(const std::string& sn_str) { + return sn_str; +} + +//------------------------ +// sample_list_header +//------------------------ + +inline sample_list_header::sample_list_header() + : m_is_exclusive(false), m_included_sample_count(0u), + m_excluded_sample_count(0u), m_num_files(0u), + m_file_dir("") { +} + +inline bool sample_list_header::is_exclusive() const { + return m_is_exclusive; +} + +inline size_t sample_list_header::get_sample_count() const { + return m_included_sample_count; +} + +inline size_t sample_list_header::get_num_files() const { + return m_num_files; +} + +inline const std::string& sample_list_header::get_sample_list_filename() const { + return m_sample_list_filename; +} + +inline const std::string& sample_list_header::get_file_dir() const { + return m_file_dir; +} + +//------------------ +// sample_list +//------------------ + +template +inline sample_list::sample_list() { +} + +template +inline sample_list::~sample_list() { +} + +template +inline sample_list +::sample_list(const sample_list& rhs) { + copy_members(rhs); +} + +template +inline sample_list& sample_list +::operator=(const sample_list& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + + copy_members(rhs); + + return (*this); +} + +template +inline sample_list& sample_list +::copy(const sample_list& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + + copy_members(rhs); + + return (*this); +} + +template +inline void sample_list +::copy_members(const sample_list& rhs) { + m_header = rhs.m_header; + m_sample_list = rhs.m_sample_list; + + /// Keep track of existing filenames + m_file_id_stats_map = rhs.m_file_id_stats_map; +} + +template +inline void sample_list +::load(const std::string& samplelist_file, + size_t stride, size_t offset) { + std::ifstream istr(samplelist_file); + get_samples_per_file(istr, samplelist_file, stride, offset); + istr.close(); +} + +template +inline sample_list_header sample_list 
+::load_header(const std::string& samplelist_file) const { + std::ifstream istr(samplelist_file); + return read_header(istr, samplelist_file); +} + +template +inline void sample_list +::load_from_string(const std::string& samplelist) { + std::istringstream istr(samplelist); + get_samples_per_file(istr, "", 1, 0); +} + +template +inline size_t sample_list +::size() const { + return m_sample_list.size(); +} + +template +inline size_t sample_list +::get_num_files() const { + return m_file_id_stats_map.size(); +} + +template +inline bool sample_list +::empty() const { + return (size() == 0ul); +} + +template +inline std::string sample_list +::read_header_line(std::istream& istrm, + const std::string& filename, + const std::string& info) const { + if (!istrm.good()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: unable to read the header line of sample list " + filename + " for " + info); + } + + std::string line; + std::getline(istrm, line); + + if (line.empty()) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: unable to read the header line of sample list " + filename + " for " + info + + " -- the line was empty"); + } + return line; +} + + +template +inline sample_list_header sample_list +::read_header(std::istream& istrm, + const std::string& filename) const { + sample_list_header hdr; + + hdr.m_sample_list_filename = filename; + + std::string line1 = read_header_line(istrm, filename, "the exclusiveness"); + std::stringstream header1(line1); + + std::string line2 = read_header_line(istrm, filename, "the number of samples and the number of files"); + std::stringstream header2(line2); + + std::string line3 = read_header_line(istrm, filename, "the data file directory"); + std::stringstream header3(line3); + + std::string sample_list_type; + header1 >> sample_list_type; + std::for_each(sample_list_type.begin(), sample_list_type.end(), [](char& c){ c = std::toupper(c); }); + + const std::string type_exclusive = sample_exclusion_list; + size_t found = sample_list_type.find(type_exclusive); + + if (found != std::string::npos) { + hdr.m_is_exclusive = true; + } else { + hdr.m_is_exclusive = false; + } + + header2 >> hdr.m_included_sample_count; + header2 >> hdr.m_excluded_sample_count; + header2 >> hdr.m_num_files; + + header3 >> hdr.m_file_dir; + + if (hdr.get_file_dir().empty() || !check_if_dir_exists(hdr.get_file_dir())) { + LBANN_ERROR(std::string{} + "file " + filename + + " :: data root directory '" + hdr.get_file_dir() + "' does not exist."); + } + + return hdr; +} + + +template +inline void sample_list +::read_sample_list(std::istream& istrm, + size_t stride, size_t offset) { + m_sample_list.reserve(m_header.get_sample_count()); + + const std::string whitespaces(" \t\f\v\n\r"); + size_t cnt_files = 0u; + std::string line; + + while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + if (cnt_files++ >= m_header.get_num_files()) { + break; + } + // Check to see if there is a strided load and skip the lines that are not for this rank + if ((cnt_files-1)%stride != offset) { + continue; + } + + std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing + std::string filename; + + sstr >> filename; + + const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; + + if (filename.empty() || !check_if_file_exists(file_path)) { + throw 
lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: data file '" + filename + "' does not exist."); + } + + const sample_file_id_t index = m_file_id_stats_map.size(); + static const auto sn0 = uninitialized_sample_name(); + m_sample_list.emplace_back(std::make_pair(index, sn0)); + m_file_id_stats_map.emplace_back(filename); + } + + if (m_header.get_num_files() != cnt_files) { + LBANN_ERROR(std::string("Sample list number of files requested ") + + std::to_string(m_header.get_num_files()) + + std::string(" does not equal number of files loaded ") + + std::to_string(cnt_files)); + } + + if(stride == 1 && m_header.get_sample_count() != m_sample_list.size()) { + LBANN_ERROR(std::string("Sample list count ") + + std::to_string(m_header.get_sample_count()) + + std::string(" does not equal sample list size ") + + std::to_string(m_sample_list.size())); + } +} + + +template +inline size_t sample_list +::get_samples_per_file(std::istream& istrm, + const std::string& filename, + size_t stride, size_t offset) { + m_header = read_header(istrm, filename); + + read_sample_list(istrm, stride, offset); + + return size(); +} + + +template +inline void sample_list +::all_gather_archive(const std::string &archive, + std::vector& gathered_archive, + lbann_comm& comm) { + int size_of_list_archive = archive.size(); + std::vector packed_sizes(comm.get_procs_per_trainer()); + + comm.trainer_all_gather(size_of_list_archive, packed_sizes); + + int total_packed_size = 0; + std::vector displ; + displ.assign(comm.get_procs_per_trainer()+1, 0); + + for (size_t i = 0u; i < packed_sizes.size(); ++i) { + const auto sz = packed_sizes[i]; + displ[i+1] = displ[i] + sz; + } + total_packed_size = displ.back(); + + if (total_packed_size <= 0) { + return; + } + + std::string all_samples; + all_samples.resize(static_cast(total_packed_size)); + + std::vector local_data(archive.begin(), archive.end()); + std::vector packed_data(all_samples.size() * sizeof(decltype(all_samples)::value_type)); + comm.trainer_all_gather(local_data, + packed_data, + packed_sizes, + displ); + + for (size_t i = 0u; i < packed_sizes.size(); ++i) { + std::string& buf = gathered_archive[i]; + const auto sz = packed_sizes[i]; + displ[i+1] = displ[i] + sz; + std::vector::const_iterator first = packed_data.begin() + displ[i]; + std::vector::const_iterator last = packed_data.begin() + displ[i] + sz; + buf.resize(sz); + buf.assign(first, last); + } + return; +} + +template +template +inline size_t sample_list +::all_gather_field(T data, + std::vector& gathered_data, + lbann_comm& comm) { + std::string archive; + std::stringstream ss; + cereal::BinaryOutputArchive oarchive(ss); + oarchive(data); + archive = ss.str(); + + std::vector gathered_archive(comm.get_procs_per_trainer()); + + all_gather_archive(archive, gathered_archive, comm); + + std::vector per_rank_data(comm.get_procs_per_trainer()); + + size_t gathered_field_size = 0; + for (size_t i = 0u; i < gathered_archive.size(); ++i) { + std::string& buf = gathered_archive[i]; + T& tmp = gathered_data[i]; + + std::stringstream in_ss(buf); + cereal::BinaryInputArchive iarchive(in_ss); + iarchive(tmp); + gathered_field_size += tmp.size(); + } + return gathered_field_size; +} + +template +template +void sample_list +::serialize( Archive & ar ) { + ar(m_header, m_sample_list, m_file_id_stats_map); +} + +template +inline void sample_list +::write_header(std::string& sstr, size_t num_files) const { + // The first line indicate if the list is exclusive or inclusive + // The next line 
contains the number of samples (included and excluded),
+// as well as the number of files, which are the same in this case.
+// The next line contains the root data file directory.
+
+  sstr += (m_header.is_exclusive()? sample_exclusion_list + "\n" : sample_inclusion_list + "\n");
+  size_t total, included, excluded;
+  get_num_samples(total, included, excluded);
+  /// TODO: clarify the comment below
+  /// Include the number of invalid samples, which for an inclusive index list is always 0
+  sstr += std::to_string(included) + ' ' + std::to_string(excluded) + ' ' + std::to_string(num_files) + '\n';
+  sstr += m_header.get_file_dir() + '\n';
+}
+
+template <typename sample_name_t>
+inline void sample_list<sample_name_t>
+::get_num_samples(size_t& total, size_t& included, size_t& excluded) const {
+  total = size();
+  included = size();
+  excluded = 0ul;
+}
+
+template <typename sample_name_t>
+inline bool sample_list<sample_name_t>
+::to_string(std::string& sstr) const {
+  size_t total_len = 0ul;
+  for (const auto& s : m_sample_list) {
+    const std::string& filename = m_file_id_stats_map[s.first];
+    total_len += filename.size() + 1u;
+  }
+
+  sstr.clear();
+
+  // reserve the string to hold the entire sample list
+  size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1 + total_len + 1000;
+  sstr.reserve(estimated_len);
+
+  // write the list header
+  write_header(sstr, get_num_files());
+
+  // write the list body
+  for (const auto& s : m_sample_list) {
+    // File name
+    const std::string& filename = m_file_id_stats_map[s.first];
+    sstr += filename + '\n';
+  }
+
+  return true;
+}
+
+template <typename sample_name_t>
+inline void sample_list<sample_name_t>
+::write(const std::string filename) const {
+  std::string dir, basename;
+  parse_path(filename, dir, basename);
+  if (!dir.empty() && !check_if_dir_exists(dir)) {
+    // The creation of a shared directory must be done once in a coordinated fashion
+    // among the entities that have access to it.
Thus, it must be done in advance + std::cerr << "The sample list output directory (" + dir + ") does not exist" << std::endl; + return; + } + + std::fstream ofs(filename, std::fstream::out | std::fstream::binary); + + if (!ofs.good()) { + return; + } + + std::string buf; + to_string(buf); + + ofs.write(buf.data(), buf.size()*sizeof(std::string::value_type)); + ofs.close(); +} + +template +inline const typename sample_list::samples_t& +sample_list::get_list() const { + return m_sample_list; +} + +template +inline const sample_list_header& +sample_list::get_header() const { + return m_header; +} + +template +inline const typename sample_list::sample_t& +sample_list::operator[](size_t idx) const { + return m_sample_list[idx]; +} + +template +inline const std::string& sample_list +::get_samples_filename(sample_file_id_t id) const { + return m_file_id_stats_map[id]; +} + +template +inline const std::string& sample_list +::get_samples_dirname() const { + return m_header.get_file_dir(); +} + +template +inline void sample_list +::set_samples_filename(sample_file_id_t id, const std::string& filename) { + m_file_id_stats_map[id] = filename; +} + +#if defined(__cpp_if_constexpr) // c++17 +template +inline void sample_list +::assign_samples_name() { + if constexpr (std::is_integral::value + && !std::is_same::value) { + sample_name_t i = 0; + for (auto& s: m_sample_list) { + s.second = i++; + } + } else if constexpr (std::is_same::value) { + for (auto& s: m_sample_list) { + s.second = s.first; + } + } else { + LBANN_ERROR(std::string{} + " :: base class does not implement this method" + + " for the current sample name type"); + } +} + +template +inline sample_name_t uninitialized_sample_name() { + if constexpr (std::is_integral::value) { + return static_cast(0); + } else if constexpr (std::is_same::value) { + return ""; + } else if constexpr (std::is_floating_point::value) { + return 0.0; + } else if constexpr (std::is_default_constructible::value + && std::is_copy_constructible::value) { + sample_name_t ret{}; + return ret; + } else { + LBANN_ERROR(std::string{} + " :: base class does not implement this method" + + " for the current sample name type"); + } +} +#else +template<> inline void sample_list +::assign_samples_name() { + size_t i = 0ul; + for (auto& s: m_sample_list) { + s.second = i++; + } +} + +template<> inline void sample_list +::assign_samples_name() { + for (auto& s: m_sample_list) { + s.second = s.first; + } +} + +template +inline void sample_list +::assign_samples_name() { + LBANN_ERROR(std::string{} + " :: base class does not implement this method" + + " for the current sample name type"); +} + +template<> inline size_t uninitialized_sample_name() { + return 0ul; +} + +template<> inline std::string uninitialized_sample_name() { + return ""; +} + +template +inline sample_name_t uninitialized_sample_name() { + sample_name_t ret{}; + return ret; +} +#endif // defined(__cpp_if_constexpr) + +template +inline void sample_list +::all_gather_packed_lists(lbann_comm& comm) { + int num_ranks = comm.get_procs_per_trainer(); + typename std::vector per_rank_samples(num_ranks); + typename std::vector> per_rank_files(num_ranks); + + size_t num_samples = all_gather_field(m_sample_list, per_rank_samples, comm); + size_t num_ids = all_gather_field(m_file_id_stats_map, per_rank_files, comm); + + m_sample_list.clear(); + m_file_id_stats_map.clear(); + + m_sample_list.reserve(num_samples); + m_file_id_stats_map.reserve(num_ids); + + for(int r = 0; r < num_ranks; r++) { + const samples_t& s_list = 
per_rank_samples[r]; + const auto& files = per_rank_files[r]; + for (const auto& s : s_list) { + sample_file_id_t index = s.first; + const std::string& filename = files[index]; + if(index >= m_file_id_stats_map.size() + || (m_file_id_stats_map.back() != filename)) { + index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(filename); + }else { + for(size_t i = 0; i < m_file_id_stats_map.size(); i++) { + if(filename == m_file_id_stats_map[i]) { + index = i; + break; + } + } + } + static const auto sn0 = uninitialized_sample_name(); + m_sample_list.emplace_back(std::make_pair(index, sn0)); + } + } + + assign_samples_name(); + + return; +} + +} // end of namespace lbann diff --git a/include/lbann/data_readers/sample_list_jag.hpp b/include/lbann/data_readers/sample_list_jag.hpp deleted file mode 100644 index 07040a80d48..00000000000 --- a/include/lbann/data_readers/sample_list_jag.hpp +++ /dev/null @@ -1,321 +0,0 @@ -#ifndef __SAMPLE_LIST_JAG_HPP__ -#define __SAMPLE_LIST_JAG_HPP__ - -#include -#include -#include -#include - -#ifndef _JAG_OFFLINE_TOOL_MODE_ -#include "lbann/comm.hpp" -#else -#include -#endif - -#include "lbann/utils/file_utils.hpp" -#include -#include -#include -#include -#include -#include -#include "conduit/conduit_relay_io_hdf5.hpp" - -/// Number of system and other files that may be open during execution -#define LBANN_MAX_OPEN_FILE_MARGIN 128 -#define LBANN_MAX_OPEN_FILE_RETRY 3 - -namespace lbann { - -struct sample_list_header { - bool m_is_exclusive; - /// Number of included samples - size_t m_included_sample_count; - /// Number of excluded samples - size_t m_excluded_sample_count; - size_t m_num_files; - std::string m_file_dir; - std::string m_sample_list_filename; - - sample_list_header(); - - bool is_exclusive() const; - size_t get_sample_count() const; - size_t get_num_files() const; - const std::string& get_sample_list_filename() const; - const std::string& get_file_dir() const; - template void serialize( Archive & ar ) { - ar(m_is_exclusive, m_included_sample_count, m_excluded_sample_count, m_num_files, m_file_dir, m_sample_list_filename); - } -}; - -static const std::string conduit_hdf5_exclusion_list = "CONDUIT_HDF5_EXCLUSION"; -static const std::string conduit_hdf5_inclusion_list = "CONDUIT_HDF5_INCLUSION"; - -class sample_list_jag { - public: - /// The type of the native identifier of a sample rather than an arbitrarily assigned index - using sample_name_t = std::string; - /// The type for arbitrarily assigned index - using sample_file_id_t = std::size_t; - /// To describe a sample as a pair of the file to which it belongs and its name - // using sample_t = std::pair; - using sample_t = std::pair; - /// Statistics for each file used by the sample list: includes the file name, file descriptor, and - /// and a queue of each step and substep when data will be loaded from the file - using file_id_stats_t = std::tuple>>; - - /// Type for the list of samples - using samples_t = std::vector< sample_t >; - /// Mapping of the file index to the statistics for each file - using file_id_stats_v_t = std::vector< file_id_stats_t >; // rename to sample_to_file_v or something - /// Type for the map of file descriptors to usage step and substep - using fd_use_map_t = std::pair>; - - sample_list_jag(); - ~sample_list_jag(); - sample_list_jag(const sample_list_jag& rhs); - sample_list_jag& operator=(const sample_list_jag& rhs); - sample_list_jag& copy(const sample_list_jag& rhs); - - void copy_members(const sample_list_jag& rhs); - - /// Load a sample list file - 
void load(const std::string& samplelist_file, size_t stride=1, size_t offset=0); - - /// Load the header of a sample list file - sample_list_header load_header(const std::string& samplelist_file) const; - - /// Extract a sample list from a serialized sample list in a string - void load_from_string(const std::string& samplelist); - - /// Tells how many samples in the list - size_t size() const; - - /// Tells if the internal list is empty - bool empty() const; - - /// Clear internal states - void clear(); - - template void serialize( Archive & ar ); - - /// Check if a sample index is in the valid range - bool check_index(size_t idx) const; - - /// Serialize sample list - bool to_string(std::string& sstr) const; - - /// Write the sample list - void write(const std::string filename) const; - - /// Allow read-only access to the internal list data - const samples_t& get_list() const; - - /// Allow the read-only access to the list header - const sample_list_header& get_header() const; - - /// Allow read-only access to the metadata of the idx-th sample in the list - const sample_t& operator[](size_t idx) const; - - const std::string& get_samples_filename(sample_file_id_t id) const { - return std::get<0>(m_file_id_stats_map[id]); - } - - const std::string& get_samples_dirname() const { - return m_header.get_file_dir(); - } - - hid_t get_samples_hdf5_handle(sample_file_id_t id) const { - hid_t h = std::get<1>(m_file_id_stats_map[id]); - return h; - } - - void set_samples_filename(sample_file_id_t id, const std::string& filename) { - std::get<0>(m_file_id_stats_map[id]) = filename; - } - - void set_files_hdf5_handle(const std::string& filename, hid_t h) { - sample_file_id_t id = 0; - for (auto&& e : m_file_id_stats_map) { - if(std::get<0>(e) == filename) { - std::get<1>(e) = h; - break; - } - id++; - } - manage_open_hdf5_handles(id, true); - } - - void delete_hdf5_handle_pq_entry(sample_file_id_t id) { - for (std::deque::iterator it = m_open_fd_pq.begin(); it!=m_open_fd_pq.end(); ++it) { - if(it->first == id) { - it = m_open_fd_pq.erase(it); - break; - } - } - return; - } - - void manage_open_hdf5_handles(sample_file_id_t id, bool pre_open_fd = false) { - /// When we enter this function the priority queue is either empty or a heap - if(!m_open_fd_pq.empty()) { - if(m_open_fd_pq.size() > m_max_open_files) { - auto& f = m_open_fd_pq.front(); - auto& victim = m_file_id_stats_map[f.first]; - hid_t victim_fd = std::get<1>(victim); - std::pop_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); - m_open_fd_pq.pop_back(); - if(victim_fd > 0) { - conduit::relay::io::hdf5_close_file(victim_fd); - std::get<1>(victim) = 0; - } - } - } - - /// Before we can enqueue the any new access times for this descriptor, remove any - /// earlier descriptor - std::sort_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); - if(m_open_fd_pq.front().first == id) { - m_open_fd_pq.pop_front(); - } - std::make_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp); - - auto& e = m_file_id_stats_map[id]; - auto& file_access_queue = std::get<2>(e); - if(!file_access_queue.empty()) { - if(!pre_open_fd) { - file_access_queue.pop_front(); - } - } - if(!file_access_queue.empty()) { - m_open_fd_pq.emplace_back(std::make_pair(id,file_access_queue.front())); - }else { - /// If there are no future access of the file place a terminator entry to track - /// the open file, but is always sorted to the top of the heap - m_open_fd_pq.emplace_back(std::make_pair(id,std::make_pair(INT_MAX,id))); - } - std::push_heap(m_open_fd_pq.begin(), 
m_open_fd_pq.end(), pq_cmp); - return; - } - - hid_t open_samples_hdf5_handle(const size_t i, bool pre_open_fd = false) { - const sample_t& s = m_sample_list[i]; - sample_file_id_t id = s.first; - hid_t h = get_samples_hdf5_handle(id); - if (h <= static_cast(0)) { - const std::string& file_name = get_samples_filename(id); - const std::string conduit_file_path = add_delimiter(get_samples_dirname()) + file_name; - if (file_name.empty() || !check_if_file_exists(conduit_file_path)) { - LBANN_ERROR(std::string{} + " :: data file '" + conduit_file_path + "' does not exist."); - } - bool retry = false; - int retry_cnt = 0; - do { - try { - h = conduit::relay::io::hdf5_open_file_for_read( conduit_file_path ); - }catch (conduit::Error const& e) { - LBANN_WARNING(" :: trying to open the file " + conduit_file_path + " and got " + e.what()); - retry = true; - retry_cnt++; - }catch (...) { - LBANN_ERROR("trying to open the file " + conduit_file_path + " and got an unknown exception"); - } - }while(retry && retry_cnt < 3); - - if (h <= static_cast(0)) { - LBANN_ERROR(std::string{} + " :: data file '" + conduit_file_path + "' could not be opened."); - } - auto& e = m_file_id_stats_map[id]; - std::get<1>(e) = h; - /// If a new file is opened, place it in the priority queue - manage_open_hdf5_handles(id, pre_open_fd); - } - return h; - } - - void close_if_done_samples_hdf5_handle(const size_t i) { - const sample_t& s = m_sample_list[i]; - sample_file_id_t id = s.first; - hid_t h = get_samples_hdf5_handle(id); - if (h > static_cast(0)) { - auto& e = m_file_id_stats_map[id]; - auto& file_access_queue = std::get<2>(e); - if(file_access_queue.empty()) { - conduit::relay::io::hdf5_close_file(std::get<1>(e)); - std::get<1>(e) = 0; - delete_hdf5_handle_pq_entry(id); - } - } - } - - void all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); - template size_t all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm); - void all_gather_packed_lists(lbann_comm& comm); - - void compute_epochs_file_usage(const std::vector& shufled_indices, int mini_batch_size, const lbann_comm& comm); - - protected: - - /// Reads a header line from the sample list given as a stream, and use the info string for error message - std::string read_header_line(std::istream& ifs, const std::string& filename, const std::string& info) const; - - /// Reads the header of a sample list - sample_list_header read_header(std::istream& istrm, const std::string& filename) const; - - /// Get the list of samples that exist in a conduit bundle - hid_t get_conduit_bundle_samples(std::string conduit_file_path, std::vector& sample_names, size_t included_samples, size_t excluded_samples); - - /// read the body of exclusive sample list - void read_exclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0); - - /// read the body of inclusive sample list - void read_inclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0); - - /// Reads a sample list and populates the internal list - size_t get_samples_per_file(std::istream& istrm, const std::string& filename, size_t stride=1, size_t offset=0); - - /// Add the header info to the given string - void write_header(std::string& sstr, size_t num_files) const; - - static bool pq_cmp(fd_use_map_t left, fd_use_map_t right) { - return ((left.second).first < (right.second).first) || - (((left.second).first == (right.second).first) && - ((left.second).second < (right.second).second)); } - - private: - /// header info of sample list - 
sample_list_header m_header; - - /// List of all samples with a file identifier and sample name for each sample - samples_t m_sample_list; - - /// Maps sample's file id to file names, file descriptors, and use counts - file_id_stats_v_t m_file_id_stats_map; - - /// Track the number of samples per file - std::unordered_map m_file_map; - - /// Track the number of open file descriptors and when they will be used next - std::deque m_open_fd_pq; - - size_t m_max_open_files; -}; - -void handle_mpi_error(int ierr); - -#ifndef _JAG_OFFLINE_TOOL_MODE_ -void distribute_sample_list(const sample_list_jag& sn, - std::string& my_samples, - lbann_comm& comm); -#else -void distribute_sample_list(const sample_list_jag& sn, - std::string& my_samples, - MPI_Comm& comm); -#endif - -} // end of namespace - -#include "sample_list_jag_impl.hpp" - -#endif // __SAMPLE_LIST_JAG_HPP__ diff --git a/include/lbann/data_readers/sample_list_jag_impl.hpp b/include/lbann/data_readers/sample_list_jag_impl.hpp deleted file mode 100644 index 6b7ea1eeaa8..00000000000 --- a/include/lbann/data_readers/sample_list_jag_impl.hpp +++ /dev/null @@ -1,683 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include "sample_list_jag.hpp" -#include "lbann/utils/exception.hpp" -#include "lbann/utils/file_utils.hpp" -#include -#include "hdf5.h" -#include "conduit/conduit.hpp" -#include "conduit/conduit_relay.hpp" -#include "conduit/conduit_relay_io_hdf5.hpp" -#include -#include - -#include -#include -#include - -namespace lbann { - -inline sample_list_header::sample_list_header() - : m_is_exclusive(false), m_included_sample_count(0u), m_excluded_sample_count(0u), m_num_files(0u), m_file_dir("") { -} - -inline bool sample_list_header::is_exclusive() const { - return m_is_exclusive; -} - -inline size_t sample_list_header::get_sample_count() const { - return m_included_sample_count; -} - -inline size_t sample_list_header::get_num_files() const { - return m_num_files; -} - -inline const std::string& sample_list_header::get_sample_list_filename() const { - return m_sample_list_filename; -} - -inline const std::string& sample_list_header::get_file_dir() const { - return m_file_dir; -} - -inline sample_list_jag::sample_list_jag() { - m_max_open_files = getdtablesize() - LBANN_MAX_OPEN_FILE_MARGIN; -} - -inline sample_list_jag::~sample_list_jag() { - // Close the existing open files - for(auto f : m_file_id_stats_map) { - if(std::get<1>(f) > 0) { - conduit::relay::io::hdf5_close_file(std::get<1>(f)); - } - std::get<1>(f) = 0; - std::get<2>(f).clear(); - } - m_file_id_stats_map.clear(); - m_open_fd_pq.clear(); -} - -inline sample_list_jag::sample_list_jag(const sample_list_jag& rhs) { - copy_members(rhs); -} - -inline sample_list_jag& sample_list_jag::operator=(const sample_list_jag& rhs) { - // check for self-assignment - if (this == &rhs) { - return (*this); - } - - copy_members(rhs); - - return (*this); -} - -inline sample_list_jag& sample_list_jag::copy(const sample_list_jag& rhs) { - // check for self-assignment - if (this == &rhs) { - return (*this); - } - - copy_members(rhs); - - return (*this); -} - -inline void sample_list_jag::copy_members(const sample_list_jag& rhs) { - m_header = rhs.m_header; - m_sample_list = rhs.m_sample_list; - m_file_id_stats_map = rhs.m_file_id_stats_map; - m_file_map = rhs.m_file_map; - m_max_open_files = rhs.m_max_open_files; - - /// Keep track of existing filenames but do not copy any file - /// descriptor information - for(auto&& e : m_file_id_stats_map) { - 
if(std::get<1>(e) > 0) { - std::get<1>(e) = 0; - } - std::get<2>(e).clear(); - } - - /// Do not copy the open file descriptor priority queue - /// File handle ownership is not transfered in the copy - m_open_fd_pq.clear(); -} - -inline void sample_list_jag::load(const std::string& samplelist_file, size_t stride, size_t offset) { - std::ifstream istr(samplelist_file); - get_samples_per_file(istr, samplelist_file, stride, offset); - istr.close(); -} - -inline sample_list_header sample_list_jag::load_header(const std::string& samplelist_file) const { - std::ifstream istr(samplelist_file); - return read_header(istr, samplelist_file); -} - -inline void sample_list_jag::load_from_string(const std::string& samplelist) { - std::istringstream istr(samplelist); - get_samples_per_file(istr, "", 1, 0); -} - -inline size_t sample_list_jag::size() const { - return m_sample_list.size(); -} - -inline bool sample_list_jag::empty() const { - return m_sample_list.empty(); -} - -inline std::string sample_list_jag::read_header_line(std::istream& istrm, const std::string& filename, const std::string& info) const { - if (!istrm.good()) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: unable to read the header line of sample list " + filename + " for " + info); - } - - std::string line; - std::getline(istrm, line); - - if (line.empty()) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: unable to read the header line of sample list " + filename + " for " + info - + " -- the line was empty"); - } - return line; -} - - -inline sample_list_header sample_list_jag::read_header(std::istream& istrm, const std::string& filename) const { - sample_list_header hdr; - - hdr.m_sample_list_filename = filename; - - std::string line1 = read_header_line(istrm, filename, "the exclusiveness"); - std::stringstream header1(line1); - - std::string line2 = read_header_line(istrm, filename, "the number of samples and the number of files"); - std::stringstream header2(line2); - - std::string line3 = read_header_line(istrm, filename, "the data file directory"); - std::stringstream header3(line3); - - std::string sample_list_type; - header1 >> sample_list_type; - std::for_each(sample_list_type.begin(), sample_list_type.end(), [](char& c){ c = std::toupper(c); }); - - const std::string type_exclusive = conduit_hdf5_exclusion_list; - size_t found = sample_list_type.find(type_exclusive); - - if (found != std::string::npos) { - hdr.m_is_exclusive = true; - } else { - hdr.m_is_exclusive = false; - } - - header2 >> hdr.m_included_sample_count; - header2 >> hdr.m_excluded_sample_count; - header2 >> hdr.m_num_files; - - header3 >> hdr.m_file_dir; - - if (hdr.get_file_dir().empty() || !check_if_dir_exists(hdr.get_file_dir())) { - LBANN_ERROR(std::string{} + "file " + filename - + " :: data root directory '" + hdr.get_file_dir() + "' does not exist."); - } - - return hdr; -} - -inline hid_t sample_list_jag::get_conduit_bundle_samples(std::string conduit_file_path, std::vector& sample_names, size_t included_samples, size_t excluded_samples) { - hid_t hdf5_file_hnd = 0; - bool retry = false; - int retry_cnt = 0; - do { - try { - hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( conduit_file_path ); - }catch (conduit::Error const& e) { - LBANN_WARNING(" :: trying to open the file " + conduit_file_path + " and got " + e.what()); - retry = true; - retry_cnt++; - } - }while(retry && retry_cnt < LBANN_MAX_OPEN_FILE_RETRY); - - if (hdf5_file_hnd <= 
static_cast(0)) { - std::cout << "Opening the file didn't work" << std::endl; - return hdf5_file_hnd; - } - - conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", sample_names); - - if(sample_names.size() != (included_samples + excluded_samples)) { - LBANN_ERROR(std::string("File does not contain the correct number of samples: found ") - + std::to_string(sample_names.size()) - + std::string(" -- this does not equal the expected number of samples that are marked for inclusion: ") - + std::to_string(included_samples) - + std::string(" and exclusion: ") - + std::to_string(excluded_samples)); - } - - return hdf5_file_hnd; -} - -inline void sample_list_jag::read_exclusive_list(std::istream& istrm, size_t stride, size_t offset) { - const std::string whitespaces(" \t\f\v\n\r"); - size_t cnt_files = 0u; - std::string line; - - while (std::getline(istrm, line)) { - const size_t end_of_str = line.find_last_not_of(whitespaces); - if (end_of_str == std::string::npos) { // empty line - continue; - } - if (cnt_files++ >= m_header.get_num_files()) { - break; - } - // Check to see if there is a strided load and skip the lines that are not for this rank - if ((cnt_files-1)%stride != offset) { - continue; - } - - std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing - std::string filename; - size_t included_samples; - size_t excluded_samples; - std::unordered_set excluded_sample_indices; - - sstr >> filename >> included_samples >> excluded_samples; - - const std::string conduit_file_path = add_delimiter(m_header.get_file_dir()) + filename; - - if (filename.empty() || !check_if_file_exists(conduit_file_path)) { - LBANN_ERROR(std::string{} + " :: data file '" + conduit_file_path + "' does not exist."); - } - - excluded_sample_indices.reserve(excluded_samples); - - while(!sstr.eof()) { - std::string index; - sstr >> index; - excluded_sample_indices.insert(index); - } - - if(excluded_sample_indices.size() != excluded_samples) { - LBANN_ERROR(std::string("Index file does not contain the correct number of excluded samples: expected ") - + std::to_string(excluded_samples) - + std::string(" exclusions but found ") - + std::to_string(excluded_sample_indices.size())); - } - - std::vector sample_names; - hid_t hdf5_file_hnd = get_conduit_bundle_samples(conduit_file_path, sample_names, included_samples, excluded_samples); - if(hdf5_file_hnd <= static_cast(0)) { - continue; // skipping the file - } - - if(m_file_map.count(filename) > 0) { - if(sample_names.size() != m_file_map[filename]) { - LBANN_ERROR(std::string("The same file ") - + filename - + " was opened multiple times and reported different sizes: " - + std::to_string(sample_names.size()) - + " and " - + std::to_string(m_file_map[filename])); - } - }else { - m_file_map[filename] = sample_names.size(); - } - - sample_file_id_t index = m_file_id_stats_map.size(); - m_file_id_stats_map.emplace_back(std::make_tuple(filename, 0, std::deque>{})); - set_files_hdf5_handle(filename, hdf5_file_hnd); - - size_t valid_sample_count = 0u; - for(auto s : sample_names) { - std::unordered_set::const_iterator found = excluded_sample_indices.find(s); - if (found != excluded_sample_indices.cend()) { - continue; - } - m_sample_list.emplace_back(index, s); - valid_sample_count++; - } - - if(valid_sample_count != included_samples) { - LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") - + std::to_string(included_samples) - + std::string(" samples, but found ") - + 
std::to_string(valid_sample_count)); - } - } - - if (m_header.get_num_files() != cnt_files) { - LBANN_ERROR(std::string("Sample list ") - + m_header.get_sample_list_filename() - + std::string(": number of files requested ") - + std::to_string(m_header.get_num_files()) - + std::string(" does not equal number of files loaded ") - + std::to_string(cnt_files)); - } - - m_header.m_is_exclusive = false; -} - - -inline void sample_list_jag::read_inclusive_list(std::istream& istrm, size_t stride, size_t offset) { - const std::string whitespaces(" \t\f\v\n\r"); - size_t cnt_files = 0u; - std::string line; - - while (std::getline(istrm, line)) { - const size_t end_of_str = line.find_last_not_of(whitespaces); - if (end_of_str == std::string::npos) { // empty line - continue; - } - if (cnt_files++ >= m_header.get_num_files()) { - break; - } - // Check to see if there is a strided load and skip the lines that are not for this rank - if ((cnt_files-1)%stride != offset) { - continue; - } - - std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing - std::string filename; - size_t included_samples; - size_t excluded_samples; - - sstr >> filename >> included_samples >> excluded_samples; - - const std::string conduit_file_path = add_delimiter(m_header.get_file_dir()) + filename; - - if (filename.empty() || !check_if_file_exists(conduit_file_path)) { - throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) - + " :: data file '" + filename + "' does not exist."); - } - - std::vector sample_names; - hid_t hdf5_file_hnd = get_conduit_bundle_samples(conduit_file_path, sample_names, included_samples, excluded_samples); - if(hdf5_file_hnd <= static_cast(0)) { - continue; // skipping the file - } - - if(m_file_map.count(filename) > 0) { - if(sample_names.size() != m_file_map[filename]) { - LBANN_ERROR(std::string("The same file ") - + filename - + " was opened multiple times and reported different sizes: " - + std::to_string(sample_names.size()) - + " and " - + std::to_string(m_file_map[filename])); - } - }else { - m_file_map[filename] = sample_names.size(); - } - - std::unordered_set set_of_samples(sample_names.begin(), sample_names.end()); - - sample_file_id_t index = m_file_id_stats_map.size(); - m_file_id_stats_map.emplace_back(std::make_tuple(filename, 0, std::deque>{})); - set_files_hdf5_handle(filename, hdf5_file_hnd); - - size_t valid_sample_count = 0u; - while(!sstr.eof()) { - std::string sample_name;; - sstr >> sample_name; - std::unordered_set::const_iterator found = set_of_samples.find(sample_name); - if (found == set_of_samples.cend()) { - LBANN_ERROR(std::string("Illegal request for a data ID that does not exist: ") + sample_name); - } - m_sample_list.emplace_back(index, sample_name); - valid_sample_count++; - } - if(valid_sample_count != included_samples) { - LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") - + std::to_string(included_samples) - + std::string(" samples, but found ") - + std::to_string(valid_sample_count)); - } - } - - if (m_header.get_num_files() != cnt_files) { - LBANN_ERROR(std::string("Sample list number of files requested ") - + std::to_string(m_header.get_num_files()) - + std::string(" does not equal number of files loaded ") - + std::to_string(cnt_files)); - } -} - - -inline size_t sample_list_jag::get_samples_per_file(std::istream& istrm, const std::string& filename, size_t stride, size_t offset) { - m_header = read_header(istrm, filename); - 
m_sample_list.reserve(m_header.get_sample_count()); - - if (m_header.is_exclusive()) { - read_exclusive_list(istrm, stride, offset); - } else { - read_inclusive_list(istrm, stride, offset); - } - - if(stride == 1 && m_header.get_sample_count() != m_sample_list.size()) { - LBANN_ERROR(std::string("Sample list count ") - + std::to_string(m_header.get_sample_count()) - + std::string(" does not equal sample list size ") - + std::to_string(m_sample_list.size())); - } - - return m_sample_list.size(); -} - - -inline void sample_list_jag::all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm) { - int size_of_list_archive = archive.size(); - std::vector packed_sizes(comm.get_procs_per_trainer()); - - comm.trainer_all_gather(size_of_list_archive, packed_sizes); - - int total_packed_size = 0; - std::vector displ; - displ.assign(comm.get_procs_per_trainer()+1, 0); - - for (size_t i = 0u; i < packed_sizes.size(); ++i) { - const auto sz = packed_sizes[i]; - displ[i+1] = displ[i] + sz; - } - total_packed_size = displ.back(); - - if (total_packed_size <= 0) { - return; - } - - std::string all_samples; - all_samples.resize(static_cast(total_packed_size)); - - std::vector local_data(archive.begin(), archive.end()); - std::vector packed_data(all_samples.begin(), all_samples.end()); - comm.trainer_all_gather(local_data, - packed_data, - packed_sizes, - displ); - - for (size_t i = 0u; i < packed_sizes.size(); ++i) { - std::string& buf = gathered_archive[i]; - const auto sz = packed_sizes[i]; - displ[i+1] = displ[i] + sz; - std::vector::const_iterator first = packed_data.begin() + displ[i]; - std::vector::const_iterator last = packed_data.begin() + displ[i] + sz; - buf.resize(sz); - buf.assign(first, last); - } - return; -} - -template -inline size_t sample_list_jag::all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm) { - std::string archive; - std::stringstream ss; - cereal::BinaryOutputArchive oarchive(ss); - oarchive(data); - archive = ss.str(); - - std::vector gathered_archive(comm.get_procs_per_trainer()); - - all_gather_archive(archive, gathered_archive, comm); - - std::vector per_rank_data(comm.get_procs_per_trainer()); - - size_t gathered_field_size = 0; - for (size_t i = 0u; i < gathered_archive.size(); ++i) { - std::string& buf = gathered_archive[i]; - T& tmp = gathered_data[i]; - - std::stringstream in_ss(buf); - cereal::BinaryInputArchive iarchive(in_ss); - iarchive(tmp); - gathered_field_size += tmp.size(); - } - return gathered_field_size; -} - -inline void sample_list_jag::all_gather_packed_lists(lbann_comm& comm) { - int num_ranks = comm.get_procs_per_trainer(); - std::vector per_rank_samples(num_ranks); - std::vector per_rank_file_id_stats_map(num_ranks); - std::vector> per_rank_file_map(num_ranks); - - // Close the existing open files - for(auto&& e : m_file_id_stats_map) { - if(std::get<1>(e) > 0) { - conduit::relay::io::hdf5_close_file(std::get<1>(e)); - std::get<1>(e) = 0; - } - std::get<2>(e).clear(); - } - m_open_fd_pq.clear(); - - size_t num_samples = all_gather_field(m_sample_list, per_rank_samples, comm); - size_t num_ids = all_gather_field(m_file_id_stats_map, per_rank_file_id_stats_map, comm); - size_t num_files = all_gather_field(m_file_map, per_rank_file_map, comm); - - m_sample_list.clear(); - m_file_id_stats_map.clear(); - - m_sample_list.reserve(num_samples); - m_file_id_stats_map.reserve(num_ids); - m_file_map.reserve(num_files); - - for(int r = 0; r < num_ranks; r++) { - const samples_t& sample_list = 
per_rank_samples[r]; - const file_id_stats_v_t& file_id_stats_map = per_rank_file_id_stats_map[r]; - const std::unordered_map& file_map = per_rank_file_map[r]; - for (const auto& s : sample_list) { - sample_file_id_t index = s.first; - const std::string& filename = std::get<0>(file_id_stats_map[index]); - if(index >= m_file_id_stats_map.size() - || (std::get<0>(m_file_id_stats_map.back()) != filename)) { - index = m_file_id_stats_map.size(); - m_file_id_stats_map.emplace_back(std::make_tuple(filename, 0, std::deque>{})); - // Update the file map structure - if(m_file_map.count(filename) == 0) { - m_file_map[filename] = file_map.at(filename); - } - }else { - for(size_t i = 0; i < m_file_id_stats_map.size(); i++) { - if(filename == std::get<0>(m_file_id_stats_map[i])) { - index = i; - break; - } - } - } - m_sample_list.emplace_back(std::make_pair(index, s.second)); - } - } - - return; -} - -inline void sample_list_jag::compute_epochs_file_usage(const std::vector& shuffled_indices, int mini_batch_size, const lbann_comm& comm) { - for (auto&& e : m_file_id_stats_map) { - if(std::get<1>(e) > 0) { - conduit::relay::io::hdf5_close_file(std::get<1>(e)); - } - std::get<1>(e) = 0; - std::get<2>(e).clear(); - } - // Once all of the file handles are closed, clear the priority queue - m_open_fd_pq.clear(); - - for (size_t i = 0; i < shuffled_indices.size(); i++) { - int idx = shuffled_indices[i]; - const auto& s = m_sample_list[idx]; - sample_file_id_t index = s.first; - - if((i % mini_batch_size) % comm.get_procs_per_trainer() == static_cast(comm.get_rank_in_trainer())) { - /// Enqueue the iteration step when the sample will get used - int step = i / mini_batch_size; - int substep = (i % mini_batch_size) / comm.get_procs_per_trainer(); - std::get<2>(m_file_id_stats_map[index]).emplace_back(std::make_pair(step, substep)); - } - } -} - -inline void sample_list_jag::clear() { - m_sample_list.clear(); -} - -template void sample_list_jag::serialize( Archive & ar ) { - ar(m_header, m_sample_list, m_file_id_stats_map); -} - -inline void sample_list_jag::write_header(std::string& sstr, size_t num_files) const { - // The first line indicate if the list is exclusive or inclusive - // The next line contains the number of samples and the number of files, which are the same in this caes - // The next line contains the root data file directory - - sstr += (m_header.is_exclusive()? 
conduit_hdf5_exclusion_list + "\n" : conduit_hdf5_inclusion_list + "\n"); - /// Include the number of invalid samples, which for an inclusive index list is always 0 - sstr += std::to_string(m_sample_list.size()) + " 0 " + std::to_string(num_files) + '\n'; - sstr += m_header.get_file_dir() + '\n'; -} - - -inline bool sample_list_jag::to_string(std::string& sstr) const { - std::map> tmp_file_map; - for (const auto& s : m_sample_list) { - std::string filename = std::get<0>(m_file_id_stats_map[s.first]); - tmp_file_map[filename].emplace_back(s.second); - } - - samples_t::const_iterator it_begin = m_sample_list.cbegin(); - samples_t::const_iterator it_end = m_sample_list.cbegin(); - - sstr.clear(); - - // reserve the string to hold the entire sample lit - size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1; - if (it_begin < it_end) { - estimated_len += tmp_file_map.size(); - sstr.reserve(estimated_len); - } - - // write the list header - write_header(sstr, tmp_file_map.size()); - - // write the list body - for (const auto& f : tmp_file_map) { - // File name - sstr += f.first; - // Number of included samples - sstr += std::string(" ") + std::to_string(f.second.size()); - // Number of excluded samples - sstr += std::string(" ") + std::to_string(m_file_map.at(f.first) - f.second.size()); - // Inclusion sample list - for (const auto& s : f.second) { - sstr += ' ' + s; - } - sstr += '\n'; - } - - return true; -} - -inline void sample_list_jag::write(const std::string filename) const { - std::string dir, basename; - parse_path(filename, dir, basename); - if (!dir.empty() && !check_if_dir_exists(dir)) { - // The creation of a shared directory must be done once in a coordinated fashion - // among the entities that have access to it. Thus, it must be done in advance - std::cerr << "The sample list output directory (" + dir + ") does not exist" << std::endl; - return; - } - - std::fstream ofs(filename, std::fstream::out | std::fstream::binary); - - if (!ofs.good()) { - return; - } - - std::string buf; - to_string(buf); - - ofs.write(buf.data(), buf.size()*sizeof(std::string::value_type)); - ofs.close(); -} - -inline const sample_list_jag::samples_t& sample_list_jag::get_list() const { - return m_sample_list; -} - -inline const sample_list_header& sample_list_jag::get_header() const { - return m_header; -} - -inline const sample_list_jag::sample_t& sample_list_jag::operator[](size_t idx) const { - return m_sample_list[idx]; -} - -} // end of namespace lbann diff --git a/include/lbann/data_readers/sample_list_open_files.hpp b/include/lbann/data_readers/sample_list_open_files.hpp new file mode 100644 index 00000000000..a7b4b81a278 --- /dev/null +++ b/include/lbann/data_readers/sample_list_open_files.hpp @@ -0,0 +1,147 @@ +#ifndef __SAMPLE_LIST_OPEN_FILES_HPP__ +#define __SAMPLE_LIST_OPEN_FILES_HPP__ + +#include "sample_list.hpp" + +/// Number of system and other files that may be open during execution +#define LBANN_MAX_OPEN_FILE_MARGIN 128 +#define LBANN_MAX_OPEN_FILE_RETRY 3 + +namespace lbann { + +template +class sample_list_open_files : public sample_list { + public: + /// The type for the index assigned to each sample file + using sample_file_id_t = std::size_t; + /** To describe a sample as a pair of the file to which it belongs and its name + Each file may contain multiple samples. 
*/
+  using sample_t = std::pair<sample_file_id_t, sample_name_t>;
+  /// Information for each file used by the sample list: includes the file name, file descriptor,
+  /// and a queue of each step and substep when data will be loaded from the file
+  using file_id_stats_t = std::tuple<std::string, file_handle_t, std::deque<std::pair<int,int>>>;
+
+  /// Type for the list of samples
+  using samples_t = std::template vector< sample_t >;
+  /// Mapping of the file index to the statistics for each file
+  using file_id_stats_v_t = std::vector< file_id_stats_t >; // rename to sample_to_file_v or something
+  /// Type for the map of file descriptors to usage step and substep
+  using fd_use_map_t = std::template pair<sample_file_id_t, std::pair<int,int>>;
+
+  sample_list_open_files();
+  virtual ~sample_list_open_files();
+  /** Copy constructor replicates all the member variables as they are except
+   * the file information vector, for which only the file name is copied. */
+  sample_list_open_files(const sample_list_open_files& rhs);
+  /** The assignment operation replicates all the member variables as they are except
+   * the file information vector, for which only the file name is copied. */
+  sample_list_open_files& operator=(const sample_list_open_files& rhs);
+  sample_list_open_files& copy(const sample_list_open_files& rhs);
+
+  void copy_members(const sample_list_open_files& rhs);
+
+  /// Tells how many samples are in the list
+  size_t size() const override;
+
+  /// Tells how many sample files there are
+  size_t get_num_files() const override;
+
+  using sample_list<sample_name_t>::load;
+  /// Emit a serialized archive using the cereal library
+  template <class Archive> void save( Archive & ar ) const;
+  /// Restore the member variables from a given archive serialized by the cereal library
+  template <class Archive> void load( Archive & ar );
+
+  /// Serialize this sample list into an std::string object
+  bool to_string(std::string& sstr) const override;
+
+  /// Allow read-only access to the internal list data
+  const samples_t& get_list() const;
+
+  /// Allow read-only access to the metadata of the idx-th sample in the list
+  const sample_t& operator[](size_t idx) const;
+
+  const std::string& get_samples_filename(sample_file_id_t id) const override;
+
+  file_handle_t get_samples_file_handle(sample_file_id_t id) const;
+
+  void set_files_handle(const std::string& filename, file_handle_t h);
+
+  void delete_file_handle_pq_entry(sample_file_id_t id);
+
+  void manage_open_file_handles(sample_file_id_t id, bool pre_open_fd = false);
+
+  file_handle_t open_samples_file_handle(const size_t i, bool pre_open_fd = false);
+
+  virtual void close_if_done_samples_file_handle(const size_t i);
+
+  void compute_epochs_file_usage(const std::vector<int>& shuffled_indices, int mini_batch_size, const lbann_comm& comm);
+
+  virtual bool is_file_handle_valid(const file_handle_t& h) const = 0;
+
+  void all_gather_packed_lists(lbann_comm& comm) override;
+
+ protected:
+
+  void set_samples_filename(sample_file_id_t id, const std::string& filename) override;
+
+  /// Get the list of samples from a specific type of bundle file
+  virtual void obtain_sample_names(file_handle_t& h, std::vector<std::string>& sample_names) const = 0;
+
+  /// Get the list of samples that exist in a bundle file
+  file_handle_t get_bundled_sample_names(std::string file_path, std::vector<std::string>& sample_names, size_t included_samples, size_t excluded_samples);
+
+  /// read the body of an exclusive sample list
+  void read_exclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0);
+
+  /// read the body of an inclusive sample list
+  void read_inclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0);
+
+  /// read the body of a sample list
+  void read_sample_list(std::istream& istrm, size_t stride=1, size_t offset=0) override;
+
+  void assign_samples_name() override {}
+
+  /// Get the number of total/included/excluded samples
+  void get_num_samples(size_t& total, size_t& included, size_t& excluded) const override;
+
+  static bool pq_cmp(fd_use_map_t left, fd_use_map_t right) {
+    return ((left.second).first < (right.second).first) ||
+           (((left.second).first == (right.second).first) &&
+            ((left.second).second < (right.second).second)); }
+
+  virtual file_handle_t open_file_handle_for_read(const std::string& file_path) = 0;
+  virtual void close_file_handle(file_handle_t& h) = 0;
+  virtual void clear_file_handle(file_handle_t& h) = 0;
+
+ private:
+  using sample_list<sample_name_t>::serialize;
+  template <class Archive> void serialize( Archive & ar ) = delete;
+
+ protected:
+  using sample_list<sample_name_t>::m_header;
+
+  /// Maps each sample's file id to the file name, file descriptor, and use counts
+  file_id_stats_v_t m_file_id_stats_map;
+
+ private:
+  /// List of all samples with a file identifier and sample name for each sample
+  samples_t m_sample_list;
+
+  /// Track the number of samples per file
+  std::unordered_map<std::string, size_t> m_file_map;
+
+  /// Track the number of open file descriptors and when they will be used next
+  std::deque<fd_use_map_t> m_open_fd_pq;
+
+  size_t m_max_open_files;
+};
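The five pure-virtual hooks above (is_file_handle_valid, obtain_sample_names, open_file_handle_for_read, close_file_handle, clear_file_handle) are what a concrete sample list must supply for its bundle format. As a rough, illustrative sketch only -- not part of this patch; the class name posix_fd_sample_list and the choice of a raw POSIX descriptor as file_handle_t are invented for the example:

    #include <fcntl.h>    // ::open
    #include <unistd.h>   // ::close
    #include <string>
    #include <vector>

    // Hypothetical subclass: file_handle_t is a plain POSIX descriptor (int).
    class posix_fd_sample_list : public lbann::sample_list_open_files<std::string, int> {
     protected:
      bool is_file_handle_valid(const int& fd) const override {
        return fd > 0;  // 0 is the "cleared" sentinel used below
      }
      int open_file_handle_for_read(const std::string& file_path) override {
        return ::open(file_path.c_str(), O_RDONLY);  // -1 on failure -> invalid
      }
      void close_file_handle(int& fd) override {
        if (fd > 0) { ::close(fd); }
      }
      void clear_file_handle(int& fd) override { fd = 0; }
      void obtain_sample_names(int& fd, std::vector<std::string>& names) const override {
        // Format-specific: enumerate the sample names stored in this bundle file.
      }
    };

    // Matching specialization consumed by copy_members() and load():
    namespace lbann {
    template <> inline int uninitialized_file_handle<int>() { return 0; }
    }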
+
+template <typename T>
+inline T uninitialized_file_handle();
+
+} // end of namespace
+
+#include "sample_list_open_files_impl.hpp"
+
+#endif // __SAMPLE_LIST_OPEN_FILES_HPP__
diff --git a/include/lbann/data_readers/sample_list_open_files_impl.hpp b/include/lbann/data_readers/sample_list_open_files_impl.hpp
new file mode 100644
index 00000000000..4ad36be3b3b
--- /dev/null
+++ b/include/lbann/data_readers/sample_list_open_files_impl.hpp
@@ -0,0 +1,682 @@
+namespace lbann {
+
+template <typename sample_name_t, typename file_handle_t>
+inline sample_list_open_files<sample_name_t, file_handle_t>::sample_list_open_files() {
+  m_max_open_files = getdtablesize() - LBANN_MAX_OPEN_FILE_MARGIN;
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline sample_list_open_files<sample_name_t, file_handle_t>::~sample_list_open_files() {
+  m_open_fd_pq.clear();
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline sample_list_open_files<sample_name_t, file_handle_t>
+::sample_list_open_files(const sample_list_open_files& rhs) {
+  copy_members(rhs);
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline sample_list_open_files<sample_name_t, file_handle_t>&
+sample_list_open_files<sample_name_t, file_handle_t>
+::operator=(const sample_list_open_files& rhs) {
+  // check for self-assignment
+  if (this == &rhs) {
+    return (*this);
+  }
+
+  copy_members(rhs);
+
+  return (*this);
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline sample_list_open_files<sample_name_t, file_handle_t>&
+sample_list_open_files<sample_name_t, file_handle_t>
+::copy(const sample_list_open_files& rhs) {
+  // check for self-assignment
+  if (this == &rhs) {
+    return (*this);
+  }
+
+  copy_members(rhs);
+
+  return (*this);
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline void sample_list_open_files<sample_name_t, file_handle_t>
+::copy_members(const sample_list_open_files& rhs) {
+  sample_list<sample_name_t>::copy_members(rhs);
+  m_sample_list = rhs.m_sample_list;
+  m_file_map = rhs.m_file_map;
+  m_max_open_files = rhs.m_max_open_files;
+
+  /// Keep track of existing filenames but do not copy any file
+  /// descriptor information
+  m_file_id_stats_map.assign(rhs.m_file_id_stats_map.size(),
+                             std::make_tuple("",
+                                             uninitialized_file_handle<file_handle_t>(),
+                                             std::deque<std::pair<int,int>>{}));
+
+  for(size_t i = 0u; i < m_file_id_stats_map.size(); ++i) {
+    set_samples_filename(i, rhs.get_samples_filename(i));
+  }
+
+  /// Do not copy the open file descriptor priority queue
+  /// File handle ownership is not transferred in the copy
+  m_open_fd_pq.clear();
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline size_t sample_list_open_files<sample_name_t, file_handle_t>
+::size() const {
+  return m_sample_list.size();
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline size_t sample_list_open_files<sample_name_t, file_handle_t>
+::get_num_files() const {
+  return 
m_file_id_stats_map.size(); +} + +template +inline void sample_list_open_files +::read_exclusive_list(std::istream& istrm, + size_t stride, size_t offset) { + const std::string whitespaces(" \t\f\v\n\r"); + size_t cnt_files = 0u; + std::string line; + + while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + if (cnt_files++ >= m_header.get_num_files()) { + break; + } + // Check to see if there is a strided load and skip the lines that are not for this rank + if ((cnt_files-1)%stride != offset) { + continue; + } + + std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing + std::string filename; + size_t included_samples; + size_t excluded_samples; + std::unordered_set excluded_sample_indices; + + sstr >> filename >> included_samples >> excluded_samples; + + const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; + + if (filename.empty() || !check_if_file_exists(file_path)) { + LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' does not exist."); + } + + excluded_sample_indices.reserve(excluded_samples); + + while(!sstr.eof()) { + std::string index; + sstr >> index; + excluded_sample_indices.insert(index); + } + + if(excluded_sample_indices.size() != excluded_samples) { + LBANN_ERROR(std::string("Index file does not contain the correct number of excluded samples: expected ") + + std::to_string(excluded_samples) + + std::string(" exclusions but found ") + + std::to_string(excluded_sample_indices.size())); + } + + std::vector sample_names; + file_handle_t file_hnd = get_bundled_sample_names(file_path, sample_names, included_samples, excluded_samples); + if (!is_file_handle_valid(file_hnd)) { + continue; // skipping the file + } + + if(m_file_map.count(filename) > 0) { + if(sample_names.size() != m_file_map[filename]) { + LBANN_ERROR(std::string("The same file ") + + filename + + " was opened multiple times and reported different sizes: " + + std::to_string(sample_names.size()) + + " and " + + std::to_string(m_file_map[filename])); + } + }else { + m_file_map[filename] = sample_names.size(); + } + + sample_file_id_t index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(std::make_tuple(filename, uninitialized_file_handle(), std::deque>{})); + set_files_handle(filename, file_hnd); + + size_t valid_sample_count = 0u; + for(auto s : sample_names) { + std::unordered_set::const_iterator found = excluded_sample_indices.find(s); + if (found != excluded_sample_indices.cend()) { + continue; + } + m_sample_list.emplace_back(index, to_sample_name_t(s)); + valid_sample_count++; + } + + if(valid_sample_count != included_samples) { + LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") + + std::to_string(included_samples) + + std::string(" samples, but found ") + + std::to_string(valid_sample_count)); + } + } + + if (m_header.get_num_files() != cnt_files) { + LBANN_ERROR(std::string("Sample list ") + + m_header.get_sample_list_filename() + + std::string(": number of files requested ") + + std::to_string(m_header.get_num_files()) + + std::string(" does not equal number of files loaded ") + + std::to_string(cnt_files)); + } + + m_header.m_is_exclusive = false; +} + + +template +inline void sample_list_open_files +::read_inclusive_list(std::istream& istrm, + size_t stride, size_t offset) { + const std::string whitespaces(" \t\f\v\n\r"); + size_t 
cnt_files = 0u; + std::string line; + + while (std::getline(istrm, line)) { + const size_t end_of_str = line.find_last_not_of(whitespaces); + if (end_of_str == std::string::npos) { // empty line + continue; + } + if (cnt_files++ >= m_header.get_num_files()) { + break; + } + // Check to see if there is a strided load and skip the lines that are not for this rank + if ((cnt_files-1)%stride != offset) { + continue; + } + + std::stringstream sstr(line.substr(0, end_of_str + 1)); // clear trailing spaces for accurate parsing + std::string filename; + size_t included_samples; + size_t excluded_samples; + + sstr >> filename >> included_samples >> excluded_samples; + + const std::string file_path = add_delimiter(m_header.get_file_dir()) + filename; + + if (filename.empty() || !check_if_file_exists(file_path)) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + + " :: data file '" + filename + "' does not exist."); + } + + std::vector sample_names; + file_handle_t file_hnd = get_bundled_sample_names(file_path, sample_names, included_samples, excluded_samples); + if (!is_file_handle_valid(file_hnd)) { + continue; // skipping the file + } + + if(m_file_map.count(filename) > 0) { + if(sample_names.size() != m_file_map[filename]) { + LBANN_ERROR(std::string("The same file ") + + filename + + " was opened multiple times and reported different sizes: " + + std::to_string(sample_names.size()) + + " and " + + std::to_string(m_file_map[filename])); + } + }else { + m_file_map[filename] = sample_names.size(); + } + + std::unordered_set set_of_samples(sample_names.begin(), sample_names.end()); + + sample_file_id_t index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(std::make_tuple(filename, uninitialized_file_handle(), std::deque>{})); + set_files_handle(filename, file_hnd); + + size_t valid_sample_count = 0u; + while(!sstr.eof()) { + std::string sample_name_str; + sstr >> sample_name_str; + std::unordered_set::const_iterator found = set_of_samples.find(sample_name_str); + if (found == set_of_samples.cend()) { + LBANN_ERROR(std::string("Illegal request for a data ID that does not exist: ") + sample_name_str); + } + m_sample_list.emplace_back(index, to_sample_name_t(sample_name_str)); + valid_sample_count++; + } + if(valid_sample_count != included_samples) { + LBANN_ERROR(std::string("Bundle file does not contain the correct number of included samples: expected ") + + std::to_string(included_samples) + + std::string(" samples, but found ") + + std::to_string(valid_sample_count)); + } + } + + if (m_header.get_num_files() != cnt_files) { + LBANN_ERROR(std::string("Sample list number of files requested ") + + std::to_string(m_header.get_num_files()) + + std::string(" does not equal number of files loaded ") + + std::to_string(cnt_files)); + } +} + + +template +inline void sample_list_open_files +::read_sample_list(std::istream& istrm, size_t stride, size_t offset) { + if (m_header.is_exclusive()) { + read_exclusive_list(istrm, stride, offset); + } else { + read_inclusive_list(istrm, stride, offset); + } +} + + +template +template +void sample_list_open_files +::save( Archive & ar ) const { + using ar_file_stats_t = std::tuple>>; + std::vector file_stats; + file_stats.reserve(m_file_id_stats_map.size()); + for(auto&& e : m_file_id_stats_map) { + file_stats.emplace_back(std::make_tuple(std::get<0>(e), std::get<2>(e))); + } + ar(m_header, m_sample_list, file_stats); +} + +template +template +void sample_list_open_files +::load( Archive & ar ) { + using 
ar_file_stats_t = std::tuple<std::string, std::deque<std::pair<int,int>>>;
+  std::vector<ar_file_stats_t> file_stats;
+  ar(m_header, m_sample_list, file_stats);
+  m_file_id_stats_map.reserve(file_stats.size());
+  for(auto&& e : file_stats) {
+    //m_file_id_stats_map.emplace_back(std::make_tuple(std::get<0>(e), uninitialized_file_handle<file_handle_t>(), std::deque<std::pair<int,int>>{}));
+    m_file_id_stats_map.emplace_back(std::make_tuple(std::get<0>(e), uninitialized_file_handle<file_handle_t>(), std::get<1>(e)));
+    //m_file_id_stats_map.emplace_back(std::make_tuple(std::get<0>(e), file_handle_t(), std::get<1>(e)));
+  }
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline bool sample_list_open_files<sample_name_t, file_handle_t>
+::to_string(std::string& sstr) const {
+  std::map<std::string, std::vector<sample_name_t>> tmp_file_map;
+  for (const auto& s : m_sample_list) {
+    const std::string& filename = get_samples_filename(s.first);
+    tmp_file_map[filename].emplace_back(s.second);
+  }
+
+  sstr.clear();
+
+  // reserve the string to hold the entire sample list
+  size_t estimated_len = 30 + 42 + m_header.get_file_dir().size() + 1;
+  for (const auto& f : tmp_file_map) {
+    estimated_len += f.first.size()
+                   + std::to_string(f.second.size()).size()
+                   + std::to_string(m_file_map.at(f.first) - f.second.size()).size()
+                   + 3u;
+    for (const auto& s : f.second) {
+      estimated_len += lbann::to_string(s).size() + 1u;
+    }
+  }
+  sstr.reserve(estimated_len);
+
+  // write the list header
+  this->write_header(sstr, tmp_file_map.size());
+
+  // write the list body
+  for (const auto& f : tmp_file_map) {
+    // File name
+    sstr += f.first;
+    // Number of included samples
+    sstr += std::string(" ") + std::to_string(f.second.size());
+    // Number of excluded samples
+    sstr += std::string(" ") + std::to_string(m_file_map.at(f.first) - f.second.size());
+    // Inclusion sample list
+    for (const auto& s : f.second) {
+      sstr += ' ' + lbann::to_string(s);
+    }
+    sstr += '\n';
+  }
+
+  return true;
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline void sample_list_open_files<sample_name_t, file_handle_t>
+::get_num_samples(size_t& total, size_t& included, size_t& excluded) const {
+  total = 0u;
+  for (const auto& f : m_file_map) {
+    total += f.second;
+  }
+  included = size();
+  excluded = total - included;
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline const typename sample_list_open_files<sample_name_t, file_handle_t>::samples_t&
+sample_list_open_files<sample_name_t, file_handle_t>::get_list() const {
+  return m_sample_list;
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline const typename sample_list_open_files<sample_name_t, file_handle_t>::sample_t&
+sample_list_open_files<sample_name_t, file_handle_t>::operator[](size_t idx) const {
+  return m_sample_list[idx];
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline const std::string& sample_list_open_files<sample_name_t, file_handle_t>
+::get_samples_filename(sample_file_id_t id) const {
+  return std::get<0>(m_file_id_stats_map[id]);
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline file_handle_t sample_list_open_files<sample_name_t, file_handle_t>
+::get_samples_file_handle(sample_file_id_t id) const {
+  file_handle_t h = std::get<1>(m_file_id_stats_map[id]);
+  return h;
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline void sample_list_open_files<sample_name_t, file_handle_t>
+::set_samples_filename(sample_file_id_t id, const std::string& filename) {
+  std::get<0>(m_file_id_stats_map[id]) = filename;
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline void sample_list_open_files<sample_name_t, file_handle_t>
+::set_files_handle(const std::string& filename, file_handle_t h) {
+  sample_file_id_t id = sample_file_id_t(0);
+  for (auto&& e : m_file_id_stats_map) {
+    if(std::get<0>(e) == filename) {
+      std::get<1>(e) = h;
+      break;
+    }
+    id++;
+  }
+  manage_open_file_handles(id, true);
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline void sample_list_open_files<sample_name_t, file_handle_t>
+::obtain_sample_names(file_handle_t& h, std::vector<std::string>& sample_names) const {
+  LBANN_ERROR(std::string{} + " :: abstract class does not implement this method");
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline file_handle_t sample_list_open_files<sample_name_t, file_handle_t>
+::get_bundled_sample_names(std::string file_path,
+                           std::vector<std::string>& sample_names,
+                           size_t 
included_samples, + size_t excluded_samples) { + file_handle_t file_hnd; + clear_file_handle(file_hnd); + bool retry = false; + int retry_cnt = 0; + do { + try { + file_hnd = open_file_handle_for_read( file_path ); + }catch (conduit::Error const& e) { + LBANN_WARNING(" :: trying to open the file " + file_path + " and got " + e.what()); + retry = true; + retry_cnt++; + } + }while(retry && retry_cnt < LBANN_MAX_OPEN_FILE_RETRY); + + if (!is_file_handle_valid(file_hnd)) { + std::cout << "Opening the file didn't work" << std::endl; + return file_hnd; + } + + obtain_sample_names(file_hnd, sample_names); + + if(sample_names.size() != (included_samples + excluded_samples)) { + LBANN_ERROR(std::string("File does not contain the correct number of samples: found ") + + std::to_string(sample_names.size()) + + std::string(" -- this does not equal the expected number of samples that are marked for inclusion: ") + + std::to_string(included_samples) + + std::string(" and exclusion: ") + + std::to_string(excluded_samples)); + } + + return file_hnd; +} + +template +inline void sample_list_open_files +::all_gather_packed_lists(lbann_comm& comm) { + int num_ranks = comm.get_procs_per_trainer(); + typename std::vector per_rank_samples(num_ranks); + typename std::vector> per_rank_files(num_ranks); + std::vector my_files; + my_files.reserve(m_file_id_stats_map.size()); + std::vector> per_rank_file_map(num_ranks); + + // Close the existing open files + for(auto&& e : m_file_id_stats_map) { + auto& h = std::get<1>(e); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(e).clear(); + my_files.emplace_back(std::get<0>(e)); + } + m_open_fd_pq.clear(); + + size_t num_samples = this->all_gather_field(m_sample_list, per_rank_samples, comm); + size_t num_ids = this->all_gather_field(my_files, per_rank_files, comm); + size_t num_files = this->all_gather_field(m_file_map, per_rank_file_map, comm); + + m_sample_list.clear(); + m_file_id_stats_map.clear(); + + m_sample_list.reserve(num_samples); + m_file_id_stats_map.reserve(num_ids); + m_file_map.reserve(num_files); + + for(int r = 0; r < num_ranks; r++) { + const samples_t& s_list = per_rank_samples[r]; + const auto& files = per_rank_files[r]; + const std::unordered_map& file_map = per_rank_file_map[r]; + for (const auto& s : s_list) { + sample_file_id_t index = s.first; + const std::string& filename = files[index]; + if(index >= m_file_id_stats_map.size() + || (std::get<0>(m_file_id_stats_map.back()) != filename)) { + index = m_file_id_stats_map.size(); + m_file_id_stats_map.emplace_back(std::make_tuple(filename, uninitialized_file_handle(), std::deque>{})); + // Update the file map structure + if(m_file_map.count(filename) == 0) { + m_file_map[filename] = file_map.at(filename); + } + }else { + for(size_t i = 0; i < m_file_id_stats_map.size(); i++) { + if(filename == get_samples_filename(i)) { + index = i; + break; + } + } + } + m_sample_list.emplace_back(std::make_pair(index, s.second)); + } + } + + return; +} + +template +inline void sample_list_open_files +::compute_epochs_file_usage(const std::vector& shuffled_indices, + int mini_batch_size, + const lbann_comm& comm) { + for (auto&& e : m_file_id_stats_map) { + auto& h = std::get<1>(e); + close_file_handle(h); + clear_file_handle(h); + std::get<2>(e).clear(); + } + // Once all of the file handles are closed, clear the priority queue + m_open_fd_pq.clear(); + for (size_t i = 0; i < shuffled_indices.size(); i++) { + int idx = shuffled_indices[i]; + const auto& s = m_sample_list[idx]; + sample_file_id_t index 
= s.first;
+
+    if((i % mini_batch_size) % comm.get_procs_per_trainer() == static_cast<size_t>(comm.get_rank_in_trainer())) {
+      /// Enqueue the iteration step when the sample will get used
+      int step = i / mini_batch_size;
+      int substep = (i % mini_batch_size) / comm.get_procs_per_trainer();
+      std::get<2>(m_file_id_stats_map[index]).emplace_back(std::make_pair(step, substep));
+    }
+  }
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline void sample_list_open_files<sample_name_t, file_handle_t>
+::delete_file_handle_pq_entry(sample_file_id_t id) {
+  for (typename std::deque<fd_use_map_t>::iterator it = m_open_fd_pq.begin(); it!=m_open_fd_pq.end(); ++it) {
+    if(it->first == id) {
+      it = m_open_fd_pq.erase(it);
+      break;
+    }
+  }
+  return;
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline void sample_list_open_files<sample_name_t, file_handle_t>
+::manage_open_file_handles(sample_file_id_t id, bool pre_open_fd) {
+  /// When we enter this function the priority queue is either empty or a heap
+  if(!m_open_fd_pq.empty()) {
+    if(m_open_fd_pq.size() > m_max_open_files) {
+      auto& f = m_open_fd_pq.front();
+      auto& victim = m_file_id_stats_map[f.first];
+      auto& victim_fd = std::get<1>(victim);
+      std::pop_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp);
+      m_open_fd_pq.pop_back();
+      close_file_handle(victim_fd);
+      clear_file_handle(victim_fd);
+    }
+  }
+
+  /// Before enqueueing any new access times for this descriptor, remove any
+  /// earlier entry for it
+  std::sort_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp);
+  if(m_open_fd_pq.front().first == id) {
+    m_open_fd_pq.pop_front();
+  }
+  std::make_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp);
+
+  auto& e = m_file_id_stats_map[id];
+  auto& file_access_queue = std::get<2>(e);
+  if(!file_access_queue.empty()) {
+    if(!pre_open_fd) {
+      file_access_queue.pop_front();
+    }
+  }
+  if(!file_access_queue.empty()) {
+    m_open_fd_pq.emplace_back(std::make_pair(id,file_access_queue.front()));
+  }else {
+    /// If there is no future access of the file, place a terminator entry that
+    /// tracks the open file but always sorts to the top of the heap
+    m_open_fd_pq.emplace_back(std::make_pair(id,std::make_pair(INT_MAX,id)));
+  }
+  std::push_heap(m_open_fd_pq.begin(), m_open_fd_pq.end(), pq_cmp);
+  return;
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline file_handle_t sample_list_open_files<sample_name_t, file_handle_t>
+::open_samples_file_handle(const size_t i, bool pre_open_fd) {
+  const sample_t& s = m_sample_list[i];
+  sample_file_id_t id = s.first;
+  file_handle_t h = get_samples_file_handle(id);
+  if (!is_file_handle_valid(h)) {
+    const std::string& file_name = get_samples_filename(id);
+    const std::string& file_dir = this->get_samples_dirname();
+    const std::string file_path = add_delimiter(file_dir) + file_name;
+    if (file_name.empty() || !check_if_file_exists(file_path)) {
+      LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' does not exist.");
+    }
+    bool retry = false;
+    int retry_cnt = 0;
+    do {
+      try {
+        h = open_file_handle_for_read( file_path );
+      }catch (conduit::Error const& e) {
+        LBANN_WARNING(" :: trying to open the file " + file_path + " and got " + e.what());
+        retry = true;
+        retry_cnt++;
+      }
+    }while(retry && retry_cnt < LBANN_MAX_OPEN_FILE_RETRY);
+
+    if (!is_file_handle_valid(h)) {
+      LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' could not be opened.");
+    }
+    auto& e = m_file_id_stats_map[id];
+    std::get<1>(e) = h;
+    /// If a new file is opened, place it in the priority queue
+    manage_open_file_handles(id, pre_open_fd);
+  }
+  return h;
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline void sample_list_open_files<sample_name_t, file_handle_t>
+::close_if_done_samples_file_handle(const size_t i) {
+  const sample_t& s = m_sample_list[i];
+  sample_file_id_t id = s.first;
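+  // The per-file access schedule recorded by compute_epochs_file_usage() drives
+  // the logic below: once no (step, substep) uses remain queued for this file,
+  // its descriptor is released and its entry leaves the open-descriptor queue.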
+  auto h = get_samples_file_handle(id);
+  if (!is_file_handle_valid(h)) {
+    auto& e = m_file_id_stats_map[id];
+    auto& file_access_queue = std::get<2>(e);
+    if(file_access_queue.empty()) {
+      auto& fh = std::get<1>(e);
+      close_file_handle(fh);
+      clear_file_handle(fh);
+      delete_file_handle_pq_entry(id);
+    }
+  }
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline bool sample_list_open_files<sample_name_t, file_handle_t>
+::is_file_handle_valid(const file_handle_t& h) const {
+  LBANN_ERROR(std::string{} + " :: abstract class does not implement this method");
+  return false;
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline file_handle_t sample_list_open_files<sample_name_t, file_handle_t>
+::open_file_handle_for_read(const std::string& file_path) {
+  LBANN_ERROR(std::string{} + " :: abstract class does not implement this method");
+  return file_handle_t();
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline void sample_list_open_files<sample_name_t, file_handle_t>
+::close_file_handle(file_handle_t& h) {
+  LBANN_ERROR(std::string{} + " :: abstract class does not implement this method");
+}
+
+template <typename sample_name_t, typename file_handle_t>
+inline void sample_list_open_files<sample_name_t, file_handle_t>
+::clear_file_handle(file_handle_t& h) {
+  LBANN_ERROR(std::string{} + " :: abstract class does not implement this method");
+}
+
+} // end of namespace lbann
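Taken together, the intended call sequence for a derived list mirrors what the data_reader_jag_conduit changes below adopt: open (or re-use) the handle, read, then let the access schedule decide whether to close. A minimal sketch, assuming the hypothetical posix_fd_sample_list from the earlier example and a placeholder read_sample() helper:

    // After compute_epochs_file_usage() has filled each file's (step, substep) queue:
    void fetch_one(posix_fd_sample_list& list, size_t i) {
      // Opens the file if needed and registers it in the fd priority queue;
      // pre_open_fd = true keeps the head of the access queue intact.
      int fd = list.open_samples_file_handle(i, /*pre_open_fd=*/true);
      // ... read the i-th sample through fd, e.g. read_sample(fd, list[i].second) ...
      // Close only if the schedule shows no further use of this file.
      list.close_if_done_samples_file_handle(i);
    }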
diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp
index 3e91bcdc247..2764162b014 100644
--- a/src/data_readers/data_reader_jag_conduit.cpp
+++ b/src/data_readers/data_reader_jag_conduit.cpp
@@ -54,24 +54,9 @@
 #include 
 #include 
-// This macro may be moved to a global scope
-#define _THROW_LBANN_EXCEPTION_(_CLASS_NAME_,_MSG_) { \
-  std::stringstream _err; \
-  _err << __FILE__ << ' ' << __LINE__ << " :: " \
-       << (_CLASS_NAME_) << "::" << (_MSG_); \
-  throw lbann_exception(_err.str()); \
-}
-
-#define _THROW_LBANN_EXCEPTION2_(_CLASS_NAME_,_MSG1_,_MSG2_) { \
-  std::stringstream _err; \
-  _err << __FILE__ << ' ' << __LINE__ << " :: " \
-       << (_CLASS_NAME_) << "::" << (_MSG1_) << (_MSG2_); \
-  throw lbann_exception(_err.str()); \
-}
-
 // This comes after all the headers, and is only visible within the current implementation file.
 // To make sure, we put '#undef _CN_' at the end of this file
-#define _CN_ "data_reader_jag_conduit"
+#define _CN_ std::string("data_reader_jag_conduit")
 
 namespace lbann {
 
@@ -138,7 +123,7 @@ int data_reader_jag_conduit::compute_max_num_parallel_readers() {
     set_sample_stride(get_num_parallel_readers());
     set_iteration_stride(1);
   } else {
-    _THROW_LBANN_EXCEPTION_(get_type(), " unknown io_buffer type: " + m_io_buffer_type);
+    LBANN_ERROR(get_type() + ":: unknown io_buffer type: " + m_io_buffer_type);
   }
   return get_num_parallel_readers();
 }
@@ -273,7 +258,7 @@ void data_reader_jag_conduit::set_defaults() {
   m_scalar_normalization_params.clear();
   m_input_normalization_params.clear();
-  m_sample_list.clear();
+  //m_sample_list.clear();
   m_list_per_trainer = false;
   m_list_per_model = false;
 }
@@ -282,6 +267,31 @@ void data_reader_jag_conduit::setup(int num_io_threads, std::shared_ptrhas_path(path);
+}
+
+void data_reader_jag_conduit::read_node(const data_reader_jag_conduit::file_handle_t& h,
+                                        const std::string& path,
+                                        conduit::Node& n) const {
+  if (!h) {
+    return;
+  }
+  h->read(path, n);
+}
+#else
+bool data_reader_jag_conduit::has_path(const hid_t& h, const std::string& path) const {
+  return (m_sample_list.is_file_handle_valid(h) &&
+          conduit::relay::io::hdf5_has_path(h, path));
+}
+
+void data_reader_jag_conduit::read_node(const hid_t& h, const std::string& path, conduit::Node& n) const {
+  conduit::relay::io::hdf5_read(h, path, n);
+}
+#endif
+
 const conduit::Node& data_reader_jag_conduit::get_conduit_node(const conduit::Node& n_base, const std::string key) {
   return n_base[key];
 }
@@ -297,10 +307,10 @@ bool data_reader_jag_conduit::load_conduit_node(const size_t i, const std::strin
   const std::string path = sample_name + key;
 
   sample_file_id_t id = s.first;
-  hid_t h = m_sample_list.get_samples_hdf5_handle(id);
-  if (h <= static_cast<hid_t>(0) || !conduit::relay::io::hdf5_has_path(h, path)) {
+  auto h = m_sample_list.get_samples_file_handle(id);
+  if (!has_path(h, path)) {
+    const std::string& file_name = m_sample_list.get_samples_filename(id);
     if (m_data_store != nullptr) {
-      const std::string& file_name = m_sample_list.get_samples_filename(id);
       if (! 
m_data_store->is_preloaded()) { const conduit::Node obj = m_data_store->get_random_node(); node = obj["data"]; @@ -314,13 +324,11 @@ bool data_reader_jag_conduit::load_conduit_node(const size_t i, const std::strin <<" and key: " << key << "\n"; return false; } else { - if (h <= static_cast(0) ) { + if (!m_sample_list.is_file_handle_valid(h)) { LBANN_ERROR("failed to get file handle for file " + file_name); - } else if (!conduit::relay::io::hdf5_has_path(h, path)) { + } else { LBANN_ERROR("got file handle for file " + file_name + \ " but the path doesn't exist in the file: " + path); - } else { - LBANN_ERROR("it should not be possible to be here"); } } } @@ -328,8 +336,7 @@ bool data_reader_jag_conduit::load_conduit_node(const size_t i, const std::strin // this block fires if we cannot load a conduit node, either from file // or from the data_store else { - const std::string& file_name = m_sample_list.get_samples_filename(id); - if (h <= static_cast(0)) { + if (!m_sample_list.is_file_handle_valid(h)) { LBANN_ERROR(get_type() + ":: Cannot open file " + file_name + \ " in dir: " + m_sample_list.get_samples_dirname() + " for sample "+ sample_name + " ran_in_trainer: " \ @@ -347,9 +354,7 @@ bool data_reader_jag_conduit::load_conduit_node(const size_t i, const std::strin } } - /// @todo explore the possibility of putting the sample name in - /// node's hierarchy, e.g. node[sample_name] - conduit::relay::io::hdf5_read(h, path, node); + read_node(h, path, node); return true; } @@ -358,16 +363,16 @@ bool data_reader_jag_conduit::has_conduit_path(const size_t i, const std::string const sample_t& s = m_sample_list[i]; sample_file_id_t id = s.first; const std::string& sample_name = s.second; - const hid_t h = m_sample_list.get_samples_hdf5_handle(id); + const auto h = m_sample_list.get_samples_file_handle(id); const std::string path = sample_name + key; - if (h <= static_cast(0) || !conduit::relay::io::hdf5_has_path(h, path)) { + if (!has_path(h, path)) { const std::string& file_name = m_sample_list.get_samples_filename(id); - _THROW_LBANN_EXCEPTION_(get_type(), "Cannot open file " + file_name + \ - " for sample "+ sample_name); + LBANN_ERROR(get_type() + ":: Cannot open file " + file_name + \ + " for sample "+ sample_name); return false; } - return conduit::relay::io::hdf5_has_path(h, std::string("/") + sample_name + key); + return true; } @@ -386,7 +391,7 @@ void data_reader_jag_conduit::set_independent_variable_type( void data_reader_jag_conduit::add_independent_variable_type( const data_reader_jag_conduit::variable_t independent) { if (!(independent == JAG_Image || independent == JAG_Scalar || independent == JAG_Input)) { - _THROW_LBANN_EXCEPTION_(_CN_, "unrecognized independent variable type "); + LBANN_ERROR(_CN_ + ":: unrecognized independent variable type "); } m_independent.push_back(independent); } @@ -406,7 +411,7 @@ void data_reader_jag_conduit::set_dependent_variable_type( void data_reader_jag_conduit::add_dependent_variable_type( const data_reader_jag_conduit::variable_t dependent) { if (!(dependent == JAG_Image || dependent == JAG_Scalar || dependent == JAG_Input)) { - _THROW_LBANN_EXCEPTION_(_CN_, "unrecognized dependent variable type "); + LBANN_ERROR(_CN_ + ":: unrecognized dependent variable type "); } m_dependent.push_back(dependent); } @@ -427,7 +432,7 @@ void data_reader_jag_conduit::set_image_dims(const int width, const int height, m_image_height = height; m_image_num_channels = ch; } else if (!((width == 0) && (height == 0) && (ch == 1))) { // set but not valid - 
_THROW_LBANN_EXCEPTION_(_CN_, "set_image_dims() : invalid image dims"); + LBANN_ERROR(_CN_ + ":: set_image_dims() : invalid image dims"); } set_linearized_image_size(); } @@ -561,7 +566,7 @@ void data_reader_jag_conduit::check_image_data() { size_t first_idx = (m_sample_list[0]).first; if (!has_conduit_path(first_idx, "")) { - _THROW_LBANN_EXCEPTION_(_CN_, "check_image_data() : no sample by " + m_sample_list[first_idx].second); + LBANN_ERROR(_CN_ + ":: check_image_data() : no sample by " + m_sample_list[first_idx].second); return; } conduit::Node n_imageset; @@ -574,7 +579,7 @@ void data_reader_jag_conduit::check_image_data() { } for (const auto& emi_tag: m_emi_image_keys) { if (!has_conduit_path(first_idx, m_output_image_prefix + emi_tag)) { - _THROW_LBANN_EXCEPTION_(_CN_, "check_image_data() : no emi image by " + emi_tag); + LBANN_ERROR(_CN_ + ":: check_image_data() : no emi image by " + emi_tag); return; } } @@ -589,18 +594,18 @@ void data_reader_jag_conduit::check_image_data() { m_image_num_channels = 1; set_linearized_image_size(); } else { - std::string msg = "expected linearized emi image size: " + std::string msg = ":: expected linearized emi image size: " + std::to_string(emi.number_of_elements()) + '\n'; - _THROW_LBANN_EXCEPTION_(_CN_, msg + get_description()); + LBANN_ERROR(_CN_ + msg + get_description()); } } if (m_image_normalization_params.empty()) { m_image_normalization_params.assign(m_emi_image_keys.size()*m_image_num_channels, linear_transform_t(1.0, 0.0)); } else if (m_image_normalization_params.size() != static_cast(m_image_num_channels)) { - _THROW_LBANN_EXCEPTION_(_CN_, "Incorrect number of image normalization parameter sets!" \ - + std::to_string(m_image_normalization_params.size()) + " != " \ - + std::to_string(m_image_num_channels)); + LBANN_ERROR(_CN_ + ":: Incorrect number of image normalization parameter sets!" \ + + std::to_string(m_image_normalization_params.size()) + " != " \ + + std::to_string(m_image_num_channels)); } #if defined(LBANN_DEBUG) std::cout << "image normalization parameters: " << std::endl; @@ -660,15 +665,15 @@ void data_reader_jag_conduit::check_scalar_keys() { msg += ' ' + m_scalar_keys[i]; } } - _THROW_LBANN_EXCEPTION_(_CN_, "check_scalar_keys() : " + msg); + LBANN_ERROR(_CN_ + ":: check_scalar_keys() : " + msg); } if (m_scalar_normalization_params.empty()) { m_scalar_normalization_params.assign(m_scalar_keys.size(), linear_transform_t(1.0, 0.0)); } else if (m_scalar_normalization_params.size() != m_scalar_keys.size()) { - _THROW_LBANN_EXCEPTION_(_CN_, "Incorrect number of scalar normalization parameter sets! " \ - + std::to_string(m_scalar_normalization_params.size()) + " != " \ - + std::to_string(m_scalar_keys.size())); + LBANN_ERROR(_CN_ + ":: Incorrect number of scalar normalization parameter sets! " \ + + std::to_string(m_scalar_normalization_params.size()) + " != " \ + + std::to_string(m_scalar_keys.size())); } #if defined(LBANN_DEBUG) std::cout << "scalar normalization parameters: " << std::endl; @@ -731,7 +736,7 @@ void data_reader_jag_conduit::check_input_keys() { msg += ' ' + m_input_keys[i]; } } - _THROW_LBANN_EXCEPTION_(_CN_, "check_input_keys() : " + msg); + LBANN_ERROR(_CN_ + ":: check_input_keys() : " + msg); } m_uniform_input_type = (m_input_keys.size() == 0u)? 
false : is_input_t; @@ -739,9 +744,9 @@ void data_reader_jag_conduit::check_input_keys() { if (m_input_normalization_params.empty()) { m_input_normalization_params.assign(m_input_keys.size(), linear_transform_t(1.0, 0.0)); } else if (m_input_normalization_params.size() != m_input_keys.size()) { - _THROW_LBANN_EXCEPTION_(_CN_, "Incorrect number of input normalization parameter sets! " \ - + std::to_string(m_input_normalization_params.size()) + " != " \ - + std::to_string(m_input_keys.size())); + LBANN_ERROR(_CN_ + ":: Incorrect number of input normalization parameter sets! " \ + + std::to_string(m_input_normalization_params.size()) + " != " \ + + std::to_string(m_input_keys.size())); } #if defined(LBANN_DEBUG) std::cout << "input normalization parameters: " << std::endl; @@ -792,7 +797,7 @@ void data_reader_jag_conduit::load() { m_is_data_loaded = true; /// Open the first sample to make sure that all of the fields are correct - m_sample_list.open_samples_hdf5_handle(0, true); + m_sample_list.open_samples_file_handle(0, true); if (m_scalar_keys.size() == 0u) { set_all_scalar_choices(); // use all by default if none is specified @@ -806,7 +811,7 @@ void data_reader_jag_conduit::load() { check_image_data(); - m_sample_list.close_if_done_samples_hdf5_handle(0); + m_sample_list.close_if_done_samples_file_handle(0); } if(is_master()) { std::cout << "Done with data checking" << std::endl; @@ -870,7 +875,7 @@ void data_reader_jag_conduit::preload_data_store() { } try { work.reset(); - m_sample_list.open_samples_hdf5_handle(idx, true); + m_sample_list.open_samples_file_handle(idx, true); load_conduit_node(idx, key, work); conduit::Node & node = m_data_store->get_empty_node(idx); const std::string padded_idx = '/' + LBANN_DATA_ID_STR(idx); @@ -886,7 +891,7 @@ void data_reader_jag_conduit::preload_data_store() { if(m_data_store->get_index_owner(idx) != m_rank_in_model) { continue; } - m_sample_list.close_if_done_samples_hdf5_handle(idx); + m_sample_list.close_if_done_samples_file_handle(idx); } if (get_comm()->am_world_master() || (opts->get_bool("ltfb_verbose") && get_comm()->am_trainer_master())) { @@ -952,8 +957,8 @@ size_t data_reader_jag_conduit::get_linearized_size(const data_reader_jag_condui case JAG_Input: return get_linearized_input_size(); default: { // includes Unefined case - _THROW_LBANN_EXCEPTION2_(_CN_, "get_linearized_size() : ", \ - "unknown or undefined variable type"); + LBANN_ERROR(_CN_ + ":: get_linearized_size() : " \ + + "unknown or undefined variable type"); } } return 0u; @@ -1009,8 +1014,8 @@ const std::vector data_reader_jag_conduit::get_dims(const data_reader_jag_c case JAG_Input: return {static_cast(get_linearized_input_size())}; default: { // includes Undefined case - _THROW_LBANN_EXCEPTION2_(_CN_, "get_dims() : ", \ - "unknown or undefined variable type"); + LBANN_ERROR(_CN_ + ":: get_dims() : " \ + + "unknown or undefined variable type"); } } return {}; @@ -1074,7 +1079,7 @@ int data_reader_jag_conduit::get_linearized_size(const std::string& desc) const } else if (desc == "JAG_Input") { return get_linearized_size(JAG_Input); } else { - _THROW_LBANN_EXCEPTION_(_CN_, "get_linearized_size() : unknown key " + desc); + LBANN_ERROR(_CN_ + ":: get_linearized_size() : unknown key " + desc); } return generic_data_reader::get_linearized_size(desc); } @@ -1329,12 +1334,12 @@ bool data_reader_jag_conduit::fetch(CPUMat& X, int data_id, conduit::Node& sampl std::vector< std::vector > img_data(get_image_data(data_id, sample)); if (img_data.size() != num_images) { - 
_THROW_LBANN_EXCEPTION2_(_CN_, "fetch() : the number of images is not as expected ", \ - std::to_string(img_data.size()) + "!=" + std::to_string(num_images)); + LBANN_ERROR(_CN_ + ":: fetch() : the number of images is not as expected " \ + + std::to_string(img_data.size()) + "!=" + std::to_string(num_images)); } if (!m_split_channels && m_image_num_channels != 1) { - _THROW_LBANN_EXCEPTION2_(_CN_, "fetch() : transform pipeline now requires single channel images: num_channels=", \ - std::to_string(m_image_num_channels) + " split_channel=" + std::to_string(m_split_channels)); + LBANN_ERROR(_CN_ + ":: fetch() : transform pipeline now requires single channel images: num_channels=" \ + + std::to_string(m_image_num_channels) + " split_channel=" + std::to_string(m_split_channels)); } std::vector dims = {num_channels, static_cast(m_image_height), static_cast(m_image_width)}; @@ -1368,7 +1373,7 @@ bool data_reader_jag_conduit::fetch(CPUMat& X, int data_id, conduit::Node& sampl break; } default: { // includes Undefined case - _THROW_LBANN_EXCEPTION_(_CN_, "fetch_" + tag + "() : unknown or undefined variable type"); + LBANN_ERROR(_CN_ + ":: fetch_" + tag + "() : unknown or undefined variable type"); } } return true; @@ -1431,7 +1436,7 @@ bool data_reader_jag_conduit::fetch_datum(CPUMat& X, int data_id, int mb_idx) { const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id); node.set_external(ds_node); }else { - m_sample_list.open_samples_hdf5_handle(data_id); + m_sample_list.open_samples_file_handle(data_id); } for(size_t i = 0u; ok && (i < X_v.size()); ++i) { @@ -1444,7 +1449,7 @@ bool data_reader_jag_conduit::fetch_datum(CPUMat& X, int data_id, int mb_idx) { m_data_store->set_conduit_node(data_id, node); } - m_sample_list.close_if_done_samples_hdf5_handle(data_id); + m_sample_list.close_if_done_samples_file_handle(data_id); m_using_random_node.erase(m_io_thread_pool->get_local_thread_id()); return ok; } From d5f06d1177188711e31d0d8ef5f0973d22321397 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 17 Jun 2019 09:17:04 -0700 Subject: [PATCH 082/634] Add conduit include paths to wrapper interface target --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d689bc0817c..37f516aca5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -421,7 +421,8 @@ if (LBANN_WITH_CONDUIT) set_target_properties(conduit::conduit PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${HDF5_INCLUDE_DIRS}") + INTERFACE_INCLUDE_DIRECTORIES + "${CONDUIT_INCLUDE_DIRS}" "${HDF5_INCLUDE_DIRS}") endif () set_target_properties(conduit::conduit From df9436860a2aafc5144da5f8024beb11b16ba19f Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 17 Jun 2019 09:29:07 -0700 Subject: [PATCH 083/634] fix error in CMake syntax --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37f516aca5f..c7e5c777da2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -419,8 +419,8 @@ if (LBANN_WITH_CONDUIT) list(APPEND _conduit_interface_link_libs ${HDF5_LIBRARIES}) - set_target_properties(conduit::conduit - PROPERTIES + set_property(TARGET conduit::conduit + PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${CONDUIT_INCLUDE_DIRS}" "${HDF5_INCLUDE_DIRS}") endif () From 176e3b8a9ab8604c5ed281a55af2aca254b12a41 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Mon, 17 Jun 2019 09:41:35 -0700 Subject: [PATCH 084/634] remove trailing "/conduit" from include path --- CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c7e5c777da2..bd5e188b5b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -422,9 +422,17 @@ if (LBANN_WITH_CONDUIT) set_property(TARGET conduit::conduit PROPERTY INTERFACE_INCLUDE_DIRECTORIES - "${CONDUIT_INCLUDE_DIRS}" "${HDF5_INCLUDE_DIRS}") + "${HDF5_INCLUDE_DIRS}") endif () + get_filename_component(CONDUIT_INCLUDE_DIR_ + "${CONDUIT_INCLUDE_DIRS}" DIRECTORY) + + set_property(TARGET conduit::conduit + PROPERTY + INTERFACE_INCLUDE_DIRECTORIES + "${CONDUIT_INCLUDE_DIR_}") + set_target_properties(conduit::conduit PROPERTIES INTERFACE_LINK_LIBRARIES From 8c72baeb2fcc2b38bc480867baeb94828c6f572d Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 17 Jun 2019 09:49:23 -0700 Subject: [PATCH 085/634] prevent overwriting HDF5 include dirs --- CMakeLists.txt | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bd5e188b5b3..d863a9204bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -415,23 +415,21 @@ if (LBANN_WITH_CONDUIT) endif () endforeach () + get_filename_component(_conduit_include_dirs + "${CONDUIT_INCLUDE_DIRS}" DIRECTORY) + if (HDF5_FOUND_WITH_MODULE) list(APPEND _conduit_interface_link_libs ${HDF5_LIBRARIES}) - set_property(TARGET conduit::conduit - PROPERTY - INTERFACE_INCLUDE_DIRECTORIES + list(APPEND _conduit_include_dirs "${HDF5_INCLUDE_DIRS}") endif () - get_filename_component(CONDUIT_INCLUDE_DIR_ - "${CONDUIT_INCLUDE_DIRS}" DIRECTORY) - set_property(TARGET conduit::conduit PROPERTY INTERFACE_INCLUDE_DIRECTORIES - "${CONDUIT_INCLUDE_DIR_}") + "${_conduit_include_dirs}") set_target_properties(conduit::conduit PROPERTIES From 4971c7f879912f93fbb490c3fad80803f4273abd Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Tue, 18 Jun 2019 10:03:48 +0200 Subject: [PATCH 086/634] Updated the spack environment to define the modules format (#1083) * Updated the spack environment to define the modules format for both TCL and LMOD. Simplified the instructions since the new modules don't require you to unset the LIBRARY_PATH. Removed the old spack recipes build shell script, since it is deprecated and replaced by the environments. * Updated the documentation to indicate that it may be necessary to reinitialize spack if there were not existing module files. Changed the standard version and blacklisted versions of the core compiler. *Note that it will be necessary to refresh the spack module files, using the commend: spack module tcl refresh --delete-tree --- docs/building_lbann.rst | 2 +- scripts/spack_recipes/build_lbann.sh | 274 ------------------ .../std_versions_and_variants_llnl_lc_cz.yaml | 50 ++++ 3 files changed, 51 insertions(+), 275 deletions(-) delete mode 100755 scripts/spack_recipes/build_lbann.sh diff --git a/docs/building_lbann.rst b/docs/building_lbann.rst index 277f6d781b6..9866d31b94e 100644 --- a/docs/building_lbann.rst +++ b/docs/building_lbann.rst @@ -150,8 +150,8 @@ Hydrogen, and LBANN separately, by whatever means they choose. cp ${LBANN_HOME}/spack_environments/externals__llnl_lc_cz.yaml . 
# where = x86_64 | ppc64le spack install spack env loads # Spack creates a file named loads that has all of the correct modules + source ${SPACK_ROOT}/share/spack/setup-env.sh # Rerun setup since spack doesn't modify MODULEPATH unless there are module files defined source loads - unset LIBRARY_PATH + Note that the environments provided here have a set of external diff --git a/scripts/spack_recipes/build_lbann.sh b/scripts/spack_recipes/build_lbann.sh deleted file mode 100755 index 7254e9e4139..00000000000 --- a/scripts/spack_recipes/build_lbann.sh +++ /dev/null @@ -1,274 +0,0 @@ -#!/bin/bash - -set -e - -if [ ! -z "$bamboo_SPACK_ROOT" ]; then - . $bamboo_SPACK_ROOT/share/spack/setup-env.sh -fi - -CLUSTER=`hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g'` - -SPACK_RECIPES=`dirname ${0}` -#Set Script Name variable -SCRIPT=`basename ${0}` - -SCRIPTS_DIR=`dirname ${SPACK_RECIPES}` - -if [[ "$SCRIPTS_DIR" = /* ]]; then - ROOT_DIR=`dirname ${SCRIPTS_DIR}` -else - LVL_TO_ROOT_DIR=`dirname ${SCRIPTS_DIR}` -fi - -BLAS=openblas -BUILD_TYPE=Release -COMPILER=gcc@4.9.3 -DTYPE=float -EL_VER=develop -if [ "${CLUSTER}" == "ray" -o "${CLUSTER}" == "sierra" ]; then - MPI=spectrum-mpi -elif [ "${CLUSTER}" == "pascal" -o "${CLUSTER}" == "surface" ]; then - MPI='mvapich2 +cuda' -else - MPI=mvapich2 -fi -VARIANTS= -GPU=0 # usually ignored - -#Help function -function HELP { - echo -e \\n"Help documentation for ${BOLD}${SCRIPT}.${NORM}"\\n - echo -e "${REV}Basic usage:${NORM} ${BOLD}$SCRIPT -c gcc@7.1.0${NORM}"\\n - echo "Command line switches are optional. The following switches are recognized." - echo "${REV}-b${NORM} --Select ${BOLD}BLAS library${NORM}. Default is ${BOLD}${BLAS}${NORM}." - echo "${REV}-c${NORM} --Select ${BOLD}compiler${NORM}. Default is ${BOLD}${COMPILER}${NORM}." - echo "${REV}-d${NORM} --Build with ${BOLD}Debug mode${NORM} enabled. Default is ${BOLD}${BUILD_TYPE}${NORM}." - echo "${REV}-e${NORM} --Select ${BOLD}Elemental version${NORM}. Default is ${BOLD}${EL_VER}${NORM}." - echo "${REV}-g${NORM} --Build with ${BOLD}GPU support${NORM} enabled." - echo "${REV}-m${NORM} --Select ${BOLD}MPI library${NORM}. Default is ${BOLD}${MPI}${NORM}." - echo "${REV}-s${NORM} --Build with ${BOLD}sequential initialization mode${NORM} enabled." - echo "${REV}-t${NORM} --Select ${BOLD}datatype${NORM}. Default is ${BOLD}${DTYPE}${NORM}." - echo -e "${REV}-h${NORM} --Displays this help message. No further functions are performed."\\n - exit 1 -} - -while getopts "b:c:de:ghm:st:z" opt; do - case $opt in - b) - BLAS=$OPTARG - ;; - c) - COMPILER=$OPTARG - ;; - d) - BUILD_TYPE=Debug - ;; - e) - EL_VER=$OPTARG - ;; - g) - GPU=1 - ;; - h) - HELP - exit 1 - ;; - m) - MPI=$OPTARG - ;; - s) - VARIANTS="${VARIANTS} +seq_init" - ;; - t) - DTYPE=$OPTARG - ;; - z) - SPACK_DIRTY=1 - ;; - \?) - echo "Invalid option: -$OPTARG" >&2 - exit 1 - ;; - :) - echo "Option -$OPTARG requires an argument." 
>&2 - exit 1 - ;; - esac -done - -shift $((OPTIND-1)) -# now do something with $@ - -# Figure out which cluster we are on -ARCH=`uname -m` - -PLATFORM= -FEATURE= -if [ "${GPU}" == "1" -o "${CLUSTER}" == "surface" -o "${CLUSTER}" == "ray" -o "${CLUSTER}" == "sierra" -o "${CLUSTER}" == "pascal" ]; then - if [ "${CLUSTER}" == "flash" ]; then - PLATFORM="+gpu ^cuda@7.5 ^cudnn@5.1" - FEATURE="_gpu_cuda-7.5_cudnn-5.1" - elif [ "${CLUSTER}" == "sierra" -o "${CLUSTER}" == "ray" ]; then - PLATFORM="+gpu ^cuda@9.2.64 ^cudnn@7.0" - FEATURE="_gpu_cuda-9.2.64_cudnn-7.0" - elif [ "${CLUSTER}" == "pascal" ]; then - PLATFORM="+gpu ^cuda@9.1.85 ^cudnn@7.1" - FEATURE="_gpu_cuda-9.1.85_cudnn-7.1" - else - PLATFORM="+gpu" - FEATURE="_gpu" - fi - EL_VER="${EL_VER}+cuda" - MPI="${MPI}+cuda" -else - PLATFORM="~gpu" -fi - -C_FLAGS= -CXX_FLAGS= -Fortran_FLAGS= - -DIST= -case ${BUILD_TYPE} in - Release) - DIST=rel - # Don't use the march=native flag for gcc and intel compilers since that - # wouldn't allow spack to differentiate between optimization sets - # C_FLAGS="${C_FLAGS} -march=native" - # CXX_FLAGS="${CXX_FLAGS} -march=native" - # Fortran_FLAGS="${Fortran_FLAGS} -march=native" - if [[ (${COMPILER} == gcc@*) ]]; then - if [ "${CLUSTER}" == "catalyst" ]; then - ARCH_FLAGS="-march=ivybridge -mtune=ivybridge" - elif [ "${CLUSTER}" == "quartz" -o "${CLUSTER}" == "pascal" ]; then - ARCH_FLAGS="-march=broadwell -mtune=broadwell" - elif [ "${CLUSTER}" == "surface" ]; then - ARCH_FLAGS="-march=sandybridge -mtune=sandybridge" - elif [ "${CLUSTER}" == "flash" ]; then - ARCH_FLAGS="-march=haswell -mtune=haswell" - fi - elif [[ (${COMPILER} == intel@*) ]]; then - if [ "${CLUSTER}" == "catalyst" ]; then - ARCH_FLAGS="-march=corei7-avx -mtune=ivybridge" - elif [ "${CLUSTER}" == "quartz" -o "${CLUSTER}" == "pascal" ]; then - ARCH_FLAGS="-march=core-avx2 -mtune=broadwell" - elif [ "${CLUSTER}" == "surface" ]; then - ARCH_FLAGS="-march=corei7-avx -mtune=sandybridge" - elif [ "${CLUSTER}" == "flash" ]; then - ARCH_FLAGS="-march=core-avx2 -mtune=haswell" - fi - elif [[ ${COMPILER} == clang@* ]]; then - if [ "${CLUSTER}" == "catalyst" -o "${CLUSTER}" == "surface" ]; then - ARCH_FLAGS="-mavx -march=native" - elif [ "${CLUSTER}" == "quartz" -o "${CLUSTER}" == "flash" -o "${CLUSTER}" == "pascal" ]; then - ARCH_FLAGS="-mavx2 -march=native" - fi - fi - C_FLAGS="-O3 -g ${ARCH_FLAGS}" - CXX_FLAGS="-O3 -g ${ARCH_FLAGS}" - Fortran_FLAGS="-O3 -g ${ARCH_FLAGS}" - ;; - Debug) - DIST=debug - C_FLAGS="-g" - CXX_FLAGS="-g" - Fortran_FLAGS="-g" - ;; - :) - DIST=unkwn - ;; -esac - -SPACK_CFLAGS= -if [ ! -z "${C_FLAGS}" ]; then - SPACK_CFLAGS="cflags=\"${C_FLAGS}\"" -fi -SPACK_CXXFLAGS= -if [ ! -z "${CXX_FLAGS}" ]; then - SPACK_CXXFLAGS="cxxflags=\"${CXX_FLAGS}\"" -fi -SPACK_FFLAGS= -if [ ! 
-z "${Fortran_FLAGS}" ]; then - SPACK_FFLAGS="fflags=\"${Fortran_FLAGS}\"" -fi - -SPACK_SETUP_FLAGS= -if [ "${SPACK_DIRTY}" == "1" ]; then - SPACK_SETUP_FLAGS="--dirty" -fi - -if [ "${CLUSTER}" == "ray" ]; then - MPI="spectrum-mpi@2018.04.27" -fi - -SPACK_OPTIONS="lbann@local %${COMPILER} build_type=${BUILD_TYPE} dtype=${DTYPE} ${PLATFORM} ${VARIANTS} ^hydrogen@${EL_VER} build_type=${BUILD_TYPE} blas=${BLAS} ^${MPI}" -# Disable the extra compiler flags until spack supports propagating flags properly -#SPACK_OPTIONS="lbann@local build_type=${BUILD_TYPE} dtype=${DTYPE} ${PLATFORM} ${VARIANTS} %${COMPILER} ${SPACK_CFLAGS} ${SPACK_CXXFLAGS} ${SPACK_FFLAGS} ^elemental@${EL_VER} blas=${BLAS} ^${MPI}" - -# Use older cmake to avoid passing -pthread to nvcc -if [ "${CLUSTER}" == "ray" -o "${CLUSTER}" == "sierra" ]; then - SPACK_OPTIONS="$SPACK_OPTIONS ^cmake@3.9.0" -fi - -SPEC="spack spec ${SPACK_OPTIONS}" -CMD="spack setup ${SPACK_SETUP_FLAGS} ${SPACK_OPTIONS}" - -# Create a directory for the build -if [ ! -z "$bamboo_SPACK_ROOT" ]; then - DIR="${CLUSTER}_${COMPILER}_${DIST}" - DIR=${DIR//@/-} - DIR=${DIR// /-} - DIR=${DIR//+/-} -else - DIR="${CLUSTER}_${COMPILER}_${ARCH}${FEATURE}_${MPI}_${BLAS}_${DIST}" - DIR=${DIR//@/-} - DIR=${DIR// /-} - DIR=${DIR//+/-} -fi -echo "Creating directory ${DIR}" -mkdir -p ${DIR}/build -cd ${DIR} - -echo $SPEC -echo $SPEC > spack_build_lbann.sh -eval $SPEC -err=$? -if [ $err -eq 1 ]; then - echo "Spack spec command returned error: $err" - exit -1 -fi - -echo $CMD -echo $CMD >> spack_build_lbann.sh -chmod +x spack_build_lbann.sh -eval $CMD -err=$? -if [ $err -eq 1 ]; then - echo "Spack setup command returned error: $err" - exit -1 -fi - -# Find the root of the git repo -cd build -PATH_TO_SRC= -if [ ! -z ${LVL_TO_ROOT_DIR} ]; then - PATH_TO_SRC="${LVL_TO_ROOT_DIR}/../.." -elif [ ! -z ${ROOT_DIR} ]; then - PATH_TO_SRC="${ROOT_DIR}" -fi - -if [ ! -z ${PATH_TO_SRC} -a -d ${PATH_TO_SRC}/src ]; then - CMD="../spconfig.py ${PATH_TO_SRC}" - echo $CMD - eval $CMD -fi - -# Deal with the fact that spack should not install a package when doing setup" -FIX="spack uninstall --all -y lbann %${COMPILER} build_type=${BUILD_TYPE}" -echo $FIX -if [ ! 
-z "$bamboo_SPACK_ROOT" ]; then - eval $FIX &> /dev/null - exit 0 -else - eval $FIX -fi diff --git a/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml b/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml index 64604c430d1..362c5d5927f 100644 --- a/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml +++ b/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml @@ -43,3 +43,53 @@ zlib:: buildable: True version: [1.2.11] + + modules:: + enable:: + - tcl + - lmod + lmod:: + hash_length: 3 + core_compilers: + - 'gcc@7.3.0' + - 'gcc@7.3.1' + naming_scheme: '${PACKAGE}/${VERSION}-${COMPILERNAME}-${COMPILERVER}' + blacklist: + - '%gcc@4.8' + - '%gcc@4.9.3' + hierarchy: + - 'mpi' + - 'lapack' + all: + autoload: 'direct' + suffixes: + '^openblas': openblas + '^netlib-lapack': netlib + '^python@3.7.2': python-3.7.2 + filter: + # Exclude changes to any of these variables + environment_blacklist: ['CPATH', 'LIBRARY_PATH'] + ^python: + autoload: 'direct' + tcl: + hash_length: 3 + core_compilers: + - 'gcc@7.3.0' + - 'gcc@7.3.1' + naming_scheme: '${PACKAGE}/${VERSION}-${COMPILERNAME}-${COMPILERVER}' + whitelist: + - gcc + blacklist: + - '%gcc@4.8' + - '%gcc@4.9.3' + all: + autoload: 'direct' + suffixes: + '^openblas': openblas + '^netlib-lapack': netlib + '^python@3.7.2': python-3.7.2 + filter: + # Exclude changes to any of these variables + environment_blacklist: ['CPATH', 'LIBRARY_PATH'] + ^python: + autoload: 'direct' From 1eeca09ab4025feedc148523496504b6b07798c8 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Wed, 19 Jun 2019 13:42:58 -0700 Subject: [PATCH 087/634] Address review comments. --- .../transforms/vision/adjust_contrast.hpp | 1 + .../transforms/vision/adjust_saturation.hpp | 4 ++- .../lbann/transforms/vision/color_jitter.hpp | 28 +---------------- src/transforms/vision/color_jitter.cpp | 30 +++++++++++++++++++ 4 files changed, 35 insertions(+), 28 deletions(-) diff --git a/include/lbann/transforms/vision/adjust_contrast.hpp b/include/lbann/transforms/vision/adjust_contrast.hpp index 2e6e32f12c6..6a6a7528dc4 100644 --- a/include/lbann/transforms/vision/adjust_contrast.hpp +++ b/include/lbann/transforms/vision/adjust_contrast.hpp @@ -34,6 +34,7 @@ namespace transform { /** * Adjust the contrast of an image. + * * This operates similarly to the contrast control on a television. */ class adjust_contrast : public transform { diff --git a/include/lbann/transforms/vision/adjust_saturation.hpp b/include/lbann/transforms/vision/adjust_saturation.hpp index d1d6aff7692..605043b9b4b 100644 --- a/include/lbann/transforms/vision/adjust_saturation.hpp +++ b/include/lbann/transforms/vision/adjust_saturation.hpp @@ -34,8 +34,10 @@ namespace transform { /** * Adjust the saturation of an image. + * * This operates similarly to the controls on a color television - * (as opposed to a direct adjustment of saturation). + * (as opposed to a direct adjustment of saturation) by interpolating + * between the original value and its grayscale value. 
*/ class adjust_saturation : public transform { public: diff --git a/include/lbann/transforms/vision/color_jitter.hpp b/include/lbann/transforms/vision/color_jitter.hpp index c16d2eaba98..7d952e35cf3 100644 --- a/include/lbann/transforms/vision/color_jitter.hpp +++ b/include/lbann/transforms/vision/color_jitter.hpp @@ -51,33 +51,7 @@ class color_jitter : public transform { */ color_jitter(float min_brightness_factor, float max_brightness_factor, float min_contrast_factor, float max_contrast_factor, - float min_saturation_factor, float max_saturation_factor) : - transform(), - m_min_brightness_factor(min_brightness_factor), - m_max_brightness_factor(max_brightness_factor), - m_min_contrast_factor(min_contrast_factor), - m_max_contrast_factor(max_contrast_factor), - m_min_saturation_factor(min_saturation_factor), - m_max_saturation_factor(max_saturation_factor) { - if (min_brightness_factor < 0.0f || - max_brightness_factor < min_brightness_factor) { - LBANN_ERROR("Min/max brightness factors out of range: " - + std::to_string(min_brightness_factor) + " " - + std::to_string(max_brightness_factor)); - } - if (min_contrast_factor < 0.0f || - max_contrast_factor < min_contrast_factor) { - LBANN_ERROR("Min/max contrast factors out of range: " - + std::to_string(min_contrast_factor) + " " - + std::to_string(max_contrast_factor)); - } - if (min_saturation_factor < 0.0f || - max_saturation_factor < min_saturation_factor) { - LBANN_ERROR("Min/max saturation factors out of range: " - + std::to_string(min_saturation_factor) + " " - + std::to_string(max_saturation_factor)); - } - } + float min_saturation_factor, float max_saturation_factor); transform* copy() const override { return new color_jitter(*this); } diff --git a/src/transforms/vision/color_jitter.cpp b/src/transforms/vision/color_jitter.cpp index eff25df6f4b..e5d3b2c5294 100644 --- a/src/transforms/vision/color_jitter.cpp +++ b/src/transforms/vision/color_jitter.cpp @@ -35,6 +35,36 @@ namespace lbann { namespace transform { +color_jitter::color_jitter(float min_brightness_factor, float max_brightness_factor, + float min_contrast_factor, float max_contrast_factor, + float min_saturation_factor, float max_saturation_factor) : + transform(), + m_min_brightness_factor(min_brightness_factor), + m_max_brightness_factor(max_brightness_factor), + m_min_contrast_factor(min_contrast_factor), + m_max_contrast_factor(max_contrast_factor), + m_min_saturation_factor(min_saturation_factor), + m_max_saturation_factor(max_saturation_factor) { + if (min_brightness_factor < 0.0f || + max_brightness_factor < min_brightness_factor) { + LBANN_ERROR("Min/max brightness factors out of range: " + + std::to_string(min_brightness_factor) + " " + + std::to_string(max_brightness_factor)); + } + if (min_contrast_factor < 0.0f || + max_contrast_factor < min_contrast_factor) { + LBANN_ERROR("Min/max contrast factors out of range: " + + std::to_string(min_contrast_factor) + " " + + std::to_string(max_contrast_factor)); + } + if (min_saturation_factor < 0.0f || + max_saturation_factor < min_saturation_factor) { + LBANN_ERROR("Min/max saturation factors out of range: " + + std::to_string(min_saturation_factor) + " " + + std::to_string(max_saturation_factor)); + } +} + void color_jitter::apply(utils::type_erased_matrix& data, std::vector& dims) { fast_rng_gen& gen = get_fast_generator(); // Determine the order to apply transforms. 
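[Editor's note: the sketch below is an editorial addition, not part of the patch series. It illustrates the constructor contract that the "Address review comments" patch above moves out-of-line: each min/max factor pair must be non-negative and ordered, or the constructor raises LBANN_ERROR. The function name and the factor values are hypothetical examples.]

    #include "lbann/transforms/vision/color_jitter.hpp"

    // Hypothetical usage sketch; assumes an LBANN build so the header resolves.
    void make_jitter_example() {
      // Jitter brightness in [0.8, 1.2], contrast in [0.9, 1.1], and
      // saturation in [0.7, 1.3]; every pair satisfies 0 <= min <= max.
      lbann::transform::color_jitter jitter(0.8f, 1.2f,
                                            0.9f, 1.1f,
                                            0.7f, 1.3f);
      // Reversing a pair, e.g. color_jitter(1.2f, 0.8f, 0.9f, 1.1f, 0.7f, 1.3f),
      // would throw via LBANN_ERROR in the constructor added above.
      (void) jitter; // silence unused-variable warnings
    }
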
From 1dafff28be4b0eb619ee0a92c174083e654a693c Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Tue, 4 Jun 2019 17:57:31 -0700 Subject: [PATCH 088/634] Add utilities for fast generation of uniform random numbers in [0, 1). --- include/lbann/utils/random.hpp | 80 +++++++++++++++++++++++++++++ src/utils/unit_test/CMakeLists.txt | 1 + src/utils/unit_test/random_test.cpp | 46 +++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 src/utils/unit_test/random_test.cpp diff --git a/include/lbann/utils/random.hpp b/include/lbann/utils/random.hpp index 640fc40ae22..9ae890d7681 100644 --- a/include/lbann/utils/random.hpp +++ b/include/lbann/utils/random.hpp @@ -113,6 +113,86 @@ inline T fast_rand_int_pow2(Generator& g, T max) { return x & ((typename Generator::result_type) max); } +// Methods for quickly generating uniformly random values in [0, 1). + +namespace details { + +// See section on converting uint64_ts to doubles in: +// http://xoshiro.di.unimi.it/ + +template +inline float random_float_32(Generator& g) { + const uint32_t r = g() >> 9; + return r * (1.0f / 8388608.0f); } + +template +inline float random_float_64(Generator& g) { + const uint32_t r = uint32_t(g()) >> 9; // Truncate. + return r * (1.0f / 8388608.0f); } + +template +inline float random_float(Generator& g) { + // TODO: Replace with if constexpr when possible. + if (sizeof(typename Generator::result_type) == 4) { + return random_float_32(g); + } else if (sizeof(typename Generator::result_type) == 8) { + return random_float_64(g); + } else { + LBANN_ERROR("Unsupported generator type"); + } +} + +template +inline double random_double_32(Generator& g) { + const uint32_t r1 = g() >> 5; + const uint32_t r2 = g() >> 6; + return (r1 * 67108864.0 + r2) * (1.0 / 9007199254740992.0); +} + +template +inline double random_double_64(Generator& g) { + const uint64_t r = g() >> 11; + return r * (1.0 / 9007199254740992.0); +} + +template +inline double random_double(Generator& g) { + // TODO: Replace with if constexpr when possible. + if (sizeof(typename Generator::result_type) == 4) { + return random_double_32(g); + } else if (sizeof(typename Generator::result_type) == 8) { + return random_double_64(g); + } else { + LBANN_ERROR("Unsupported generator type"); + } +} + +template +struct random_uniform_impl { + static T generate(Generator&); +}; +template +struct random_uniform_impl { + static float generate(Generator& g) { return random_float(g); } +}; +template +struct random_uniform_impl { + static double generate(Generator& g) { return random_double(g); } +}; + +} // namespace details + +/** Generate uniformly random values in the range (0, 1]. */ +template +inline T fast_random_uniform(Generator& g) { + static_assert(sizeof(typename Generator::result_type) == 4 || + sizeof(typename Generator::result_type) == 8, + "Invalid generator result_type."); + return details::random_uniform_impl::generate(g); +} + /** @brief Initialize the random number generator (with optional seed). 
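 * (Editor's aside on the details:: helpers just above, not part of the
 * original patch: they produce uniforms in [0, 1) by keeping only the top
 * random bits and scaling, e.g. for a 32-bit generator,
 *   float f = (g() >> 9) * (1.0f / 8388608.0f);
 * keeps 23 bits and divides by 2^23, while the double paths keep 53 bits and
 * divide by 2^53 = 9007199254740992, with 67108864 = 2^26 used to splice two
 * 32-bit draws together.)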
* * @param seed Seed value for the random number generator diff --git a/src/utils/unit_test/CMakeLists.txt b/src/utils/unit_test/CMakeLists.txt index 69c29e8632e..0e7558e7d8e 100644 --- a/src/utils/unit_test/CMakeLists.txt +++ b/src/utils/unit_test/CMakeLists.txt @@ -2,6 +2,7 @@ set_full_path(_DIR_LBANN_CATCH2_TEST_FILES any_test.cpp factory_test.cpp image_test.cpp + random_test.cpp type_erased_matrix_test.cpp ) diff --git a/src/utils/unit_test/random_test.cpp b/src/utils/unit_test/random_test.cpp new file mode 100644 index 00000000000..2313a79ec14 --- /dev/null +++ b/src/utils/unit_test/random_test.cpp @@ -0,0 +1,46 @@ +// MUST include this +#include + +// File being tested +#include + +constexpr size_t num_tests = 1000; + +TEST_CASE("Testing fast_random_uniform", "[random][utilities]") { + SECTION("32-bit generator") { + std::mt19937 gen; + SECTION("floats") { + for (size_t i = 0; i < num_tests; ++i) { + float val = lbann::fast_random_uniform(gen); + REQUIRE(val >= 0.0f); + REQUIRE(val <= 1.0f); + } + } + + SECTION("doubles") { + for (size_t i = 0; i < num_tests; ++i) { + double val = lbann::fast_random_uniform(gen); + REQUIRE(val >= 0.0); + REQUIRE(val <= 1.0); + } + } + } + SECTION("64-bit generator") { + std::mt19937_64 gen; + SECTION("floats") { + for (size_t i = 0; i < num_tests; ++i) { + float val = lbann::fast_random_uniform(gen); + REQUIRE(val >= 0.0f); + REQUIRE(val <= 1.0f); + } + } + + SECTION("doubles") { + for (size_t i = 0; i < num_tests; ++i) { + double val = lbann::fast_random_uniform(gen); + REQUIRE(val >= 0.0); + REQUIRE(val <= 1.0); + } + } + } +} From a16e9155bbc00fb7a7e4e0fed7a2a1a5ddbe70c7 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Tue, 4 Jun 2019 18:28:27 -0700 Subject: [PATCH 089/634] Add beta distribution. --- include/lbann/utils/beta.hpp | 220 ++++++++++++++++++ src/utils/unit_test/CMakeLists.txt | 1 + .../unit_test/beta_distribution_test.cpp | 46 ++++ 3 files changed, 267 insertions(+) create mode 100644 include/lbann/utils/beta.hpp create mode 100644 src/utils/unit_test/beta_distribution_test.cpp diff --git a/include/lbann/utils/beta.hpp b/include/lbann/utils/beta.hpp new file mode 100644 index 00000000000..afddbce74ec --- /dev/null +++ b/include/lbann/utils/beta.hpp @@ -0,0 +1,220 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_UTILS_BETA_HPP +#define LBANN_UTILS_BETA_HPP + +#include +#include +#include +#include + +#include "lbann/utils/random.hpp" + +namespace lbann { + +/** + * Produces random floating point values drawn from a Beta distribution with + * parameters a > 0 and b > 0. + * See: + * https://en.wikipedia.org/wiki/Beta_distribution + * for more details. + */ +template +class beta_distribution { +public: + using result_type = RealType; + + class param_type { + public: + using distribution_type = beta_distribution; + + explicit param_type(RealType a, RealType b) : + m_a(a), m_b(b) {} + + RealType a() const { return m_a; } + RealType b() const { return m_b; } + + bool operator==(const param_type& other) const { + return m_a == other.m_a && m_b == other.m_b; + } + bool operator!=(const param_type& other) const { + return m_a != other.m_a || m_b != other.m_b; + } + private: + RealType m_a, m_b; + }; + + explicit beta_distribution(RealType a, RealType b) : + m_params(a, b), m_gamma_a(a), m_gamma_b(b) {} + explicit beta_distribution(const param_type& p) : + m_params(p), m_gamma_a(p.a()), m_gamma_b(p.b()) {} + + result_type a() const { return m_params.a(); } + result_type b() const { return m_params.b(); } + + void reset() {} + + param_type param() const { return m_params; } + void param(const param_type& p) { + m_params = p; + m_gamma_a = gamma_dist(p.a()); + m_gamma_b = gamma_dist(p.b()); + } + + template + result_type operator()(Generator& g) { + return generate(g); + } + template + result_type operator()(Generator& g, const param_type& p) { + return generate(g, p); + } + + result_type min() const { return result_type(0); } + result_type max() const { return result_type(1); } + + bool operator==(const beta_distribution& other) const { + return param() == other.param(); + } + bool operator!=(const beta_distribution& other) const { + return param() != other.param(); + } + +private: + param_type m_params; + + using gamma_dist = std::gamma_distribution; + gamma_dist m_gamma_a, m_gamma_b; + + // Generator for when we use the distribution's parameters. + template + result_type generate(Generator& g) { + if (a() <= result_type(1) && b() <= result_type(1)) { + return generate_johnk(g, m_params.a(), m_params.b()); + } else { + return generate_gamma(g, m_gamma_a, m_gamma_b); + } + } + // Generator for when we use specified parameters. + template + result_type generate(Generator& g, const param_type& p) { + if (p.a() <= result_type(1) && p.b() <= result_type(1)) { + return generate_johnk(p.a(), p.b()); + } else { + gamma_dist gamma_a(p.a()), gamma_b(p.b()); + return generate_gamma(g, gamma_a, gamma_b); + } + } + + /** + * Generate Beta-distributed values using Johnk's algorithm. + * This is a rejection-sampling algorithm that only needs a few + * uniformly random values. + * + * See: + * Johnk, H. D. "Erzeugung von betaverteilten und gammaverteilten + * Zufallszahlen." Metrika 8, no. 1 (1964). + * For an English-language presentation, see: + * Atkinson, A. C. and M. C. Pearce. "The computer generation of beta, + * gamma and normal random variables." Journal of the Royal Statistical + * Society: Series A (General) 139, no. 4 (1976). + * + * This includes fixes for numerical stability when the parameters are small, + * see: + * https://github.com/numpy/numpy/issues/5851 + * for discussion there; and a catch for the (extremely rare) case of the RNG + * giving us U and V both exactly 0. 
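 * (Editor's aside, not part of the original diff: one iteration of the loop
 * below draws U, V ~ Uniform[0, 1), sets X = pow(U, 1/a) and Y = pow(V, 1/b),
 * and accepts when X + Y <= 1, returning X / (X + Y); otherwise it redraws.)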
+ * + * Note: There should be an umlaut on the "o" in "Johnk", but blame poor + * unicode support. + */ + template + result_type generate_johnk(Generator& g, result_type a, result_type b) { + while (true) { + const result_type U = fast_random_uniform(g); + const result_type V = fast_random_uniform(g); + const result_type X = std::pow(U, result_type(1) / a); + const result_type Y = std::pow(V, result_type(1) / b); + const result_type XplusY = X + Y; + if (XplusY <= result_type(1.0)) { + if (XplusY > result_type(0)) { + return X / XplusY; + } else if (U != result_type(0) && V != result_type(0)) { + // Work with logs instead if a/b is too small. + result_type logX = std::log(U) / a; + result_type logY = std::log(V) / b; + const result_type log_max = std::max(logX, logY); + logX -= log_max; + logY -= log_max; + return std::exp(logX - std::log(std::exp(logX) + std::exp(logY))); + } + } + } + } + + /** + * Generate Beta-distributed values based on Gamma distributions. + * See: + * https://en.wikipedia.org/wiki/Beta_distribution#Generating_beta-distributed_random_variates + * for details. + */ + template + result_type generate_gamma(Generator& g, gamma_dist& gamma_a, + gamma_dist& gamma_b) { + const result_type Ga = gamma_a(g); + const result_type Gb = gamma_b(g); + return Ga / (Ga + Gb); + } +}; + +template +std::basic_ostream& operator<<(std::basic_ostream& os, + const beta_distribution& d) { + os << "~Beta(" << d.a() << "," << d.b() << ")"; + return os; +} + +template +std::basic_istream& operator<<(std::basic_istream& is, + beta_distribution& d) { + std::string s; + RealType a, b; + if (std::getline(is, s, '(') && s == "~Beta" + && is >> a + && is.get() == ',' + && is >> b + && is.get() == ')') { + d = beta_distribution(a, b); + } else { + is.setstate(std::ios::failbit); + } + return is; +} + +} // namespace lbann + +#endif // LBANN_UTILS_BETA_HPP diff --git a/src/utils/unit_test/CMakeLists.txt b/src/utils/unit_test/CMakeLists.txt index 0e7558e7d8e..35aba73754b 100644 --- a/src/utils/unit_test/CMakeLists.txt +++ b/src/utils/unit_test/CMakeLists.txt @@ -1,5 +1,6 @@ set_full_path(_DIR_LBANN_CATCH2_TEST_FILES any_test.cpp + beta_distribution_test.cpp factory_test.cpp image_test.cpp random_test.cpp diff --git a/src/utils/unit_test/beta_distribution_test.cpp b/src/utils/unit_test/beta_distribution_test.cpp new file mode 100644 index 00000000000..103925dc684 --- /dev/null +++ b/src/utils/unit_test/beta_distribution_test.cpp @@ -0,0 +1,46 @@ +// MUST include this +#include + +// File being tested +#include + +#include + +constexpr size_t num_tests = 1000; + +template +void test_dist(Generator& g, RealType a, RealType b) { + lbann::beta_distribution dist(a, b); + for (size_t i = 0; i < num_tests; ++i) { + RealType val = dist(g); + REQUIRE(std::isfinite(val)); + REQUIRE(val >= RealType(0)); + REQUIRE(val <= RealType(1)); + } +} + +TEST_CASE("Testing beta_distribution", "[random][utilities]") { + std::mt19937 gen; + SECTION("float") { + SECTION("a=0.5 b=0.5") { + test_dist(gen, 0.5f, 0.5f); + } + SECTION("a=0.001 b=0.001") { + test_dist(gen, 0.001f, 0.001f); + } + SECTION("a=1.5 b=1.5") { + test_dist(gen, 1.5f, 1.5f); + } + } + SECTION("double") { + SECTION("a=0.5 b=0.5") { + test_dist(gen, 0.5, 0.5); + } + SECTION("a=0.001 b=0.001") { + test_dist(gen, 0.001, 0.001); + } + SECTION("a=1.5 b=1.5") { + test_dist(gen, 1.5, 1.5); + } + } +} From 11473127e1be9a7832a65e774805890d6da16f82 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Thu, 6 Jun 2019 21:48:24 -0700 Subject: [PATCH 090/634] Address issues 
in @timmoon10's review. --- include/lbann/utils/beta.hpp | 23 ++++++++++++++++++----- include/lbann/utils/random.hpp | 24 +++--------------------- src/utils/unit_test/random_test.cpp | 8 ++++---- 3 files changed, 25 insertions(+), 30 deletions(-) diff --git a/include/lbann/utils/beta.hpp b/include/lbann/utils/beta.hpp index afddbce74ec..f8f73636ba7 100644 --- a/include/lbann/utils/beta.hpp +++ b/include/lbann/utils/beta.hpp @@ -33,14 +33,18 @@ #include #include "lbann/utils/random.hpp" +#include "lbann/utils/exception.hpp" namespace lbann { /** * Produces random floating point values drawn from a Beta distribution with * parameters a > 0 and b > 0. + * * See: + * * https://en.wikipedia.org/wiki/Beta_distribution + * * for more details. */ template @@ -53,10 +57,14 @@ class beta_distribution { using distribution_type = beta_distribution; explicit param_type(RealType a, RealType b) : - m_a(a), m_b(b) {} + m_a(a), m_b(b) { + if (a <= RealType(0) || b <= RealType(0)) { + LBANN_ERROR("Beta distribution parameters must be positive"); + } + } - RealType a() const { return m_a; } - RealType b() const { return m_b; } + constexpr RealType a() const { return m_a; } + constexpr RealType b() const { return m_b; } bool operator==(const param_type& other) const { return m_a == other.m_a && m_b == other.m_b; @@ -123,7 +131,7 @@ class beta_distribution { template result_type generate(Generator& g, const param_type& p) { if (p.a() <= result_type(1) && p.b() <= result_type(1)) { - return generate_johnk(p.a(), p.b()); + return generate_johnk(g, p.a(), p.b()); } else { gamma_dist gamma_a(p.a()), gamma_b(p.b()); return generate_gamma(g, gamma_a, gamma_b); @@ -136,16 +144,21 @@ class beta_distribution { * uniformly random values. * * See: + * * Johnk, H. D. "Erzeugung von betaverteilten und gammaverteilten * Zufallszahlen." Metrika 8, no. 1 (1964). + * * For an English-language presentation, see: + * * Atkinson, A. C. and M. C. Pearce. "The computer generation of beta, * gamma and normal random variables." Journal of the Royal Statistical * Society: Series A (General) 139, no. 4 (1976). * * This includes fixes for numerical stability when the parameters are small, * see: + * * https://github.com/numpy/numpy/issues/5851 + * * for discussion there; and a catch for the (extremely rare) case of the RNG * giving us U and V both exactly 0. * @@ -199,7 +212,7 @@ std::basic_ostream& operator<<(std::basic_ostream& os, } template -std::basic_istream& operator<<(std::basic_istream& is, +std::basic_istream& operator>>(std::basic_istream& is, beta_distribution& d) { std::string s; RealType a, b; diff --git a/include/lbann/utils/random.hpp b/include/lbann/utils/random.hpp index 9ae890d7681..25cd5c1a0aa 100644 --- a/include/lbann/utils/random.hpp +++ b/include/lbann/utils/random.hpp @@ -120,28 +120,10 @@ namespace details { // See section on converting uint64_ts to doubles in: // http://xoshiro.di.unimi.it/ -template -inline float random_float_32(Generator& g) { - const uint32_t r = g() >> 9; - return r * (1.0f / 8388608.0f); -} - -template -inline float random_float_64(Generator& g) { - const uint32_t r = uint32_t(g()) >> 9; // Truncate. - return r * (1.0f / 8388608.0f); -} - template inline float random_float(Generator& g) { - // TODO: Replace with if constexpr when possible. 
- if (sizeof(typename Generator::result_type) == 4) { - return random_float_32(g); - } else if (sizeof(typename Generator::result_type) == 8) { - return random_float_64(g); - } else { - LBANN_ERROR("Unsupported generator type"); - } + const uint32_t r = uint32_t(g()) >> 9; // Truncate if needed. + return r * (1.0f / 8388608.0f); } template @@ -184,7 +166,7 @@ struct random_uniform_impl { } // namespace details -/** Generate uniformly random values in the range (0, 1]. */ +/** Generate uniformly random values in the range [0, 1). */ template inline T fast_random_uniform(Generator& g) { static_assert(sizeof(typename Generator::result_type) == 4 || diff --git a/src/utils/unit_test/random_test.cpp b/src/utils/unit_test/random_test.cpp index 2313a79ec14..2979b7d0e3c 100644 --- a/src/utils/unit_test/random_test.cpp +++ b/src/utils/unit_test/random_test.cpp @@ -13,7 +13,7 @@ TEST_CASE("Testing fast_random_uniform", "[random][utilities]") { for (size_t i = 0; i < num_tests; ++i) { float val = lbann::fast_random_uniform(gen); REQUIRE(val >= 0.0f); - REQUIRE(val <= 1.0f); + REQUIRE(val < 1.0f); } } @@ -21,7 +21,7 @@ TEST_CASE("Testing fast_random_uniform", "[random][utilities]") { for (size_t i = 0; i < num_tests; ++i) { double val = lbann::fast_random_uniform(gen); REQUIRE(val >= 0.0); - REQUIRE(val <= 1.0); + REQUIRE(val < 1.0); } } } @@ -31,7 +31,7 @@ TEST_CASE("Testing fast_random_uniform", "[random][utilities]") { for (size_t i = 0; i < num_tests; ++i) { float val = lbann::fast_random_uniform(gen); REQUIRE(val >= 0.0f); - REQUIRE(val <= 1.0f); + REQUIRE(val < 1.0f); } } @@ -39,7 +39,7 @@ TEST_CASE("Testing fast_random_uniform", "[random][utilities]") { for (size_t i = 0; i < num_tests; ++i) { double val = lbann::fast_random_uniform(gen); REQUIRE(val >= 0.0); - REQUIRE(val <= 1.0); + REQUIRE(val < 1.0); } } } From 474ae5b5913a12230bc2af4aae2b8b14ffa75bc0 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Tue, 4 Jun 2019 21:55:53 -0700 Subject: [PATCH 091/634] Initial implementation of cutout. --- .../lbann/transforms/vision/CMakeLists.txt | 1 + include/lbann/transforms/vision/cutout.hpp | 80 +++++++++++++++++++ src/proto/factories/transform_factory.cpp | 5 ++ src/proto/lbann.proto | 6 ++ src/transforms/vision/CMakeLists.txt | 1 + src/transforms/vision/cutout.cpp | 68 ++++++++++++++++ 6 files changed, 161 insertions(+) create mode 100644 include/lbann/transforms/vision/cutout.hpp create mode 100644 src/transforms/vision/cutout.cpp diff --git a/include/lbann/transforms/vision/CMakeLists.txt b/include/lbann/transforms/vision/CMakeLists.txt index 4a22f176f0b..2bd30f178c3 100644 --- a/include/lbann/transforms/vision/CMakeLists.txt +++ b/include/lbann/transforms/vision/CMakeLists.txt @@ -6,6 +6,7 @@ set_full_path(THIS_DIR_HEADERS center_crop.hpp colorize.hpp color_jitter.hpp + cutout.hpp grayscale.hpp horizontal_flip.hpp normalize_to_lbann_layout.hpp diff --git a/include/lbann/transforms/vision/cutout.hpp b/include/lbann/transforms/vision/cutout.hpp new file mode 100644 index 00000000000..1e89a85a0da --- /dev/null +++ b/include/lbann/transforms/vision/cutout.hpp @@ -0,0 +1,80 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRANSFORMS_CUTOUT_HPP_INCLUDED +#define LBANN_TRANSFORMS_CUTOUT_HPP_INCLUDED + +#include "lbann/transforms/transform.hpp" + +namespace lbann { +namespace transform { + +/** + * Cutout data augmentation which randomly masks out square regions of input. + * See: + * DeVries and Taylor. "Improved Regularization of Convolutional Neural + * Networks with Cutout". arXiv preprint arXiv:1708.04552 (2017). + * + * This will randomly select a center pixel for each square and set all pixels + * within that square to 0. It is permissible for portions of the masks to lie + * outside of the image. + * + * Normalization about 0 should be applied after applying cutout. + */ +class cutout : public transform { +public: + /** + * Cutout with a given number of squares of a given size. + * @param num_holes Number of squares to mask out (must be positive). + * @param length Length of a side of the square (must be positive). + */ + cutout(size_t num_holes, size_t length) : + transform(), m_num_holes(num_holes), m_length(length) { + if (num_holes == 0) { + LBANN_ERROR("num_holes must be positive, got 0"); + } + if (length == 0) { + LBANN_ERROR("length must be positive, got 0"); + } + } + + transform* copy() const override { return new cutout(*this); } + + std::string get_type() const override { return "cutout"; } + + void apply(utils::type_erased_matrix& data, std::vector& dims) override; + +private: + /** Number of squares that will be masked out. */ + size_t m_num_holes; + /** Length of a side of each square that will be masked out. 
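 * (Editor's aside, not part of the original diff: a hole centered at (cx, cy)
 * spans roughly cx +/- m_length/2 by cy +/- m_length/2; apply() in cutout.cpp
 * below clips this square to the image borders, so holes near an edge are
 * truncated.)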
*/ + size_t m_length; +}; + +} // namespace transform +} // namespace lbann + +#endif // LBANN_TRANSFORMS_CUTOUT_HPP_INCLUDED diff --git a/src/proto/factories/transform_factory.cpp b/src/proto/factories/transform_factory.cpp index 1794abae8c8..47a721eb9aa 100644 --- a/src/proto/factories/transform_factory.cpp +++ b/src/proto/factories/transform_factory.cpp @@ -34,6 +34,7 @@ #include "lbann/transforms/vision/center_crop.hpp" #include "lbann/transforms/vision/colorize.hpp" #include "lbann/transforms/vision/color_jitter.hpp" +#include "lbann/transforms/vision/cutout.hpp" #include "lbann/transforms/vision/grayscale.hpp" #include "lbann/transforms/vision/horizontal_flip.hpp" #include "lbann/transforms/vision/normalize_to_lbann_layout.hpp" @@ -133,6 +134,10 @@ std::unique_ptr construct_transform( pb_trans.min_brightness_factor(), pb_trans.max_brightness_factor(), pb_trans.min_contrast_factor(), pb_trans.max_contrast_factor(), pb_trans.min_saturation_factor(), pb_trans.max_saturation_factor()); + } else if (trans.has_cutout()) { + auto& pb_trans = trans.cutout(); + return make_unique( + pb_trans.num_holes(), pb_trans.length()); } LBANN_ERROR("Unknown transform"); diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 4f4672e3727..dc7649e91a6 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -129,6 +129,11 @@ message Transform { float min_saturation_factor = 5; float max_saturation_factor = 6; } + // Apply cutout augmentation. + message Cutout { + uint64 num_holes = 1; + uint64 length = 2; + } // Convert to grayscale. message Grayscale {} // Horizontal flip with probability p. @@ -215,6 +220,7 @@ message Transform { AdjustContrast adjust_contrast = 114; AdjustSaturation adjust_saturation = 115; ColorJitter color_jitter = 116; + Cutout cutout = 117; } } diff --git a/src/transforms/vision/CMakeLists.txt b/src/transforms/vision/CMakeLists.txt index c986ba1e149..e354a90d3e0 100644 --- a/src/transforms/vision/CMakeLists.txt +++ b/src/transforms/vision/CMakeLists.txt @@ -6,6 +6,7 @@ set_full_path(THIS_DIR_SOURCES center_crop.cpp colorize.cpp color_jitter.cpp + cutout.cpp grayscale.cpp horizontal_flip.cpp normalize_to_lbann_layout.cpp diff --git a/src/transforms/vision/cutout.cpp b/src/transforms/vision/cutout.cpp new file mode 100644 index 00000000000..812a973d337 --- /dev/null +++ b/src/transforms/vision/cutout.cpp @@ -0,0 +1,68 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/transforms/vision/cutout.hpp" +#include "lbann/utils/opencv.hpp" + +namespace lbann { +namespace transform { + +void cutout::apply(utils::type_erased_matrix& data, std::vector& dims) { + cv::Mat src = utils::get_opencv_mat(data, dims); + for (size_t i = 0; i < m_num_holes; ++i) { + // Select the center of the hole. + const ssize_t center_x = transform::get_uniform_random_int(0, dims[2]); + const ssize_t center_y = transform::get_uniform_random_int(0, dims[1]); + // Compute top-left corner and bottom-right corners of the hole. + const ssize_t length = static_cast(m_length); + const size_t x1 = std::max(center_x - length / 2, 0l); + const size_t x2 = std::min(center_x + length / 2, + static_cast(dims[2]) - 1); + const size_t y1 = std::max(center_y - length / 2, 0l); + const size_t y2 = std::min(center_y + length / 2, + static_cast(dims[1]) - 1); + // Convert to height/width. + const size_t h = y2 - y1; + const size_t w = x2 - x1; + // Sanity check. + if (x1 >= static_cast(src.cols) || + y1 >= static_cast(src.rows) || + (x1 + w) > static_cast(src.cols) || + (y1 + h) > static_cast(src.rows)) { + std::stringstream ss; + ss << "Bad hole dimensions for " << src.rows << "x" << src.cols << ": " + << h << "x" << w << " at (" << x1 << "," << y1 << ")"; + LBANN_ERROR(ss.str()); + } + std::cout << "Cutout: " << x1 << " " << y1 << " " << w << " " << h << std::endl; + // This will be just a view into the original. + cv::Mat hole = src(cv::Rect(x1, y1, w, h)); + hole = 0; + } +} + +} // namespace transform +} // namespace lbann From 3b18632b09f33db6d0d00b336797e33edf44606a Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Wed, 5 Jun 2019 06:17:12 -0700 Subject: [PATCH 092/634] Remove print statement leftover from debugging. --- src/transforms/vision/cutout.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transforms/vision/cutout.cpp b/src/transforms/vision/cutout.cpp index 812a973d337..be2d57262dd 100644 --- a/src/transforms/vision/cutout.cpp +++ b/src/transforms/vision/cutout.cpp @@ -57,7 +57,6 @@ void cutout::apply(utils::type_erased_matrix& data, std::vector& dims) { << h << "x" << w << " at (" << x1 << "," << y1 << ")"; LBANN_ERROR(ss.str()); } - std::cout << "Cutout: " << x1 << " " << y1 << " " << w << " " << h << std::endl; // This will be just a view into the original. cv::Mat hole = src(cv::Rect(x1, y1, w, h)); hole = 0; From 1e7953b04863ee250bac5e0f0dbe1c2cb226c3f7 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Tue, 4 Jun 2019 21:58:06 -0700 Subject: [PATCH 093/634] Initial implementation of mixup. 
Conflicts: src/proto/lbann.proto --- include/lbann/callbacks/CMakeLists.txt | 1 + include/lbann/callbacks/callback_mixup.hpp | 75 +++++++++++++++++++ include/lbann/lbann.hpp | 1 + src/callbacks/CMakeLists.txt | 1 + src/callbacks/callback_mixup.cpp | 83 ++++++++++++++++++++++ src/proto/factories/callback_factory.cpp | 12 ++++ src/proto/lbann.proto | 7 ++ 7 files changed, 180 insertions(+) create mode 100644 include/lbann/callbacks/callback_mixup.hpp create mode 100644 src/callbacks/callback_mixup.cpp diff --git a/include/lbann/callbacks/CMakeLists.txt b/include/lbann/callbacks/CMakeLists.txt index 8466fdf53ef..d9043c2e48a 100644 --- a/include/lbann/callbacks/CMakeLists.txt +++ b/include/lbann/callbacks/CMakeLists.txt @@ -21,6 +21,7 @@ set_full_path(THIS_DIR_HEADERS callback_io.hpp callback_learning_rate.hpp callback_ltfb.hpp + callback_mixup.hpp callback_perturb_adam.hpp callback_print.hpp callback_save_images.hpp diff --git a/include/lbann/callbacks/callback_mixup.hpp b/include/lbann/callbacks/callback_mixup.hpp new file mode 100644 index 00000000000..fdc0e9dff2c --- /dev/null +++ b/include/lbann/callbacks/callback_mixup.hpp @@ -0,0 +1,75 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_MIXUP_HPP +#define LBANN_CALLBACKS_MIXUP_HPP + +#include +#include + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { + +/** + * Apply mixup to named input layers. + * See: + * Zhang, H. et al. "mixup: Beyond Empirical Risk Minimization." ICLR, 2018. + * + * This implementation does mixup within a single batch, per the recommendation + * within the paper. + * This approach may create duplicate images, and so uses + * lambda = max(lambda, 1 - lambda) + * for the mixing value. + * This recommendation comes from https://docs.fast.ai/callbacks.mixup.html + * + * The recommended default alpha (from the paper) is 0.4. + */ +class callback_mixup : public lbann_callback { +public: + /** Apply mixup to layers named in layers with mixup parameter alpha. 
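 * (Editor's aside, not part of the original diff: concretely, each sample i
 * is paired with a shuffled partner j and mixed as
 *   x_i <- lambda * x_i + (1 - lambda) * x_j  (and likewise for the targets),
 * with lambda drawn from Beta(alpha, alpha) and clamped to
 * lambda = max(lambda, 1 - lambda) as described above.)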
*/ + callback_mixup(std::unordered_set layers, float alpha) : + lbann_callback(), m_layers(layers), m_alpha(alpha) { + if (alpha < 0.0f) { + LBANN_ERROR("Mixup alpha must be non-negative."); + } + } + + callback_mixup* copy() const override { return new callback_mixup(*this); } + std::string name() const override { return "mixup"; } + + void on_forward_prop_end(model *m, Layer *l) override; + +private: + /** Names of input layers to apply mixup to. */ + std::unordered_set m_layers; + /** mixup parameter. */ + float m_alpha; +}; + +} // namespace lbann + +#endif // LBANN_CALLBACKS_MIXUP_HPP diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 52018264b2b..aacbdafb1d6 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -148,6 +148,7 @@ #include "lbann/callbacks/callback_dump_minibatch_sample_indices.hpp" #include "lbann/callbacks/callback_early_stopping.hpp" #include "lbann/callbacks/callback_ltfb.hpp" +#include "lbann/callbacks/callback_mixup.hpp" #include "lbann/callbacks/callback_save_images.hpp" #include "lbann/callbacks/callback_save_model.hpp" #include "lbann/callbacks/callback_save_topk_models.hpp" diff --git a/src/callbacks/CMakeLists.txt b/src/callbacks/CMakeLists.txt index b9baa45839d..4c520b32a19 100644 --- a/src/callbacks/CMakeLists.txt +++ b/src/callbacks/CMakeLists.txt @@ -20,6 +20,7 @@ set_full_path(THIS_DIR_SOURCES callback_io.cpp callback_learning_rate.cpp callback_ltfb.cpp + callback_mixup.cpp callback_perturb_adam.cpp callback_print.cpp callback_save_images.cpp diff --git a/src/callbacks/callback_mixup.cpp b/src/callbacks/callback_mixup.cpp new file mode 100644 index 00000000000..c739ea70a88 --- /dev/null +++ b/src/callbacks/callback_mixup.cpp @@ -0,0 +1,83 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "lbann/callbacks/callback_mixup.hpp" +#include "lbann/utils/beta.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/image.hpp" + +namespace lbann { + +void callback_mixup::on_forward_prop_end(model *m, Layer *l) { + if (!m_layers.count(l->get_name())) { + return; + } + if (m->get_execution_mode() != execution_mode::training) { + return; // No mixup outside of training. 
+ } + + auto& samples = l->get_local_activations(0); + auto& targets = l->get_local_activations(1); + El::Int mbsize = samples.Width(); + const El::Int samples_height = samples.Height(); + const El::Int targets_height = targets.Height(); + auto& gen = get_fast_generator(); + beta_distribution dist(m_alpha, m_alpha); + + // For now, data must be on the CPU. + if (samples.GetDevice() != El::Device::CPU || + targets.GetDevice() != El::Device::CPU) { + LBANN_ERROR("mixup only works with CPU data"); + } + + // Decide how to mix the mini-batch. + std::vector shuffled_indices(mbsize); + std::iota(shuffled_indices.begin(), shuffled_indices.end(), 0); + std::shuffle(shuffled_indices.begin(), shuffled_indices.end(), gen); + + for (El::Int i = 0; i < mbsize; ++i) { + const El::Int j = shuffled_indices[i]; + if (i == j) { + continue; + } + float lambda = dist(gen); + lambda = std::max(lambda, 1.0f - lambda); + const float lambda_sub = 1.0f - lambda; + DataType* __restrict__ x1_buf = samples.Buffer() + i*samples.LDim(); + const DataType* __restrict__ x2_buf = samples.LockedBuffer() + j*samples.LDim(); + DataType* __restrict__ y1_buf = targets.Buffer() + i*targets.LDim(); + const DataType* __restrict__ y2_buf = targets.LockedBuffer() + j*targets.LDim(); + for (El::Int k = 0; k < samples_height; ++k) { + x1_buf[k] = lambda*x1_buf[k] + lambda_sub*x2_buf[k]; + } + for (El::Int k = 0; k < targets_height; ++k) { + y1_buf[k] = lambda*y1_buf[k] + lambda_sub*y2_buf[k]; + } + } +} + +} // namespace lbann diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index cd7e647ef85..3667e42e943 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -435,6 +435,18 @@ lbann_callback* construct_callback(lbann_comm* comm, params.keep_dropout_factor(), parse_set(params.layers())); } + + ////////////////////////////////////////////////////////////// + // Data augmentation + ////////////////////////////////////////////////////////////// + if (proto_cb.has_mixup()) { + const auto& params = proto_cb.mixup(); + const auto& layers_list = parse_list(params.layers()); + std::unordered_set layers(layers_list.begin(), + layers_list.end()); + return new callback_mixup(layers, params.alpha()); + } + return nullptr; } diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 4f4672e3727..5e52378992d 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -439,6 +439,7 @@ message Callback { CallbackPerturbAdam perturb_adam = 38; CallbackPerturbDropout perturb_dropout = 39; CallbackSaveTopKModels save_topk_models = 40; + CallbackMixup mixup = 41; } message CallbackLTFB { @@ -692,6 +693,12 @@ message CallbackSaveTopKModels { string metric = 3; //metrics to use in evaluating models bool ascending_ordering = 4; //whether to sort metrics per model in ascending order, descending order is default } + +message CallbackMixup { + string layers = 1; + float alpha = 2; +} + //======================================================================== // Weights //======================================================================== From d610d7ddac3ecac2de0345abd4dd018b20e77a1e Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Thu, 20 Jun 2019 13:36:52 -0700 Subject: [PATCH 094/634] change a constructor that was causing clang to throw an error --- include/lbann/utils/any.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/lbann/utils/any.hpp b/include/lbann/utils/any.hpp index 6b55e7caf8b..957ac119974 100644 --- a/include/lbann/utils/any.hpp +++ b/include/lbann/utils/any.hpp @@ -64,7 +64,7 @@ class any ///@{ /** @brief Default construct an empty "any" */ - any() noexcept = default; + any() noexcept {} /** @brief Construct an object holding a T */ template From 6044535eb60c11a09a0d6a49f20c92f7ee79b199 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Wed, 19 Jun 2019 11:00:19 -0700 Subject: [PATCH 095/634] Update to Clang 6 --- bamboo/allocate_and_run.sh | 2 +- bamboo/common_python/tools.py | 16 ++++++------- .../compiler_tests/build_script_specific.sh | 4 ++-- bamboo/compiler_tests/test_compiler.py | 24 +++++++++---------- bamboo/integration_tests/common_code.py | 2 +- ...toencoder_imagenet_objective_functions.csv | 0 ..._autoencoder_mnist_objective_functions.csv | 0 .../expected_performance.csv | 0 .../test_integration_autoencoders.py | 4 ++-- .../test_integration_debug.py | 8 +++---- .../test_integration_performance.py | 12 +++++----- .../test_unit_check_proto_models.py | 4 ++-- bamboo/unit_tests/test_unit_checkpoint.py | 6 ++--- bamboo/unit_tests/test_unit_layer_clamp.py | 4 ++-- .../unit_tests/test_unit_layer_covariance.py | 4 ++-- bamboo/unit_tests/test_unit_layer_elu.py | 4 ++-- bamboo/unit_tests/test_unit_layer_identity.py | 4 ++-- bamboo/unit_tests/test_unit_layer_l1_norm.py | 4 ++-- bamboo/unit_tests/test_unit_layer_l2_norm2.py | 4 ++-- .../unit_tests/test_unit_layer_leaky_relu.py | 4 ++-- .../unit_tests/test_unit_layer_log_sigmoid.py | 4 ++-- .../unit_tests/test_unit_layer_log_softmax.py | 4 ++-- .../test_unit_layer_mean_absolute_error.py | 4 ++-- bamboo/unit_tests/test_unit_layer_relu.py | 4 ++-- bamboo/unit_tests/test_unit_layer_selu.py | 4 ++-- bamboo/unit_tests/test_unit_layer_sigmoid.py | 4 ++-- bamboo/unit_tests/test_unit_layer_softmax.py | 4 ++-- bamboo/unit_tests/test_unit_layer_softplus.py | 4 ++-- bamboo/unit_tests/test_unit_layer_softsign.py | 4 ++-- .../test_unit_layer_squared_difference.py | 4 ++-- .../unit_tests/test_unit_layer_tessellate.py | 4 ++-- bamboo/unit_tests/test_unit_layer_variance.py | 4 ++-- bamboo/unit_tests/test_unit_lbann2_reload.py | 4 ++-- .../unit_tests/test_unit_mnist_conv_graph.py | 4 ++-- .../test_unit_mnist_ridge_regression.py | 4 ++-- .../test_unit_mnist_softmax_classifier.py | 4 ++-- 36 files changed, 87 insertions(+), 87 deletions(-) rename bamboo/integration_tests/expected_values/catalyst/{clang4 => clang6}/expected_conv_autoencoder_imagenet_objective_functions.csv (100%) rename bamboo/integration_tests/expected_values/catalyst/{clang4 => clang6}/expected_conv_autoencoder_mnist_objective_functions.csv (100%) rename bamboo/integration_tests/expected_values/catalyst/{clang4 => clang6}/expected_performance.csv (100%) diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index 0f338913e4b..6aee9e80698 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -34,7 +34,7 @@ if [ ${WEEKLY} -ne 0 ]; then salloc -N16 -t 600 ./run.sh --weekly if [ "${CLUSTER}" = 'catalyst' ]; then cd integration_tests - python -m pytest -s test_integration_performance_full_alexnet_clang4 --weekly --run + python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run python -m pytest -s 
test_integration_performance_full_alexnet_gcc4 --weekly --run python -m pytest -s test_integration_performance_full_alexnet_gcc7 --weekly --run python -m pytest -s test_integration_performance_full_alexnet_intel18 --weekly --run diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 24c429b6bdd..d680b1d9da1 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -360,11 +360,11 @@ def process_executable_existence(executable, skip_no_exe=True): def get_spack_exes(default_dirname, cluster): exes = {} - exes['clang4'] = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) + exes['clang6'] = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) exes['gcc7'] = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) exes['intel19'] = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_rel/build/model_zoo/lbann' % (default_dirname, cluster) - exes['clang4_debug'] = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) + exes['clang6_debug'] = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) exes['gcc7_debug'] = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) exes['intel19_debug'] = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_debug/build/model_zoo/lbann' % (default_dirname, cluster) @@ -374,15 +374,15 @@ def get_spack_exes(default_dirname, cluster): def get_default_exes(default_dirname, cluster): exes = get_spack_exes(default_dirname, cluster) # Use build script as a backup if the Spack build doesn't work. 
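    # (Editor's aside, not part of the original diff: when a Spack-built
    # binary is missing, get_default_exes() falls back to the build-script
    # layout, e.g. build/clang.Release.<cluster>.llnl.gov/install/bin/lbann,
    # as the existence checks below show.)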
- if not os.path.exists(exes['clang4']): - exes['clang4'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['clang6']): + exes['clang6'] = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) if not os.path.exists(exes['gcc7']): exes['gcc7'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) if not os.path.exists(exes['intel19']): exes['intel19'] = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if not os.path.exists(exes['clang4_debug']): - exes['clang4_debug'] = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) + if not os.path.exists(exes['clang6_debug']): + exes['clang6_debug'] = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) if not os.path.exists(exes['gcc7_debug']): exes['gcc7_debug'] = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) if not os.path.exists(exes['intel19_debug']): @@ -393,11 +393,11 @@ def get_default_exes(default_dirname, cluster): if cluster in ['catalyst', 'pascal']: # x86_cpu - catalyst # x86_gpu_pascal - pascal - default_exes['clang4'] = exes['clang4'] + default_exes['clang6'] = exes['clang6'] default_exes['gcc7'] = exes['gcc7'] default_exes['intel19'] = exes['intel19'] - default_exes['clang4_debug'] = exes['clang4_debug'] + default_exes['clang6_debug'] = exes['clang6_debug'] default_exes['gcc7_debug'] = exes['gcc7_debug'] default_exes['intel19_debug'] = exes['intel19_debug'] diff --git a/bamboo/compiler_tests/build_script_specific.sh b/bamboo/compiler_tests/build_script_specific.sh index 92925ceae1c..49833de8b1e 100755 --- a/bamboo/compiler_tests/build_script_specific.sh +++ b/bamboo/compiler_tests/build_script_specific.sh @@ -30,8 +30,8 @@ while :; do shift done -if [ "${COMPILER}" == 'clang4' ]; then - module load clang/4.0.0 +if [ "${COMPILER}" == 'clang6' ]; then + module load clang/6.0.0 ${LBANN_DIR}/scripts/build_lbann_lc.sh --compiler clang ${DEBUG} --reconfigure --with-conduit fi diff --git a/bamboo/compiler_tests/test_compiler.py b/bamboo/compiler_tests/test_compiler.py index 8e08dbd7881..5f637519901 100644 --- a/bamboo/compiler_tests/test_compiler.py +++ b/bamboo/compiler_tests/test_compiler.py @@ -26,25 +26,25 @@ def test_compiler_build_script(cluster, dirname): pytest.skip(e) -def test_compiler_clang4_release(cluster, dirname): +def test_compiler_clang6_release(cluster, dirname): try: - skeleton_clang4(cluster, dirname, False) + skeleton_clang6(cluster, dirname, False) except AssertionError as e: print(e) - build_script(cluster, dirname, 'clang4', False) - path = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) + build_script(cluster, dirname, 'clang6', False) + path = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) -def test_compiler_clang4_debug(cluster, dirname): +def test_compiler_clang6_debug(cluster, dirname): try: - skeleton_clang4(cluster, dirname, True) + skeleton_clang6(cluster, dirname, True) except AssertionError as e: print(e) - build_script(cluster, dirname, 'clang4', True) - path = '%s/bamboo/compiler_tests/builds/%s_clang-4.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) + build_script(cluster, dirname, 'clang6', True) + path = 
'%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) assert os.path.exists(path) @@ -98,12 +98,12 @@ def test_compiler_intel19_debug(cluster, dirname): assert os.path.exists(path) -def skeleton_clang4(cluster, dir_name, debug, should_log=False): +def skeleton_clang6(cluster, dir_name, debug, should_log=False): if cluster in ['catalyst']: - spack_skeleton(dir_name, 'clang@4.0.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'clang@4.0.0', debug, should_log) + spack_skeleton(dir_name, 'clang@6.0.0', 'mvapich2@2.2', debug, should_log) + build_skeleton(dir_name, 'clang@6.0.0', debug, should_log) else: - e = 'skeleton_clang4: Unsupported Cluster %s' % cluster + e = 'skeleton_clang6: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index 939de8295eb..76bb59257c6 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -26,7 +26,7 @@ def get_command(cluster, dir_name, model_folder, model_name, executable, error_file_name=error_file_name) elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']: if (model_name == 'lenet_mnist') and \ - (compiler_name in ['clang4', 'intel19']): + (compiler_name in ['clang6', 'intel19']): partition = 'pbatch' time_limit = 600 else: diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_imagenet_objective_functions.csv similarity index 100% rename from bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_imagenet_objective_functions.csv rename to bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_imagenet_objective_functions.csv diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_mnist_objective_functions.csv similarity index 100% rename from bamboo/integration_tests/expected_values/catalyst/clang4/expected_conv_autoencoder_mnist_objective_functions.csv rename to bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_mnist_objective_functions.csv diff --git a/bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv similarity index 100% rename from bamboo/integration_tests/expected_values/catalyst/clang4/expected_performance.csv rename to bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv diff --git a/bamboo/integration_tests/test_integration_autoencoders.py b/bamboo/integration_tests/test_integration_autoencoders.py index 9e9e325a902..b256b11c8df 100644 --- a/bamboo/integration_tests/test_integration_autoencoders.py +++ b/bamboo/integration_tests/test_integration_autoencoders.py @@ -74,9 +74,9 @@ def skeleton_autoencoder_imagenet(cluster, dir_name, executables, compiler_name, should_log, compiler_name, frequency_str) -def test_integration_autoencoder_imagenet_clang4(cluster, dirname, exes, +def test_integration_autoencoder_imagenet_clang6(cluster, dirname, exes, weekly): - 
skeleton_autoencoder_imagenet(cluster, dirname, exes, 'clang4', weekly) + skeleton_autoencoder_imagenet(cluster, dirname, exes, 'clang6', weekly) def test_integration_autoencoder_imagenet_gcc7(cluster, dirname, exes, weekly): diff --git a/bamboo/integration_tests/test_integration_debug.py b/bamboo/integration_tests/test_integration_debug.py index be26995d0c4..cca69a66ff0 100644 --- a/bamboo/integration_tests/test_integration_debug.py +++ b/bamboo/integration_tests/test_integration_debug.py @@ -60,12 +60,12 @@ def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, assert output_value == 0 -def test_integration_mnist_clang4_debug(cluster, dirname, exes, weekly, debug_build): - skeleton_mnist_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug_build) +def test_integration_mnist_clang6_debug(cluster, dirname, exes, weekly, debug_build): + skeleton_mnist_debug(cluster, dirname, exes, 'clang6_debug', weekly, debug_build) -def test_integration_cifar_clang4_debug(cluster, dirname, exes, weekly, debug_build): - skeleton_cifar_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug_build) +def test_integration_cifar_clang6_debug(cluster, dirname, exes, weekly, debug_build): + skeleton_cifar_debug(cluster, dirname, exes, 'clang6_debug', weekly, debug_build) def test_integration_mnist_gcc7_debug(cluster, dirname, exes, weekly, debug_build): diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index e9e74f7a80d..1a77589732b 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -174,17 +174,17 @@ def skeleton_performance_full_alexnet(cluster, dir_name, executables, cluster) -def test_integration_performance_lenet_mnist_clang4(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'clang4') +def test_integration_performance_lenet_mnist_clang6(cluster, dirname, exes): + skeleton_performance_lenet_mnist(cluster, dirname, exes, 'clang6') -def test_integration_performance_alexnet_clang4(cluster, dirname, exes, weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'clang4', weekly) +def test_integration_performance_alexnet_clang6(cluster, dirname, exes, weekly): + skeleton_performance_alexnet(cluster, dirname, exes, 'clang6', weekly) -def test_integration_performance_full_alexnet_clang4(cluster, dirname, exes, +def test_integration_performance_full_alexnet_clang6(cluster, dirname, exes, weekly, run): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'clang4', weekly, + skeleton_performance_full_alexnet(cluster, dirname, exes, 'clang6', weekly, run) diff --git a/bamboo/unit_tests/test_unit_check_proto_models.py b/bamboo/unit_tests/test_unit_check_proto_models.py index 7b497f1143d..431449dc960 100644 --- a/bamboo/unit_tests/test_unit_check_proto_models.py +++ b/bamboo/unit_tests/test_unit_check_proto_models.py @@ -118,8 +118,8 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): assert num_defective == 0 -def test_unit_models_clang4(cluster, dirname, exes): - skeleton_models(cluster, dirname, exes, 'clang4') +def test_unit_models_clang6(cluster, dirname, exes): + skeleton_models(cluster, dirname, exes, 'clang6') def test_unit_models_gcc7(cluster, dirname, exes): diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index 4824c32d104..be468dccbfc 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ 
b/bamboo/unit_tests/test_unit_checkpoint.py @@ -123,9 +123,9 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, assert diff_test == 0 -def test_unit_checkpoint_lenet_clang4(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'clang4') - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'clang4') +def test_unit_checkpoint_lenet_clang6(cluster, exes, dirname): + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'clang6') + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'clang6') def test_unit_checkpoint_lenet_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py b/bamboo/unit_tests/test_unit_layer_clamp.py index 67bb1be15de..56b8ca50520 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -23,8 +23,8 @@ def skeleton_layer_clamp(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_clamp_clang4(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'clang4') +def test_unit_layer_clamp_clang6(cluster, exes, dirname): + skeleton_layer_clamp(cluster, exes, dirname, 'clang6') def test_unit_layer_clamp_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py index b836208e68e..7b7de9b50ca 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -23,8 +23,8 @@ def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_covariance_clang4(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'clang4') +def test_unit_layer_covariance_clang6(cluster, exes, dirname): + skeleton_layer_covariance(cluster, exes, dirname, 'clang6') def test_unit_layer_covariance_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index f96fcafdd73..4e5abe80398 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -23,8 +23,8 @@ def skeleton_layer_elu(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_elu_clang4(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'clang4') +def test_unit_layer_elu_clang6(cluster, exes, dirname): + skeleton_layer_elu(cluster, exes, dirname, 'clang6') def test_unit_layer_elu_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py index 72bc91af09e..448531354d4 100644 --- a/bamboo/unit_tests/test_unit_layer_identity.py +++ b/bamboo/unit_tests/test_unit_layer_identity.py @@ -23,8 +23,8 @@ def skeleton_layer_identity(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_identity_clang4(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'clang4') +def test_unit_layer_identity_clang6(cluster, exes, dirname): + skeleton_layer_identity(cluster, exes, dirname, 'clang6') def test_unit_layer_identity_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index adebb726417..b1362658093 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -23,8 +23,8 @@ def skeleton_layer_l1_norm(cluster, 
executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_l1_norm_clang4(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'clang4') +def test_unit_layer_l1_norm_clang6(cluster, exes, dirname): + skeleton_layer_l1_norm(cluster, exes, dirname, 'clang6') def test_unit_layer_l1_norm_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py b/bamboo/unit_tests/test_unit_layer_l2_norm2.py index 9670b6ef7bc..2df17ef30e1 100644 --- a/bamboo/unit_tests/test_unit_layer_l2_norm2.py +++ b/bamboo/unit_tests/test_unit_layer_l2_norm2.py @@ -23,8 +23,8 @@ def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_l2_norm2_clang4(cluster, exes, dirname): - skeleton_layer_l2_norm2(cluster, exes, dirname, 'clang4') +def test_unit_layer_l2_norm2_clang6(cluster, exes, dirname): + skeleton_layer_l2_norm2(cluster, exes, dirname, 'clang6') def test_unit_layer_l2_norm2_gcc7(cluster, exes, dirname): skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc7') diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index a1a9b020a2b..d62d559bb3e 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -23,8 +23,8 @@ def skeleton_layer_leaky_relu(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_leaky_relu_clang4(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'clang4') +def test_unit_layer_leaky_relu_clang6(cluster, exes, dirname): + skeleton_layer_leaky_relu(cluster, exes, dirname, 'clang6') def test_unit_layer_leaky_relu_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index f4b4634cc6e..b2b53a46a86 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -23,8 +23,8 @@ def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_log_sigmoid_clang4(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'clang4') +def test_unit_layer_log_sigmoid_clang6(cluster, exes, dirname): + skeleton_layer_log_sigmoid(cluster, exes, dirname, 'clang6') def test_unit_layer_log_sigmoid_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 0345180165f..234b068d714 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -23,8 +23,8 @@ def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_log_softmax_clang4(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'clang4') +def test_unit_layer_log_softmax_clang6(cluster, exes, dirname): + skeleton_layer_log_softmax(cluster, exes, dirname, 'clang6') def test_unit_layer_log_softmax_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index 0a623c48dcc..744c8e5cd89 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -23,8 +23,8 @@ def skeleton_layer_mean_absolute_error(cluster, executables, 
dir_name, compiler_ assert return_code == 0 -def test_unit_layer_mean_absolute_error_clang4(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'clang4') +def test_unit_layer_mean_absolute_error_clang6(cluster, exes, dirname): + skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'clang6') def test_unit_layer_mean_absolute_error_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index 0e3da6ecbe9..39d10030ab3 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -23,8 +23,8 @@ def skeleton_layer_relu(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_relu_clang4(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'clang4') +def test_unit_layer_relu_clang6(cluster, exes, dirname): + skeleton_layer_relu(cluster, exes, dirname, 'clang6') def test_unit_layer_relu_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index eb044bf6d30..f41f4d5ed57 100644 --- a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -23,8 +23,8 @@ def skeleton_layer_selu(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_selu_clang4(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'clang4') +def test_unit_layer_selu_clang6(cluster, exes, dirname): + skeleton_layer_selu(cluster, exes, dirname, 'clang6') def test_unit_layer_selu_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index fab0d235be8..7d75d32c1d0 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -23,8 +23,8 @@ def skeleton_layer_sigmoid(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_sigmoid_clang4(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'clang4') +def test_unit_layer_sigmoid_clang6(cluster, exes, dirname): + skeleton_layer_sigmoid(cluster, exes, dirname, 'clang6') def test_unit_layer_sigmoid_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index 44f78a154a4..af1fa09ac17 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -23,8 +23,8 @@ def skeleton_layer_softmax(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_softmax_clang4(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'clang4') +def test_unit_layer_softmax_clang6(cluster, exes, dirname): + skeleton_layer_softmax(cluster, exes, dirname, 'clang6') def test_unit_layer_softmax_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py index 5f0d013df9d..cfcccf9b694 100644 --- a/bamboo/unit_tests/test_unit_layer_softplus.py +++ b/bamboo/unit_tests/test_unit_layer_softplus.py @@ -23,8 +23,8 @@ def skeleton_layer_softplus(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_softplus_clang4(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'clang4') +def test_unit_layer_softplus_clang6(cluster, exes, dirname): + 
skeleton_layer_softplus(cluster, exes, dirname, 'clang6') def test_unit_layer_softplus_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py index c0bea317b76..bff14d3e789 100644 --- a/bamboo/unit_tests/test_unit_layer_softsign.py +++ b/bamboo/unit_tests/test_unit_layer_softsign.py @@ -23,8 +23,8 @@ def skeleton_layer_softsign(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_softsign_clang4(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'clang4') +def test_unit_layer_softsign_clang6(cluster, exes, dirname): + skeleton_layer_softsign(cluster, exes, dirname, 'clang6') def test_unit_layer_softsign_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py b/bamboo/unit_tests/test_unit_layer_squared_difference.py index 2e9cc3d198f..6050310b4dc 100644 --- a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -23,8 +23,8 @@ def skeleton_layer_squared_difference(cluster, executables, dir_name, compiler_n assert return_code == 0 -def test_unit_layer_squared_difference_clang4(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'clang4') +def test_unit_layer_squared_difference_clang6(cluster, exes, dirname): + skeleton_layer_squared_difference(cluster, exes, dirname, 'clang6') def test_unit_layer_squared_difference_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index 7619ee3b5e3..cef99ca567c 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -23,8 +23,8 @@ def skeleton_layer_tessellate(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_tessellate_clang4(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'clang4') +def test_unit_layer_tessellate_clang6(cluster, exes, dirname): + skeleton_layer_tessellate(cluster, exes, dirname, 'clang6') def test_unit_layer_tessellate_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index e8422c3a70e..b1bb6803707 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -23,8 +23,8 @@ def skeleton_layer_variance(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_layer_variance_clang4(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'clang4') +def test_unit_layer_variance_clang6(cluster, exes, dirname): + skeleton_layer_variance(cluster, exes, dirname, 'clang6') def test_unit_layer_variance_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index d48b28873ce..f90b33ce62c 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -118,10 +118,10 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): assert diff_result == 0 -def test_unit_lbann2_reload_clang4(cluster, exes, dirname): +def test_unit_lbann2_reload_clang6(cluster, exes, dirname): if cluster == 'catalyst': # STILL ERRORS pytest.skip('FIXME') - skeleton_lbann2_reload(cluster, exes, dirname, 'clang4') + skeleton_lbann2_reload(cluster, 
exes, dirname, 'clang6') def test_unit_lbann2_reload_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_mnist_conv_graph.py b/bamboo/unit_tests/test_unit_mnist_conv_graph.py index 829fffdff2d..530501a7035 100644 --- a/bamboo/unit_tests/test_unit_mnist_conv_graph.py +++ b/bamboo/unit_tests/test_unit_mnist_conv_graph.py @@ -30,8 +30,8 @@ def skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name): assert return_code == 0 -def test_unit_mnist_conv_graph_clang4(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'clang4') +def test_unit_mnist_conv_graph_clang6(cluster, exes, dirname): + skeleton_mnist_conv_graph(cluster, exes, dirname, 'clang6') def test_unit_mnist_conv_graph_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py index 5b27b342cb4..521a59a310f 100644 --- a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py +++ b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py @@ -24,8 +24,8 @@ def skeleton_mnist_ridge_regression(cluster, executables, dir_name, compiler_nam assert return_code == 0 -def test_unit_mnist_ridge_regression_clang4(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'clang4') +def test_unit_mnist_ridge_regression_clang6(cluster, exes, dirname): + skeleton_mnist_ridge_regression(cluster, exes, dirname, 'clang6') def test_unit_mnist_ridge_regression_gcc7(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py index 1c2c2353100..36cb5f7ce86 100644 --- a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py +++ b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py @@ -24,8 +24,8 @@ def skeleton_mnist_softmax_classifier(cluster, executables, dir_name, compiler_n assert return_code == 0 -def test_unit_mnist_softmax_classifier_clang4(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'clang4') +def test_unit_mnist_softmax_classifier_clang6(cluster, exes, dirname): + skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'clang6') def test_unit_mnist_softmax_classifier_gcc7(cluster, exes, dirname): From 9474f50108229c430b67e6d206160730c93fcd9b Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Thu, 20 Jun 2019 15:01:12 -0700 Subject: [PATCH 096/634] fix a copy-paste error in FindBreathe.cmake --- cmake/modules/FindBreathe.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/modules/FindBreathe.cmake b/cmake/modules/FindBreathe.cmake index c1f2d2c5fa2..36f9499c1b8 100644 --- a/cmake/modules/FindBreathe.cmake +++ b/cmake/modules/FindBreathe.cmake @@ -10,7 +10,7 @@ find_program(BREATHE_EXECUTABLE breathe-apidoc PATH_SUFFIXES bin DOC "The breathe documentation tool." 
                              NO_DEFAULT_PATH)
-find_program(BREATHE_EXECUTABLE breathe-build)
+find_program(BREATHE_EXECUTABLE breathe-apidoc)

 # Standard handling of the package arguments
 include(FindPackageHandleStandardArgs)

From 0e077799fe5ab8483a2d4713aa935fd903b5fe44 Mon Sep 17 00:00:00 2001
From: Nikoli Dryden
Date: Thu, 20 Jun 2019 20:06:37 -0500
Subject: [PATCH 097/634] Apply suggestions from code review

Co-Authored-By: Tim Moon
---
 src/transforms/vision/cutout.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transforms/vision/cutout.cpp b/src/transforms/vision/cutout.cpp
index be2d57262dd..578f1ab8939 100644
--- a/src/transforms/vision/cutout.cpp
+++ b/src/transforms/vision/cutout.cpp
@@ -38,8 +38,8 @@ void cutout::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
   const ssize_t center_y = transform::get_uniform_random_int(0, dims[1]);
   // Compute top-left corner and bottom-right corners of the hole.
   const ssize_t length = static_cast<ssize_t>(m_length);
-  const size_t x1 = std::max(center_x - length / 2, 0l);
-  const size_t x2 = std::min(center_x + length / 2,
+  const size_t x1 = std::max(center_x - length / 2, 0);
+  const size_t x2 = std::min(x1 + length,
                              static_cast<ssize_t>(dims[2]) - 1);
   const size_t y1 = std::max(center_y - length / 2, 0l);
   const size_t y2 = std::min(center_y + length / 2,
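The suggestions above tighten the corner arithmetic for the cutout hole: do the math on signed values, since center - length / 2 can go negative, then clamp into the image bounds. Below is a minimal standalone sketch of that clamping, with made-up dims, center, and length values, and plain long in place of ssize_t; patches 100 and 101 further down iterate on the remaining signed/unsigned mismatch.

    #include <algorithm>
    #include <vector>

    int main() {
      // Hypothetical example values; dims follow the {channels, height, width}
      // convention used by the transform above.
      const std::vector<long> dims = {3, 32, 32};
      const long length = 8;                   // side of the square hole
      const long center_x = 2, center_y = 30;  // hole center, near the borders
      // Signed arithmetic first, so the near corner can be clamped up to 0...
      const long x1 = std::max(center_x - length / 2, 0L);
      const long y1 = std::max(center_y - length / 2, 0L);
      // ...then clamp the far corner into [0, dim - 1].
      const long x2 = std::min(x1 + length, dims[2] - 1);
      const long y2 = std::min(y1 + length, dims[1] - 1);
      const long h = y2 - y1;  // height of the (possibly truncated) hole
      const long w = x2 - x1;  // width of the hole
      return (h >= 0 && w >= 0) ? 0 : 1;
    }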
From 1f92e003291a49ac670af6c82ce00f2afe0a26fa Mon Sep 17 00:00:00 2001
From: Nikoli Dryden
Date: Thu, 20 Jun 2019 18:08:16 -0700
Subject: [PATCH 098/634] Update doxygen.

---
 include/lbann/transforms/vision/cutout.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/lbann/transforms/vision/cutout.hpp b/include/lbann/transforms/vision/cutout.hpp
index 1e89a85a0da..861a4b655a7 100644
--- a/include/lbann/transforms/vision/cutout.hpp
+++ b/include/lbann/transforms/vision/cutout.hpp
@@ -34,7 +34,9 @@ namespace transform {

 /**
  * Cutout data augmentation which randomly masks out square regions of input.
+ *
  * See:
+ *
  * DeVries and Taylor. "Improved Regularization of Convolutional Neural
  * Networks with Cutout". arXiv preprint arXiv:1708.04552 (2017).
  *

From f84e232b7ab08f6bde654d82d78010a8bfbbfe37 Mon Sep 17 00:00:00 2001
From: Nikoli Dryden
Date: Thu, 20 Jun 2019 20:30:03 -0700
Subject: [PATCH 099/634] Address reviews.

---
 include/lbann/callbacks/callback_mixup.hpp |  6 ++++
 src/callbacks/callback_mixup.cpp           | 34 +++++++++++++++-------
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/include/lbann/callbacks/callback_mixup.hpp b/include/lbann/callbacks/callback_mixup.hpp
index fdc0e9dff2c..3159afb9787 100644
--- a/include/lbann/callbacks/callback_mixup.hpp
+++ b/include/lbann/callbacks/callback_mixup.hpp
@@ -36,14 +36,20 @@ namespace lbann {

 /**
  * Apply mixup to named input layers.
+ *
  * See:
+ *
  * Zhang, H. et al. "mixup: Beyond Empirical Risk Minimization." ICLR, 2018.
  *
  * This implementation does mixup within a single batch, per the recommendation
  * within the paper.
+ *
  * This approach may create duplicate images, and so uses
+ *
  * lambda = max(lambda, 1 - lambda)
+ *
  * for the mixing value.
+ *
  * This recommendation comes from https://docs.fast.ai/callbacks.mixup.html
  *
  * The recommended default alpha (from the paper) is 0.4.
diff --git a/src/callbacks/callback_mixup.cpp b/src/callbacks/callback_mixup.cpp
index c739ea70a88..9f58a475d04 100644
--- a/src/callbacks/callback_mixup.cpp
+++ b/src/callbacks/callback_mixup.cpp
@@ -40,17 +40,26 @@ void callback_mixup::on_forward_prop_end(model *m, Layer *l) {
     return;  // No mixup outside of training.
   }

-  auto& samples = l->get_local_activations(0);
-  auto& targets = l->get_local_activations(1);
+  auto& samples_orig = l->get_local_activations(0);
+  auto& labels_orig = l->get_local_activations(1);
+  if (samples_orig.GetDevice() != El::Device::CPU ||
+      labels_orig.GetDevice() != El::Device::CPU) {
+    LBANN_ERROR("Mixup requires CPU data.");
+  }
+  // Copy samples.
+  // Assumes data are on CPU.
+  CPUMat samples, labels;
+  El::Copy(samples_orig, samples);
+  El::Copy(labels_orig, labels);
   El::Int mbsize = samples.Width();
   const El::Int samples_height = samples.Height();
-  const El::Int targets_height = targets.Height();
+  const El::Int labels_height = labels.Height();
   auto& gen = get_fast_generator();
   beta_distribution<float> dist(m_alpha, m_alpha);
   // For now, data must be on the CPU.
   if (samples.GetDevice() != El::Device::CPU ||
-      targets.GetDevice() != El::Device::CPU) {
+      labels.GetDevice() != El::Device::CPU) {
     LBANN_ERROR("mixup only works with CPU data");
   }
@@ -59,6 +68,7 @@ void callback_mixup::on_forward_prop_end(model *m, Layer *l) {
   std::iota(shuffled_indices.begin(), shuffled_indices.end(), 0);
   std::shuffle(shuffled_indices.begin(), shuffled_indices.end(), gen);

+  LBANN_OMP_PARALLEL_FOR
   for (El::Int i = 0; i < mbsize; ++i) {
     const El::Int j = shuffled_indices[i];
     if (i == j) {
@@ -67,15 +77,17 @@ void callback_mixup::on_forward_prop_end(model *m, Layer *l) {
     float lambda = dist(gen);
     lambda = std::max(lambda, 1.0f - lambda);
     const float lambda_sub = 1.0f - lambda;
-    DataType* __restrict__ x1_buf = samples.Buffer() + i*samples.LDim();
-    const DataType* __restrict__ x2_buf = samples.LockedBuffer() + j*samples.LDim();
-    DataType* __restrict__ y1_buf = targets.Buffer() + i*targets.LDim();
-    const DataType* __restrict__ y2_buf = targets.LockedBuffer() + j*targets.LDim();
+    const DataType* __restrict__ x1_buf = samples.LockedBuffer(0, i);
+    const DataType* __restrict__ x2_buf = samples.LockedBuffer(0, j);
+    DataType* __restrict__ x = samples_orig.Buffer(0, i);
+    const DataType* __restrict__ y1_buf = labels.LockedBuffer(0, i);
+    const DataType* __restrict__ y2_buf = labels.LockedBuffer(0, j);
+    DataType* __restrict__ y = labels_orig.Buffer(0, i);
     for (El::Int k = 0; k < samples_height; ++k) {
-      x1_buf[k] = lambda*x1_buf[k] + lambda_sub*x2_buf[k];
+      x[k] = lambda*x1_buf[k] + lambda_sub*x2_buf[k];
     }
-    for (El::Int k = 0; k < targets_height; ++k) {
-      y1_buf[k] = lambda*y1_buf[k] + lambda_sub*y2_buf[k];
+    for (El::Int k = 0; k < labels_height; ++k) {
+      y[k] = lambda*y1_buf[k] + lambda_sub*y2_buf[k];
     }
   }
 }
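For readers unfamiliar with the technique, the heart of the callback above is: draw lambda from Beta(alpha, alpha), bias it with max(lambda, 1 - lambda) so near-duplicate mixes are avoided, and form convex combinations of two samples and of their labels. The self-contained sketch below shows just that update; the standard library has no beta distribution, so one is derived here from two gamma draws (LBANN's own beta_distribution class is used only in the patch, not in this sketch), and all data values are invented.

    #include <algorithm>
    #include <cstddef>
    #include <random>
    #include <vector>

    int main() {
      std::mt19937 gen(20190620);
      const float alpha = 0.4f;  // recommended default from the mixup paper
      // Beta(alpha, alpha) sample via X / (X + Y) with X, Y ~ Gamma(alpha, 1).
      std::gamma_distribution<float> gamma(alpha, 1.0f);
      const float gx = gamma(gen), gy = gamma(gen);
      float lambda = gx / (gx + gy);
      lambda = std::max(lambda, 1.0f - lambda);  // bias away from duplicates
      const float lambda_sub = 1.0f - lambda;

      // Two made-up samples and their one-hot labels.
      std::vector<float> x_i = {0.1f, 0.9f, 0.5f}, x_j = {0.8f, 0.2f, 0.4f};
      std::vector<float> y_i = {1.0f, 0.0f}, y_j = {0.0f, 1.0f};
      for (std::size_t k = 0; k < x_i.size(); ++k) {
        x_i[k] = lambda * x_i[k] + lambda_sub * x_j[k];  // mixed sample
      }
      for (std::size_t k = 0; k < y_i.size(); ++k) {
        y_i[k] = lambda * y_i[k] + lambda_sub * y_j[k];  // mixed soft label
      }
      return 0;
    }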
From c6f779b4731638b40186a7ac1b5f5ee543c7e9ef Mon Sep 17 00:00:00 2001
From: Nikoli Dryden
Date: Fri, 21 Jun 2019 05:30:22 -0500
Subject: [PATCH 100/634] Apply more suggestions from code review

Co-Authored-By: Tim Moon
---
 include/lbann/transforms/vision/cutout.hpp | 2 +-
 src/transforms/vision/cutout.cpp           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/lbann/transforms/vision/cutout.hpp b/include/lbann/transforms/vision/cutout.hpp
index 861a4b655a7..861877e0d3e 100644
--- a/include/lbann/transforms/vision/cutout.hpp
+++ b/include/lbann/transforms/vision/cutout.hpp
@@ -51,7 +51,7 @@ class cutout : public transform {
   /**
    * Cutout with a given number of squares of a given size.
    * @param num_holes Number of squares to mask out (must be positive).
-   * @param size Length of a side of the square (must be positive).
+   * @param length Length of a side of the square (must be positive).
    */
   cutout(size_t num_holes, size_t length)
     : transform(), m_num_holes(num_holes), m_length(length) {
diff --git a/src/transforms/vision/cutout.cpp b/src/transforms/vision/cutout.cpp
index 578f1ab8939..e61661117e9 100644
--- a/src/transforms/vision/cutout.cpp
+++ b/src/transforms/vision/cutout.cpp
@@ -41,8 +41,8 @@ void cutout::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
   const size_t x1 = std::max(center_x - length / 2, 0);
   const size_t x2 = std::min(x1 + length,
                              static_cast<ssize_t>(dims[2]) - 1);
-  const size_t y1 = std::max(center_y - length / 2, 0l);
-  const size_t y2 = std::min(center_y + length / 2,
+  const size_t y1 = std::max(center_y - length / 2, 0);
+  const size_t y2 = std::min(y1 + length,
                              static_cast<ssize_t>(dims[1]) - 1);
   // Convert to height/width.
   const size_t h = y2 - y1;

From c91461654764a018ca426f152c3cd91501b9bfe7 Mon Sep 17 00:00:00 2001
From: Nikoli Dryden
Date: Fri, 21 Jun 2019 07:11:10 -0500
Subject: [PATCH 101/634] Fix type issue.

---
 src/transforms/vision/cutout.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/transforms/vision/cutout.cpp b/src/transforms/vision/cutout.cpp
index e61661117e9..419a78b5e28 100644
--- a/src/transforms/vision/cutout.cpp
+++ b/src/transforms/vision/cutout.cpp
@@ -38,12 +38,10 @@ void cutout::apply(utils::type_erased_matrix& data, std::vector<size_t>& dims) {
   const ssize_t center_y = transform::get_uniform_random_int(0, dims[1]);
   // Compute top-left corner and bottom-right corners of the hole.
   const ssize_t length = static_cast<ssize_t>(m_length);
-  const size_t x1 = std::max(center_x - length / 2, 0);
-  const size_t x2 = std::min(x1 + length,
-                             static_cast<ssize_t>(dims[2]) - 1);
-  const size_t y1 = std::max(center_y - length / 2, 0);
-  const size_t y2 = std::min(y1 + length,
-                             static_cast<ssize_t>(dims[1]) - 1);
+  const size_t x1 = std::max(center_x - length / 2, 0l);
+  const size_t x2 = std::min(x1 + length, dims[2] - 1);
+  const size_t y1 = std::max(center_y - length / 2, 0l);
+  const size_t y2 = std::min(y1 + length, dims[1] - 1);
   // Convert to height/width.
   const size_t h = y2 - y1;
   const size_t w = x2 - x1;

From 751287f5273810161fe0a9e0bc656b56aee8fbca Mon Sep 17 00:00:00 2001
From: Nikoli Dryden
Date: Fri, 21 Jun 2019 10:10:50 -0500
Subject: [PATCH 102/634] Pass reservation correctly.

Also fix some formatting.
--- python/lbann/contrib/lc/launcher.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index e459a13978c..c96f1b3a8ef 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -67,17 +67,18 @@ def run(model, data_reader, optimizer, # Run LBANN lbann.launcher.run(model, data_reader, optimizer, - lbann_exe = lbann_exe, - lbann_args = lbann_args, - experiment_dir = experiment_dir, - nodes = nodes, - procs_per_node = procs_per_node, - time_limit = time_limit, - scheduler = scheduler, - job_name = job_name, - system = system, - partition = partition, - account = account, - launcher_args = launcher_args, - environment = environment, - setup_only = setup_only) + lbann_exe=lbann_exe, + lbann_args=lbann_args, + experiment_dir=experiment_dir, + nodes=nodes, + procs_per_node=procs_per_node, + time_limit=time_limit, + scheduler=scheduler, + job_name=job_name, + system=system, + partition=partition, + account=account, + reservation=reservation, + launcher_args=launcher_args, + environment=environment, + setup_only=setup_only) From 19ba5886e9034138ddf70af004363bc6f434ddc8 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 21 Jun 2019 12:41:44 -0700 Subject: [PATCH 103/634] Further test clean up --- bamboo/allocate_and_run.sh | 7 +++---- bamboo/common_python/tools.py | 10 ++++++++-- bamboo/integration_tests/full_alexnet.sh | 2 +- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index 6aee9e80698..5cb5489f10f 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -31,15 +31,14 @@ if [ "${CLUSTER}" = 'pascal' ]; then fi if [ ${WEEKLY} -ne 0 ]; then - salloc -N16 -t 600 ./run.sh --weekly + salloc -N16 -t 900 ./run.sh --weekly if [ "${CLUSTER}" = 'catalyst' ]; then cd integration_tests python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run - python -m pytest -s test_integration_performance_full_alexnet_gcc4 --weekly --run python -m pytest -s test_integration_performance_full_alexnet_gcc7 --weekly --run - python -m pytest -s test_integration_performance_full_alexnet_intel18 --weekly --run + python -m pytest -s test_integration_performance_full_alexnet_intel19 --weekly --run cd .. fi else - salloc -N16 -t 600 ./run.sh + salloc -N16 -t 900 ./run.sh fi diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index d680b1d9da1..e209f1e1c3e 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -6,7 +6,7 @@ def check_list(substrings, strings): errors = [] for string in strings: for substring in substrings: - if (string != None) and (substring in string): + if (string is not None) and (substring in string): errors.append('%s contains %s' % (string, substring)) return errors @@ -79,6 +79,7 @@ def get_command(cluster, command_allocate = '' # Allocate nodes only if we don't already have an allocation. 
if os.getenv('SLURM_JOB_NUM_NODES') is None: + print('Allocating slurm nodes.') command_allocate = 'salloc' option_num_nodes = '' option_partition = '' @@ -105,6 +106,8 @@ def get_command(cluster, command_allocate = '%s%s%s%s' % ( command_allocate, option_num_nodes, option_partition, option_time_limit) + else: + print('slurm nodes already allocated.') # Create run command if command_allocate == '': @@ -123,6 +126,7 @@ def get_command(cluster, command_allocate = '' # Allocate nodes only if we don't already have an allocation. if os.getenv('LSB_HOSTS') is None: + print('Allocating lsf nodes.') command_allocate = 'bsub' # x => Puts the host running your job into exclusive execution # mode. @@ -160,6 +164,8 @@ def get_command(cluster, command_allocate, option_exclusive, option_group, option_interactive, option_num_processes, option_partition, option_processes_per_node, option_time_limit) + else: + print('lsf nodes already allocated.') # Create run command if command_allocate == '': @@ -261,7 +267,7 @@ def get_command(cluster, # option_data_filename_train = data_filename_train_default # option_data_filedir_test = data_filedir_test_default # option_data_filename_train = data_filename_test_default - pass # No need to pass in a parameter + pass # No need to pass in a parameter elif cluster == 'ray': option_data_filedir_train = ' --data_filedir_train=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filedir_train_default) option_data_filename_train = ' --data_filename_train=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filename_train_default) diff --git a/bamboo/integration_tests/full_alexnet.sh b/bamboo/integration_tests/full_alexnet.sh index ff1b5cf1c76..8ce75add3a2 100755 --- a/bamboo/integration_tests/full_alexnet.sh +++ b/bamboo/integration_tests/full_alexnet.sh @@ -26,4 +26,4 @@ LBANN_DIR=$(git rev-parse --show-toplevel) CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') # Experiment -srun --nodes=128 --ntasks-per-node=2 ${LBANN_DIR}/bamboo/compiler_tests/builds/catalyst_gcc-4.9.3_x86_64_mvapich2-2.2_openblas_rel/build/model_zoo/lbann --model=${LBANN_DIR}/model_zoo/models/alexnet/model_alexnet.prototext --optimizer=${LBANN_DIR}/model_zoo/optimizers/opt_sgd.prototext --reader=${LBANN_DIR}/model_zoo/data_readers/data_reader_imagenet.prototext --data_filedir_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/train/ --data_filename_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/train.txt --data_filedir_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/val/ --data_filename_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/val.txt +srun --nodes=128 --ntasks-per-node=2 ${LBANN_DIR}/bamboo/compiler_tests/builds/catalyst_gcc-7.1.0_x86_64_mvapich2-2.2_openblas_rel/build/model_zoo/lbann --model=${LBANN_DIR}/model_zoo/models/alexnet/model_alexnet.prototext --optimizer=${LBANN_DIR}/model_zoo/optimizers/opt_sgd.prototext --reader=${LBANN_DIR}/model_zoo/data_readers/data_reader_imagenet.prototext --data_filedir_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/train/ --data_filename_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/train.txt --data_filedir_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/val/ --data_filename_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/val.txt From db974988e9fb3b2a635a3284adf0562732f52d47 Mon Sep 17 00:00:00 2001 From: graham63 <50850420+graham63@users.noreply.github.com> Date: Fri, 21 Jun 2019 15:22:31 -0700 Subject: [PATCH 104/634] Add tb image support (#1086) * New preprocessing pipeline (#1014) * Bump OpenCV to 4.1.0; 
drop highgui and old version support. * Kill the mnist_siamese data reader. * Kill the imagenet_patches data reader. * Kill unneeded tests. * Kill patch processing. * Kill ancient image preprocessor. * Kill image_utils. Will be restored; temporarily breaks things. * Kill instantiation of data_reader_multi_images. This is now an ABC of triplet and can be refactored out later. * Kill old preprocessing pipeline. * Kill old preprocessing pipeline prototext. * Do not need to replace old preprocessing pipeline in jag_conduit. * Add OpenCV utilities. * Fix issues with OpenCV/old preproc pipeline removal. * Add image_utils, for loading/saving images. * Restore save_image. * Add initial version of new preprocessing pipeline. * Add new preprocessing pipeline to prototext. * Add transform pipeline to data reader. * Fix bug in to_lbann_layout. * Add fused normalize/to LBANN layout transform. * Support non-in-place normalize. * Restore preprocessing for MNIST and CIFAR10 data readers. * Restore loading/preprocessing to ImageNet/triplet/multihead siamese data readers. * Update data reader prototexts. ImageNet data reader changes to better match proper normalization. * Remove unneeded preprocessing for JAG reader. * Add debug-mode argument checking to fast_rand_int. * Fix off-by-1 that could lead to infinite loops. * Move setting expected output dims to image data reader. This fixes a memory corruption issue caused by multiple threads setting the same thing. * Kill unused lbann_data_generator.cpp * Make non-contiguous LBANN matrices error out. * Make transform random number helpers static. * random_resized_crop -> random_resized_crop_with_fixed_aspect_ratio; random_resized_aspect_ratio_crop -> random_resized_crop. * Clarify to_lbann_layout rescaling. * Rename image_utils.cpp/hpp to image.cpp/hpp. * Rename opencv_utils.hpp to opencv.hpp. * Fix unit test. * Fix return type for get_linearized_size. * Remove unused save_image method. * Remove unused methods in ImageNet data reader. * Fixed-size crops throw an exception if input image is too small. * Fix bug in random_crop where wrong dimension was used. * Update some docs/comments. * Remove unneeded implementations of transform_pipeline move operator/constructor. * Updating JAG Conduit reader to work with new transform pipeline. Note that transformation from cv::Mat to CPUMat is currently failing in the to_lbann_layout transformation object. * Improve non-contiguous check. * Restore data store functionality for ImageNet data reader. Note I haven't actually tested this. * Use I/O generator for transform RNGs. Fixes issue where transforms done by different I/O threads would have the same random number sequence. * Address @benson31's comments on image.hpp/cpp. * Updated the JAG data reader to use a simple transform to repack the JAG image data from HDF5/Conduit HWC to CHW format. Then used the scale and translate transformation to divide each channel by the average and add the channel offset values for normalization. Added classes for both the HWC to CHW and scale and translate transformations, * Cleaned up dead code. * Added an explicit move for the create_datum_views to ensure that the contents of the view are not deep copied. Also, removed some of the dump_outputs callback from the test code. * Updated the JAG data reader and repack transform layer to use DataType format internally. The JAG data reader will now cast the images from ch_t to DataType during ingestion. 
* Templated sample list (#919)

This PR splits the JAG-specific sample list into a set of classes that can be generalized to other data types. Specifically, it removes both JAG- and Conduit-specific details from the base class in the hierarchy. (An illustrative skeleton of this layering follows this patch description.)

* The base class sample_list, being the most generic, can be applied to imagenet and has no component referencing conduit. In fact, it does not manage any open file handle. It is intended for data sets that have a single sample per file.
* sample_list_open_files inherits the base class, which itself is an abstract class. It is designed to handle data that requires tracking file handles during execution. One reason for this class is data sets that have multiple samples per file. It currently has two derived classes: sample_list_hdf5 and sample_list_conduit_io_handle.
* sample_list_hdf5 implements a concrete class that can read JAG data from HDF5 files into the internal conduit format.
* sample_list_conduit_io_handle uses a more abstract interface to access the JAG data in an HDF5 file.

----

* change the file name of sample_list_jag.hpp and sample_list_jag_impl.hpp to sample_list.hpp and sample_list_impl.hpp
* class name change from sample_list_jag to sample_list
* convert sample_list class to use template parameters
* change the literal assignment to use the correct type of rvalue
* remove sample_file_id_t from template parameter list
* add to_sample_name_t(string) function for native numeric types
* add file_handle_t to template parameter list, and move member method implementations from header to implementation file.
* preparation for general file handle type:
  - add file_handle_t to the template parameter list of sample_list.
  - remove hdf5 and conduit from member function names and local variable names.
  - make member functions that call conduit interfaces virtual.
  - make member functions that modify file handles virtual.
  - move member function implementations from the header to the implementation file.
* separate hdf5 specifics from the generic sample_list into a derived class
* wrap only the minimal portions of the code as virtual methods, and put the rest back into the base class. As a result, the accessors to the private members are no longer needed and removed.
* move the inclusion of headers relevant to conduit and hdf5 from the base sample_list to sample_list_hdf5
* remove file_handle_t from the template parameter list of sample_list_hdf5 and make it inherit sample_list
* update copy_member such that it avoids copying the data that will be cleared without being used. update the destructor
* added the general base class for the sample list, with which no open file handle is managed.
* added the intermediate sample list class with open file handle management
* make the methods that are not fully defined in the intermediate class pure virtual
* make the base class of sample list dependent on the template parameter for the sample name type, and automatically assign the sample name in case its type is size_t (or integral for C++17) or string
* generalize uninitialized_sample_name() a bit more for C++17
* fix write_header to print out correct number of samples. add a virtual method to return the number of sample files
* fix write_header() to print out correct number of excluded samples
* Fixed the error handling function.
* Add conduit include paths to wrapper interface target
* Added function to add image to event files for tensorboard
* Added function to convert images from El::Matrix<DataType> to El::Matrix<uint8_t>, and a second function to encode into std::string.
* clean up merge artifacts
* Fix an issue where I can't read diffs
* Removed unnecessary instances of lbann::, refactored to remove duplicate code, replaced by function call.
* remove duplicate definitions and remove lbann namespace resolution
* Updated 'static_cast<uint8_t>(std::round(norm_img_val) * 255);' to 'static_cast<uint8_t>(std::min(std::floor(norm_img_val) * 256, DataType(255)));'
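The sample-list layering sketched below is an illustrative reconstruction from the bullets above, not the actual LBANN headers; the template parameters, method names, and members are stand-ins chosen to mirror the description.

    #include <cstddef>
    #include <string>
    #include <vector>

    // Base class: most generic, one sample per file, no open file handles.
    template <typename sample_name_t>
    class sample_list {
    public:
      virtual ~sample_list() = default;
      // Number of sample files backing the list.
      virtual std::size_t get_num_sample_files() const {
        return m_file_names.size();
      }
    protected:
      std::vector<std::string> m_file_names;
    };

    // Intermediate class: adds open-file-handle tracking for data sets with
    // multiple samples per file. Handle operations are format-specific, so
    // they stay pure virtual here.
    template <typename sample_name_t, typename file_handle_t>
    class sample_list_open_files : public sample_list<sample_name_t> {
    public:
      virtual file_handle_t open_file_handle(const std::string& path) = 0;
      virtual void close_file_handle(file_handle_t handle) = 0;
    };

    // Concrete leaves (omitted): sample_list_hdf5 reads JAG data from HDF5
    // directly; sample_list_conduit_io_handle goes through a more abstract
    // conduit IO handle interface.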
---
 CMakeLists.txt                |  2 +-
 external/TBinf/TBinf.cpp      | 20 +++++++++++++
 external/TBinf/TBinf.hpp      | 47 +++++++++++++++++++-----------
 include/lbann/utils/image.hpp | 24 +++++++++++++---
 src/utils/image.cpp           | 54 ++++++++++++++++++++++++-----------
 5 files changed, 109 insertions(+), 38 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d863a9204bf..4d596590ba7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -430,7 +430,7 @@ if (LBANN_WITH_CONDUIT)
       PROPERTY
       INTERFACE_INCLUDE_DIRECTORIES
       "${_conduit_include_dirs}")
-
+
     set_target_properties(conduit::conduit
       PROPERTIES
       INTERFACE_LINK_LIBRARIES
diff --git a/external/TBinf/TBinf.cpp b/external/TBinf/TBinf.cpp
index b92141f9bc5..90e9dabdd8e 100644
--- a/external/TBinf/TBinf.cpp
+++ b/external/TBinf/TBinf.cpp
@@ -68,6 +68,26 @@ void SummaryWriter::add_scalar(const std::string tag, float value,
   write_summary_event(s, step);
 }

+void SummaryWriter::add_image(const std::string& tag,
+                              std::string encoded_img,
+                              const std::vector<size_t>& dims,
+                              int64_t step){
+
+  auto s = std::unique_ptr<tensorflow::Summary>(new tensorflow::Summary());
+  tensorflow::Summary::Value *v = s->add_value();
+  v->set_tag(tag);
+  tensorflow::Summary_Image *img = v->mutable_image();
+  img->Clear();
+  img->set_colorspace(dims[0]);
+  img->set_height(dims[1]);
+  img->set_width(dims[2]);
+
+  img->set_encoded_image_string(std::move(encoded_img));
+
+  write_summary_event(s.release(), step);
+}
+
+
 void SummaryWriter::add_histogram(const std::string tag,
                                   std::vector<float>::const_iterator first,
                                   std::vector<float>::const_iterator last,
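A sketch of how calling code might exercise the new add_image entry point; the tag, shape, and step are invented, the std::vector<size_t> element type for dims follows the reconstruction above, and the encoded string is assumed to come from a JPEG encoder such as the encode_image helper added to the image utilities later in this patch.

    #include <cstddef>
    #include <cstdint>
    #include <string>
    #include <vector>
    #include "TBinf.hpp"  // include path assumed relative to external/TBinf

    void log_sample_image(TBinf::SummaryWriter& writer,
                          const std::string& encoded_jpeg, int64_t step) {
      // dims follow the {channels, height, width} convention implied by
      // set_colorspace/set_height/set_width above; values are hypothetical.
      const std::vector<std::size_t> dims = {3, 64, 64};
      writer.add_image("images/sample", encoded_jpeg, dims, step);
    }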
diff --git a/external/TBinf/TBinf.hpp b/external/TBinf/TBinf.hpp
index 0a11937da71..7b87fee53b1 100644
--- a/external/TBinf/TBinf.hpp
+++ b/external/TBinf/TBinf.hpp
@@ -39,27 +39,40 @@ namespace TBinf {

 /**
- * Write data to a Tensorboard logging directory.
- * This writes data in the same format as Tensorflow does.
+ * @brief Write data to Tensorboard logging directory in Tensorflow format.
  */
 class SummaryWriter {
 public:
   /**
-   * Create a new event file in logdir to write to.
+   * @brief Create a new event file in logdir to write to.
    * @param logdir The directory where the event file will be written.
    */
   SummaryWriter(const std::string logdir);
   ~SummaryWriter();

   /**
-   * Add a scalar value to the event file.
+   * @brief Add a scalar value to the event file.
    * @param tag The tag for this summary.
    * @param value The scalar value.
    * @param step Optional global step.
    */
   void add_scalar(const std::string tag, float value, int64_t step = -1);
+
+  /**
+   * @brief Add an image to the event file.
+   * @param tag The tag for this summary.
+   * @param encoded_img The image to be written.
+   * @param dims The dimensions of the image.
+   * @param step Optional global step.
+   */
+
+  void add_image(const std::string& tag,
+                 std::string encoded_img,
+                 const std::vector<size_t>& dims,
+                 int64_t step = -1);
+
   /**
-   * Add a histogram of values to the event file.
+   * @brief Add a histogram of values to the event file.
    * @param tag The tag for this summary.
    * @param first Iterator to the first value to add.
    * @param last Iterator past the last value to add.
@@ -70,7 +83,7 @@ class SummaryWriter {
                      std::vector<float>::const_iterator last,
                      int64_t step = -1);
   /**
-   * Add a histogram based upon buckets to the event file.
+   * @brief Add a histogram based upon buckets to the event file.
    * @param tag The tag for this summary.
    * @param buckets The histogram buckets.
    * @param min The minimum value in the dataset.
@@ -85,44 +98,44 @@ class SummaryWriter {
                      double min, double max, double num,
                      double sum, double sqsum, int64_t step = -1);

-  /** Return the current histogram buckets. */
+  /** @brief Return the current histogram buckets. */
   const std::vector<double>& get_histogram_buckets() const;
-  /** Return the default histogram buckets. */
+  /** @brief Return the default histogram buckets. */
   static std::vector<double> get_default_histogram_buckets();

-  /** Ensure all events are written out. */
+  /** @brief Ensure all events are written out. */
   void flush();

 private:
   /**
-   * Write a summary to the event file.
+   * @brief Write a summary to the event file.
    * @param s The summary to write.
    * @param step Optional global step for the event.
    */
   void write_summary_event(tensorflow::Summary *s, int64_t step = -1);

   /**
-   * Write an event to the event file.
+   * @brief Write an event to the event file.
    * @param e The event to write.
    */
   void write_event(tensorflow::Event& e);

-  /** Get current wall time in fractional seconds. */
+  /** @brief Get current wall time in fractional seconds. */
   double get_time_in_seconds();

-  /** Initialize histogram buckets. */
+  /** @brief Initialize histogram buckets. */
   void init_histogram_buckets();

-  /** Current event version. */
+  /** @brief Current event version. */
   static constexpr const char *EVENT_VERSION = "brain.Event:2";

-  /** Filename to write to. */
+  /** @brief Filename to write to. */
   std::string filename;
-  /** File stream for writing. */
+  /** @brief File stream for writing. */
   std::fstream file;

-  /** Current histogram buckets. */
+  /** @brief Current histogram buckets. */
  std::vector<double> histogram_buckets;
 };
This is in standard LBANN format, and will be * converted to a uint8_t matrix, interpolating between the min and max values @@ -69,6 +69,22 @@ void save_image(const std::string& filename, El::Matrix& src, */ void save_image(const std::string& filename, const CPUMat& src, const std::vector& dims); +/** + * @brief Convert image from El::Matrix to El::Matrix + * @param image The image to convert. + * @param dims The dimensions of the image. + * @returns El::Matrix Returns image in El::Matrix format + */ +El::Matrix get_uint8_t_image(const CPUMat& image, + const std::vector& dims); +/** + * @brief Encodes image to std:string format + * @param image The image to convert + * @param dims The dimensions of the image. + * @returns std::string Returns image in std::string format + */ +std::string encode_image(const El::Matrix& image, + const std::vector& dims); } // namespace lbann diff --git a/src/utils/image.cpp b/src/utils/image.cpp index f90baae8f3c..4631bdf1d87 100644 --- a/src/utils/image.cpp +++ b/src/utils/image.cpp @@ -31,6 +31,8 @@ #include "lbann/utils/exception.hpp" #include "lbann/utils/opencv.hpp" +namespace lbann { + namespace { // Read filename into buf. @@ -135,7 +137,7 @@ void opencv_decode(El::Matrix& buf, El::Matrix& dst, std::vector& dims, const std::string filename) { const size_t encoded_size = buf.Height() * buf.Width(); std::vector buf_dims = {1, encoded_size, 1}; - cv::Mat cv_encoded = lbann::utils::get_opencv_mat(buf, buf_dims); + cv::Mat cv_encoded = utils::get_opencv_mat(buf, buf_dims); // Attempt to guess the decoded size. // Warning: These may be wrong. size_t height, width, channels; @@ -145,7 +147,7 @@ void opencv_decode(El::Matrix& buf, El::Matrix& dst, dst.Resize(height*width*channels, 1); std::vector guessed_dims = {channels, height, width}; // Decode the image. - cv::Mat cv_dst = lbann::utils::get_opencv_mat(dst, guessed_dims); + cv::Mat cv_dst = utils::get_opencv_mat(dst, guessed_dims); cv::Mat real_decoded = cv::imdecode(cv_encoded, cv::IMREAD_ANYCOLOR | cv::IMREAD_ANYDEPTH, &cv_dst); @@ -158,8 +160,8 @@ void opencv_decode(El::Matrix& buf, El::Matrix& dst, static_cast(real_decoded.cols)}; // If we did not guess the size right, need to copy. if (real_decoded.ptr() != dst.Buffer()) { - dst.Resize(lbann::utils::get_linearized_size(dims), 1); - cv_dst = lbann::utils::get_opencv_mat(dst, dims); + dst.Resize(utils::get_linearized_size(dims), 1); + cv_dst = utils::get_opencv_mat(dst, dims); real_decoded.copyTo(cv_dst); } } else { @@ -172,16 +174,14 @@ void opencv_decode(El::Matrix& buf, El::Matrix& dst, static_cast(decoded.rows), static_cast(decoded.cols)}; // Copy to dst. - dst.Resize(lbann::utils::get_linearized_size(dims), 1); - cv::Mat cv_dst = lbann::utils::get_opencv_mat(dst, dims); + dst.Resize(utils::get_linearized_size(dims), 1); + cv::Mat cv_dst = utils::get_opencv_mat(dst, dims); decoded.copyTo(cv_dst); } } } // anonymous namespace -namespace lbann { - void load_image(const std::string& filename, El::Matrix& dst, std::vector& dims) { // Load the encoded image. @@ -209,33 +209,55 @@ void save_image(const std::string& filename, const CPUMat& src, if (dims.size() != 3 || (dims[0] != 1 && dims[0] != 3)) { LBANN_ERROR("Unsupported dimensions for saving an image."); } + + El::Matrix cv_mat = get_uint8_t_image(src, dims); + + save_image(filename, cv_mat, dims); +} + +El::Matrix get_uint8_t_image(const CPUMat& image, + const std::vector& dims) +{ // Need to convert to uint8_t matrix in OpenCV format. // We will normalize to [0, 1], then map to [0, 255]. 
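// A worked example of the quantization applied below, assuming the
// normalized value v lies in [0, 1] and that the intended grouping is
// floor(v * 256) clamped to 255:
//
//   uint8_t quantize(DataType v) {
//     return static_cast<uint8_t>(
//       std::min(std::floor(v * DataType(256)), DataType(255)));
//   }
//
//   v = 0.0   -> floor(0.0)     -> 0
//   v = 0.5   -> floor(128.0)   -> 128
//   v = 0.999 -> floor(255.744) -> 255
//   v = 1.0   -> min(256, 255)  -> 255
//
// A multiplier of 256 with a clamp at 255 yields 256 equal-width buckets;
// the previous truncation of v * 255 made the top value reachable only at
// exactly v == 1.0.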
const size_t size = utils::get_linearized_size(dims); El::Matrix cv_mat = El::Matrix(size, 1); // Find the minimum and maximum to normalize with. - const DataType* __restrict__ src_buf = src.LockedBuffer(); + const DataType* __restrict__ img_buf = image.LockedBuffer(); DataType min = std::numeric_limits::max(); DataType max = std::numeric_limits::lowest(); for (size_t i = 0; i < size; ++i) { - min = std::min(min, src_buf[i]); - max = std::max(max, src_buf[i]); + min = std::min(min, img_buf[i]); + max = std::max(max, img_buf[i]); } const DataType norm_denom = max - min; // Construct the OpenCV buffer. uint8_t* __restrict__ cv_buf = cv_mat.Buffer(); for (size_t channel = 0; channel < dims[0]; ++channel) { - const size_t src_offset = channel*dims[1]*dims[2]; + const size_t img_offset = channel*dims[1]*dims[2]; for (size_t col = 0; col < dims[2]; ++col) { for (size_t row = 0; row < dims[1]; ++row) { - const DataType norm_src_val = - (src_buf[src_offset + row + col*dims[1]] - min) / norm_denom; + const DataType norm_img_val = + (img_buf[img_offset + row + col*dims[1]] - min) / norm_denom; cv_buf[dims[0]*(col + row*dims[2]) + channel] = - static_cast(norm_src_val * 255); + static_cast(std::min(std::floor(norm_img_val) * 256, DataType(255))); } } } - save_image(filename, cv_mat, dims); + return cv_mat; +} + +std::string encode_image(const El::Matrix& image, + const std::vector& dims) +{ + cv::Mat Mat_img = utils::get_opencv_mat( + const_cast&>(image), dims); + std::vector encoded_img; + std::vector params = {cv::IMWRITE_JPEG_QUALITY, 20}; + + cv::imencode(".jpg", Mat_img, encoded_img, params); + + return std::string{encoded_img.begin(), encoded_img.end()}; } } // namespace lbann From ae26ef780e157fe04694458a829321ed36230160 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 21 Jun 2019 17:59:31 -0700 Subject: [PATCH 105/634] Use shared memory for inter-process communication in Python data reader (#1088) * Initial work toward using shared memory IPC in Python data reader. Compiles, but crashes at runtime. * Python data reader uses shared memory for IPC. Runs and gets 2x performance on my test problem. * Python data reader uses buffer protocol for copies. --- .../lbann/data_readers/data_reader_python.hpp | 37 ++++ src/data_readers/data_reader_python.cpp | 168 +++++++++++++++--- 2 files changed, 184 insertions(+), 21 deletions(-) diff --git a/include/lbann/data_readers/data_reader_python.hpp b/include/lbann/data_readers/data_reader_python.hpp index 35264a18e7b..ca289ff4770 100644 --- a/include/lbann/data_readers/data_reader_python.hpp +++ b/include/lbann/data_readers/data_reader_python.hpp @@ -161,11 +161,48 @@ class python_reader : public generic_data_reader { bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; private: + + /** @brief Dimensions of data sample tensor. */ std::vector m_sample_dims; + /** @brief Number of data samples in data set. */ El::Int m_num_samples; + + /** @brief User-provided Python function to access data samples. + * + * The function is expected to take one integer argument for the + * sample index. It must return an iterator that defines the + * entries in a data sample. + */ python::object m_sample_function; + + /** @brief Wrapper function around sample access function. + * + * This function will be executed on worker processes (see @c + * m_process_pool). It will obtain a data sample from @c + * m_sample_function and copy it into a @c m_shared_memory_array. + * + * @todo Performance optimizations for NumPy data. 
+ */ + python::object m_sample_function_wrapper; + + /** @brief Pool of worker processes. + * + * From the Python @c multiprocessing module. + */ python::object m_process_pool; + /** @brief Shared memory array. + * + * @c RawArray the Python @c multiprocessing module. + */ + python::object m_shared_memory_array; + + /** @brief Pointer into shared memory array. + * + * Points to buffer for @c m_shared_memory_array. + */ + DataType* m_shared_memory_array_ptr = nullptr; + }; } // namespace lbann diff --git a/src/data_readers/data_reader_python.cpp b/src/data_readers/data_reader_python.cpp index 13853fd0a65..ffa1c7d16e4 100644 --- a/src/data_readers/data_reader_python.cpp +++ b/src/data_readers/data_reader_python.cpp @@ -25,9 +25,11 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/data_readers/data_reader_python.hpp" +#include "lbann/models/model.hpp" #ifdef LBANN_HAS_PYTHON #include #include +#include namespace lbann { @@ -224,7 +226,7 @@ python_reader::python_reader(std::string module, } manager.check_error(); - // Get sample function + // Get sample access function m_sample_function = PyObject_GetAttrString(data_module, sample_function.c_str()); @@ -232,6 +234,7 @@ python_reader::python_reader(std::string module, python_reader::~python_reader() { if (Py_IsInitialized() && m_process_pool != nullptr) { + python::global_interpreter_lock gil(python::manager::get_instance()); PyObject_CallMethod(m_process_pool, "terminate", nullptr); } } @@ -266,32 +269,43 @@ bool python_reader::fetch_data_block(CPUMat& X, auto& manager = python::manager::get_instance(); python::global_interpreter_lock gil(manager); - // Get sample indices - python::object indices = PyList_New(0); + // Check that shared memory array is large enough + const El::Int sample_size = get_linearized_data_size(); + const El::Int array_size = PyObject_Length(m_shared_memory_array); + if (array_size < sample_size * mb_size) { + std::stringstream err; + err << "Python data reader attempted to load " + << sample_size * mb_size * sizeof(DataType) << " B " + << "into shared memory array, but only " + << array_size * sizeof(DataType) << " B is available"; + LBANN_ERROR(err.str()); + } + + // Get arguments for sample access function + python::object args_list = PyList_New(0); for (El::Int i = 0; i < mb_size; ++i) { - El::Int index = m_shuffled_indices[m_current_pos + i * m_sample_stride]; - PyList_Append(indices, python::object(index)); - indices_fetched.Set(i, 0, index); + El::Int sample_index = m_shuffled_indices[m_current_pos + i * m_sample_stride]; + El::Int array_offset = sample_size * i; + PyList_Append(args_list, + python::object(Py_BuildValue("(l,l)", + sample_index, + array_offset))); + indices_fetched.Set(i, 0, sample_index); } // Get samples using Python process pool python::object samples = PyObject_CallMethod(m_process_pool, - "map", + "starmap", "(O,O)", - m_sample_function.get(), - indices.get()); + m_sample_function_wrapper.get(), + args_list.get()); - // Extract sample entries from Python objects - const El::Int sample_size = get_linearized_data_size(); - samples = PyObject_GetIter(samples); - for (El::Int col = 0; col < mb_size; ++col) { - python::object sample = PyIter_Next(samples); - sample = PyObject_GetIter(sample); - for (El::Int row = 0; row < sample_size; ++row) { - python::object val = PyIter_Next(sample); - X(row, col) = PyFloat_AsDouble(val); - } - } + // Copy data from shared memory to output matrix + CPUMat shared_memory_matrix(sample_size, + mb_size, + 
m_shared_memory_array_ptr, + sample_size); + El::Copy(shared_memory_matrix, X); return true; } @@ -304,11 +318,123 @@ void python_reader::setup(int num_io_threads, std::shared_ptr io_thread_pool) { generic_data_reader::setup(num_io_threads, io_thread_pool); - // Initialize Python process pool + // Acquire Python GIL auto& manager = python::manager::get_instance(); python::global_interpreter_lock gil(manager); + + // Import modules + python::object main_module = PyImport_ImportModule("__main__"); + python::object ctypes_module = PyImport_ImportModule("ctypes"); python::object multiprocessing_module = PyImport_ImportModule("multiprocessing"); + + // Stop process pool if needed + if (m_process_pool != nullptr) { + PyObject_CallMethod(m_process_pool, "terminate", nullptr); + m_process_pool = nullptr; + } + + // Allocate shared memory array + /// @todo Figure out more robust way to get max mini-batch size + const El::Int sample_size = get_linearized_data_size(); + const El::Int mini_batch_size + = generic_data_reader::get_model()->get_max_mini_batch_size(); + std::string datatype_typecode; + switch (sizeof(DataType)) { + case 4: datatype_typecode = "f"; break; + case 8: datatype_typecode = "d"; break; + default: LBANN_ERROR("invalid data type for Python data reader " + "(only float and double are supported)"); + } + m_shared_memory_array + = PyObject_CallMethod(multiprocessing_module, + "RawArray", + "(s, l)", + datatype_typecode.c_str(), + sample_size * mini_batch_size); + + // Get address of shared memory buffer + python::object shared_memory_ptr + = PyObject_CallMethod(ctypes_module, + "addressof", + "(O)", + m_shared_memory_array.get()); + m_shared_memory_array_ptr + = reinterpret_cast(PyLong_AsLong(shared_memory_ptr)); + + // Create global variables in Python + // Note: The static counter makes sure variable names are unique. + static El::Int instance_id = 0; + instance_id++; + const std::string sample_func_name + = ("_DATA_READER_PYTHON_CPP_sample_function_wrapper" + + std::to_string(instance_id)); + PyObject_SetAttrString(main_module, + sample_func_name.c_str(), + m_sample_function); + manager.check_error(); + const std::string shared_array_name + = ("_DATA_READER_PYTHON_CPP_shared_memory_array" + + std::to_string(instance_id)); + PyObject_SetAttrString(main_module, + shared_array_name.c_str(), + m_shared_memory_array); + manager.check_error(); + + // Create wrapper around sample function + // Note: We attempt accessing the sample with the buffer protocol + // since they can be copied more efficiently. If this fails, we just + // iterate through the sample entries. + /// @todo Handle multi-dimensional NumPy arrays. + const std::string wrapper_func_name + = ("_DATA_READER_PYTHON_CPP_sample_function" + + std::to_string(instance_id)); + std::string wrapper_func_def = R"( +def @wrapper_func@(sample_index, array_offset): + """Get data sample and copy to shared memory array.""" + + # Get sample + sample = @sample_func@(sample_index) + + # Copy entries from sample to shared memory array + # Note: We attempt to copy via the buffer protocol since it is + # much more efficient than naively looping through the arrays. + try: + # Note: ctypes arrays explicitly specify their endianness, but + # memoryview copies only work when the endianness is + # explicitly set to the system default. We need to do some + # type casting to get around this excessive error checking. 
+ input_buffer = memoryview(sample) + output_buffer = memoryview(@shared_array@) + output_buffer = output_buffer[array_offset:array_offset+@sample_size@] + output_buffer = output_buffer.cast('B').cast('@datatype_typecode@') + output_buffer[:] = input_buffer + except: + for i, val in enumerate(sample): + @shared_array@[i + array_offset] = val +)"; + wrapper_func_def = std::regex_replace(wrapper_func_def, + std::regex("\\@wrapper_func\\@"), + wrapper_func_name); + wrapper_func_def = std::regex_replace(wrapper_func_def, + std::regex("\\@sample_func\\@"), + sample_func_name); + wrapper_func_def = std::regex_replace(wrapper_func_def, + std::regex("\\@shared_array\\@"), + shared_array_name); + wrapper_func_def = std::regex_replace(wrapper_func_def, + std::regex("\\@sample_size\\@"), + std::to_string(sample_size)); + wrapper_func_def = std::regex_replace(wrapper_func_def, + std::regex("\\@datatype_typecode\\@"), + datatype_typecode); + PyRun_SimpleString(wrapper_func_def.c_str()); + manager.check_error(); + m_sample_function_wrapper + = PyObject_GetAttrString(main_module, + wrapper_func_name.c_str()); + + // Start Python process pool m_process_pool = PyObject_CallMethod(multiprocessing_module, "Pool", "(L)", num_io_threads); From 683d794c851d04b55cf8d059fbf2a5d877d0d525 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Mon, 24 Jun 2019 10:09:04 -0700 Subject: [PATCH 106/634] working version: preload mode --- src/data_readers/data_reader.cpp | 24 +++++------- src/data_readers/data_reader_image.cpp | 11 ++---- src/data_store/data_store_conduit.cpp | 54 +++++++++++++++++++++----- 3 files changed, 58 insertions(+), 31 deletions(-) diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index 5f55c76ba22..87f824fe287 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -737,20 +737,16 @@ void generic_data_reader::instantiate_data_store(const std::vector& local_l m_data_store->set_shuffled_indices(&m_shuffled_indices); // optionally preload the data store - if (opts->get_bool("preload_data_store")) { - //TODO: future development: preloading when using store as local cache - if (!opts->get_bool("data_store_cache")) { - - if(is_master()) { - std::cout << "generic_data_reader::instantiate_data_store - Starting the preload" << std::endl; - } - if (local_list_sizes.size() != 0) { - m_data_store->build_preloaded_owner_map(local_list_sizes); - } - preload_data_store(); - if(is_master()) { - std::cout << "preload complete" << std::endl; - } + if (opts->get_bool("preload_data_store") && !opts->get_bool("data_store_cache")) { + if(is_master()) { + std::cout << "generic_data_reader::instantiate_data_store - Starting the preload" << std::endl; + } + if (local_list_sizes.size() != 0) { + m_data_store->build_preloaded_owner_map(local_list_sizes); + } + preload_data_store(); + if(is_master()) { + std::cout << "preload complete" << std::endl; } } diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index 926cd553222..126bd085737 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -139,21 +139,16 @@ bool image_data_reader::fetch_label(CPUMat& Y, int data_id, int mb_idx) { } void image_data_reader::load() { - //const std::string imageDir = get_file_dir(); - const std::string imageListFile = get_data_filename(); - options *opts = options::get(); - m_image_list.clear(); + const std::string imageListFile = get_data_filename(); // load image list + m_image_list.clear(); FILE 
*fplist = fopen(imageListFile.c_str(), "rt"); if (!fplist) { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: failed to open: " + imageListFile); + LBANN_ERROR("failed to open: " + imageListFile + " for reading"); } - while (!feof(fplist)) { char imagepath[512]; label_t imagelabel; diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 7106b5e430b..cd9629c817c 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -36,9 +36,34 @@ #include #include #include +#include namespace lbann { +// Macro to throw an LBANN exception +#undef LBANN_ERROR +#define LBANN_ERROR(message) \ + do { \ + std::stringstream ss_LBANN_ERROR; \ + ss_LBANN_ERROR << "LBANN error "; \ + const int rank_LBANN_ERROR = lbann::get_rank_in_world(); \ + if (rank_LBANN_ERROR >= 0) { \ + ss_LBANN_ERROR << "on rank " << rank_LBANN_ERROR << " "; \ + } \ + ss_LBANN_ERROR << "(" << __FILE__ << ":" << __LINE__ << ")" \ + << ": " << (message); \ + if (errno) { \ + ss_LBANN_ERROR << "\nerrno: " << errno << " msg: " \ + << strerror(errno); \ + } \ + if (m_output) { \ + m_output << "ERROR: " << ss_LBANN_ERROR.str() \ + << std::endl; \ + m_output.close(); \ + } \ + throw lbann::exception(ss_LBANN_ERROR.str()); \ + } while (0) + data_store_conduit::data_store_conduit( generic_data_reader *reader) : m_n(0), @@ -196,7 +221,10 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: } void data_store_conduit::setup(int mini_batch_size) { + double tm1 = get_time(); + if (m_world_master) { + std::cout << "starting data_store_conduit::setup() for role: " << m_reader->get_role() << "\n"; if (m_super_node) { std::cout << "data store mode: exchange_data via super nodes\n"; } else { @@ -204,13 +232,7 @@ void data_store_conduit::setup(int mini_batch_size) { } } - double tm1 = get_time(); - if (m_world_master && !m_preload) { - std::cout << "starting data_store_conduit::setup() for role: " << m_reader->get_role() << "\n"; - } - if (!m_preload) { - // generic_data_store::setup(mini_batch_size); build_owner_map(mini_batch_size); } else { m_owner_map_mb_size = mini_batch_size; @@ -1113,10 +1135,17 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map &si if (shm_fd == -1) { LBANN_ERROR("shm_open failed"); } + #if 0 int v = ftruncate(shm_fd, size); if (v != 0) { - LBANN_ERROR("ftruncate failed"); + struct stat b; + int sanity = fstat(shm_fd, &b); + if (sanity != 0) { + LBANN_ERROR("ftruncate failed, and fstat failed"); + } + LBANN_ERROR("ftruncate failed; file size: " + std::to_string(b.st_size) + " bytes; requested size: " + std::to_string(size)); } + #endif m_mem_seg = mmap(0, size, PROT_READ, MAP_SHARED, shm_fd, 0); if (*(int*)m_mem_seg == -1) { LBANN_ERROR("mmap failed"); @@ -1144,11 +1173,16 @@ void data_store_conduit::preload_local_cache() { } std::vector work; + if (m_world_master) { std::cerr << "calling read_files\n"; } read_files(work, file_sizes, indices[m_rank_in_trainer]); - + if (m_world_master) { std::cerr << "calling allocate_shared_segment\n"; } allocate_shared_segment(file_sizes, indices); + if (m_world_master) { std::cerr << "calling compute_image_offsets\n"; } compute_image_offsets(file_sizes, indices); + if (m_output) m_output << "calling exchange_images\n"; + if (m_world_master) { std::cerr << "calling exchange_images\n"; } exchange_images(work, file_sizes, indices); + if (m_world_master) { std::cerr << "calling build_conduit_nodes\n"; } 
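// Outline of the local-cache preload sequence above and below, for
// readability:
//   1. get_image_sizes()         - ranks all-gather per-sample byte counts
//   2. read_files()              - each rank reads only its own shard from disk
//   3. allocate_shared_segment() - node rank 0 creates one POSIX shared-memory
//                                  segment sized to hold every image
//   4. compute_image_offsets()   - fixes each sample's byte offset within the
//                                  segment
//   5. exchange_images()         - shards are broadcast across the trainer and
//                                  copied into the segment by node rank 0
//   6. build_conduit_nodes()     - conduit Nodes are assembled for the cached
//                                  samples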
build_conduit_nodes(file_sizes); } @@ -1196,6 +1230,7 @@ void data_store_conduit::exchange_images(std::vector &work, std::unordered for (int p=0; ptrainer_broadcast(p, work.data(), work.size()); + std::cerr << "data_store_conduit::exchange_images, P_" + std::to_string(m_rank_in_trainer) + " is bcasting size: " + std::to_string(work.size()) << std::endl; if (node_rank == 0) { fillin_shared_images(work, image_sizes, indices[p]); } @@ -1205,7 +1240,8 @@ void data_store_conduit::exchange_images(std::vector &work, std::unordered sz += image_sizes[idx]; } work2.resize(sz); - m_comm->trainer_broadcast(p, work2.data(), work.size()); + std::cerr << "data_store_conduit::exchange_images, P_" + std::to_string(m_rank_in_trainer) + " is receiving bcast of size: " + std::to_string(sz) << std::endl; + m_comm->trainer_broadcast(p, work2.data(), sz); if (node_rank == 0) { fillin_shared_images(work2, image_sizes, indices[p]); } From 350b1086d6aa7b1371ca83441116fa0c2bd3d553 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 25 Jun 2019 17:06:34 -0700 Subject: [PATCH 107/634] Launcher in Python frontend returns LBANN exit status. (#1096) --- python/lbann/contrib/lc/launcher.py | 32 ++++++++--------- python/lbann/launcher/__init__.py | 54 ++++++++++++++++------------- python/lbann/launcher/lsf.py | 11 +++++- python/lbann/launcher/slurm.py | 11 +++++- 4 files changed, 66 insertions(+), 42 deletions(-) diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index c96f1b3a8ef..d7c30645d41 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -66,19 +66,19 @@ def run(model, data_reader, optimizer, environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = 2 # Run LBANN - lbann.launcher.run(model, data_reader, optimizer, - lbann_exe=lbann_exe, - lbann_args=lbann_args, - experiment_dir=experiment_dir, - nodes=nodes, - procs_per_node=procs_per_node, - time_limit=time_limit, - scheduler=scheduler, - job_name=job_name, - system=system, - partition=partition, - account=account, - reservation=reservation, - launcher_args=launcher_args, - environment=environment, - setup_only=setup_only) + return lbann.launcher.run(model, data_reader, optimizer, + lbann_exe=lbann_exe, + lbann_args=lbann_args, + experiment_dir=experiment_dir, + nodes=nodes, + procs_per_node=procs_per_node, + time_limit=time_limit, + scheduler=scheduler, + job_name=job_name, + system=system, + partition=partition, + account=account, + reservation=reservation, + launcher_args=launcher_args, + environment=environment, + setup_only=setup_only) diff --git a/python/lbann/launcher/__init__.py b/python/lbann/launcher/__init__.py index 35ecd09e7ee..a3579d71ba0 100644 --- a/python/lbann/launcher/__init__.py +++ b/python/lbann/launcher/__init__.py @@ -65,6 +65,12 @@ def run(model, data_reader, optimizer, setup_only (bool, optional): If true, the experiment is not run after the experiment directory is initialized. + Returns: + int: Exit status from scheduler. This is really only + meaningful if LBANN is run on an existing node + allocation. If a batch job is submitted, the scheduler + will probably return 0 trivially. 
+ """ # Construct experiment directory if needed @@ -95,31 +101,31 @@ def run(model, data_reader, optimizer, # Run experiment if scheduler.lower() in ('slurm', 'srun', 'sbatch'): - slurm.run(experiment_dir=experiment_dir, - command='{} {}'.format(lbann_exe, lbann_args), - nodes=nodes, - procs_per_node=procs_per_node, - time_limit=time_limit, - job_name=job_name, - partition=partition, - account=account, - reservation=reservation, - srun_args=launcher_args, - environment=environment, - setup_only=setup_only) + return slurm.run(experiment_dir=experiment_dir, + command='{} {}'.format(lbann_exe, lbann_args), + nodes=nodes, + procs_per_node=procs_per_node, + time_limit=time_limit, + job_name=job_name, + partition=partition, + account=account, + reservation=reservation, + srun_args=launcher_args, + environment=environment, + setup_only=setup_only) elif scheduler.lower() in ('lsf', 'jsrun', 'bsub'): - lsf.run(experiment_dir=experiment_dir, - command='{} {}'.format(lbann_exe, lbann_args), - nodes=nodes, - procs_per_node=procs_per_node, - time_limit=time_limit, - job_name=job_name, - partition=partition, - account=account, - reservation=reservation, - jsrun_args=launcher_args, - environment=environment, - setup_only=setup_only) + return lsf.run(experiment_dir=experiment_dir, + command='{} {}'.format(lbann_exe, lbann_args), + nodes=nodes, + procs_per_node=procs_per_node, + time_limit=time_limit, + job_name=job_name, + partition=partition, + account=account, + reservation=reservation, + jsrun_args=launcher_args, + environment=environment, + setup_only=setup_only) else: raise RuntimeError('unsupported job scheduler ({})' .format(scheduler)) diff --git a/python/lbann/launcher/lsf.py b/python/lbann/launcher/lsf.py index e24c2846a10..cac6830e7a7 100644 --- a/python/lbann/launcher/lsf.py +++ b/python/lbann/launcher/lsf.py @@ -42,6 +42,12 @@ def run(command, setup_only (bool, optional): If true, the experiment is not run after the batch script is created. + Returns: + int: Exit status from LSF. This is really only meaningful if + the script is run on an existing node allocation. If a + batch job is submitted, LSF will probably return 0 + trivially. + """ # Check for an existing job allocation. # Note: Settings for existing allocations take precedence. @@ -104,7 +110,9 @@ def run(command, os.chmod(batch_file, 0o755) # Launch if needed. - if not setup_only: + if setup_only: + return 0 + else: if has_allocation: run_proc = subprocess.Popen(['sh', batch_file], stdout=subprocess.PIPE, @@ -128,3 +136,4 @@ def run(command, run_proc.wait() out_proc.wait() err_proc.wait() + return run_proc.returncode diff --git a/python/lbann/launcher/slurm.py b/python/lbann/launcher/slurm.py index 55ca2b71882..e7253adc5df 100644 --- a/python/lbann/launcher/slurm.py +++ b/python/lbann/launcher/slurm.py @@ -40,6 +40,12 @@ def run(command, setup_only (bool, optional): If true, the experiment is not run after the batch script is created. + Returns: + int: Exit status from Slurm. This is really only meaningful if + the script is run on an existing node allocation. If a + batch job is submitted, Slurm will probably return 0 + trivially. 
+ """ # Check for an existing job allocation from Slurm @@ -111,7 +117,9 @@ def run(command, # Launch job if needed # Note: Pipes output to log files - if not setup_only: + if setup_only: + return 0 + else: run_exe = 'sh' if has_allocation else 'sbatch' run_proc = subprocess.Popen([run_exe, batch_file], stdout = subprocess.PIPE, @@ -128,3 +136,4 @@ def run(command, run_proc.wait() out_proc.wait() err_proc.wait() + return run_proc.returncode From cc8cf2edf9d6e4d678fb92ffcfc8be20e9bbecd2 Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Wed, 26 Jun 2019 17:26:31 -0700 Subject: [PATCH 108/634] remove a cpp file from the root directory that doesn't seem to be in any CMakeLists (#1087) --- test_numpy_conduit_cache.cpp | 68 ------------------------------------ 1 file changed, 68 deletions(-) delete mode 100644 test_numpy_conduit_cache.cpp diff --git a/test_numpy_conduit_cache.cpp b/test_numpy_conduit_cache.cpp deleted file mode 100644 index 79928889029..00000000000 --- a/test_numpy_conduit_cache.cpp +++ /dev/null @@ -1,68 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann_config.hpp" - -#ifdef LBANN_HAS_CONDUIT - -#include "conduit/conduit.hpp" -#include "conduit/conduit_relay.hpp" -#include "conduit/conduit_relay_io_hdf5.hpp" -#include -#include -#include -#include -#include -#include "lbann/lbann.hpp" -#include "lbann/utils/jag_utils.hpp" -#include "lbann/data_readers/numpy_conduit_cache.hpp" - -using namespace lbann; - -int main(int argc, char *argv[]) { - int random_seed = lbann_default_random_seed; - world_comm_ptr comm = initialize(argc, argv, random_seed); - bool master = comm->am_world_master(); - - try { - - numpy_conduit_cache n(comm.get()); - n.load("/g/g10/hysom/test.npz", 42); - - } catch (std::exception const &e) { - if (master) std::cerr << "caught exception: " << e.what() << "\n"; - return EXIT_FAILURE; - } catch (...) 
{ - std::cerr << "unknown exception in main\n"; - return EXIT_FAILURE; - } - - // Clean up - return EXIT_SUCCESS; -} - -#endif //#ifdef LBANN_HAS_CONDUIT From fc2066b381555cefb259dd9e7951e84dcf047161 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Wed, 26 Jun 2019 16:47:45 -0700 Subject: [PATCH 109/634] Test clean up --- bamboo/README.md | 3 ++- bamboo/allocate_and_run.sh | 6 +++--- docs/continuous_integration.rst | 8 +++++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/bamboo/README.md b/bamboo/README.md index 763df1443a0..ccb1813e878 100644 --- a/bamboo/README.md +++ b/bamboo/README.md @@ -1,2 +1,3 @@ Refer to `lbann/docs/continuous_integration.rst` -or "LBANN CI" on the [LBANN docs](http://software.llnl.gov/lbann/). +or "LBANN CI" on the [LBANN docs](http://software.llnl.gov/lbann/) - +specifically [LBANN CI docs](https://lbann.readthedocs.io/en/latest/continuous_integration.html). diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index 5cb5489f10f..23a01bc1ad8 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -34,9 +34,9 @@ if [ ${WEEKLY} -ne 0 ]; then salloc -N16 -t 900 ./run.sh --weekly if [ "${CLUSTER}" = 'catalyst' ]; then cd integration_tests - python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run - python -m pytest -s test_integration_performance_full_alexnet_gcc7 --weekly --run - python -m pytest -s test_integration_performance_full_alexnet_intel19 --weekly --run + python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=alexnet_clang6_results.xml + python -m pytest -s test_integration_performance_full_alexnet_gcc7 --weekly --run --junitxml=alexnet_gcc7_results.xml + # python -m pytest -s test_integration_performance_full_alexnet_intel19 --weekly --run --junitxml=alexnet_intel19_results.xml cd .. fi else diff --git a/docs/continuous_integration.rst b/docs/continuous_integration.rst index dc12100fedb..a3745596da9 100644 --- a/docs/continuous_integration.rst +++ b/docs/continuous_integration.rst @@ -97,10 +97,16 @@ your fork for commits. They do not run nightly. If you push new commits to your fork, a new build should start automatically. You can also manually start a build by navigating to your individual plan and -clicking Run > Run Plan. +clicking Run > Run plan +(this will say "Run branch" if you have plan branches set up). Once again, keep in mind that the tests will run off what has been pushed to your GitHub fork of LBANN and not your local copy of the LBANN repository. +Plan branches allow you to test multiple branches simultaneously instead +of simply testing "/develop". +You can create plan branches by navigating to your individual plan, +clicking Actions > Configure plan > Branches > Create plan branch. + Navigating Bamboo ---------------------------------------- From e5520fd480b68a2ddfc5d5e7399f23d45cd0b126 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Thu, 27 Jun 2019 14:36:24 -0700 Subject: [PATCH 110/634] pushing developmental changes --- .../lbann/data_store/data_store_conduit.hpp | 45 +++-- src/data_readers/data_reader_image.cpp | 3 - src/data_readers/data_reader_imagenet.cpp | 6 +- src/data_store/data_store_conduit.cpp | 178 ++++++++++++------ 4 files changed, 145 insertions(+), 87 deletions(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 0dc20c2d9fc..1033e638532 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -80,17 +80,6 @@ class data_store_conduit { void setup(int mini_batch_size); - /* - * dah - may be needed in the future, but not needed for bare-bones squashing - void set_is_subsidiary_store() { - m_is_subsidiary_store = true; - } - - bool is_subsidiary_store() const { - return m_is_subsidiary_store; - } - */ - void preload_local_cache(); void check_mem_capacity(lbann_comm *comm, const std::string sample_list_file, size_t stride, size_t offset); @@ -200,12 +189,6 @@ protected : /// set to true if data_store is being explicitly loaded bool m_explicit_loading; - /// maps an index to the processor that owns the associated data - mutable std::unordered_map m_owner; - - /// convenience handle - const std::vector *m_shuffled_indices; - /// The size of the mini-batch that was used to calculate ownership /// of samples when building the owner map. This size has to be /// used consistently when computing the indices that will be sent @@ -216,6 +199,12 @@ protected : /// exchange_data_by_sample; default if false bool m_super_node; + /// maps an index to the processor that owns the associated data + mutable std::unordered_map m_owner; + + /// convenience handle + const std::vector *m_shuffled_indices; + void exchange_data_by_super_node(size_t current_pos, size_t mb_size); void exchange_data_by_sample(size_t current_pos, size_t mb_size); @@ -293,27 +282,35 @@ protected : /// used in exchange_data_by_sample, when sample sizes are non-uniform bool m_have_sample_sizes; - /// fills in m_image_offsets; returns the segment size (which is the - /// sum of the file sizes). Currently only used for imagenet + /// Currently only used for imagenet. 
On return, 'sizes' maps a sample_id to image size, and indices[p] contains the sample_ids that P_p owns + /// for use in local cache mode void get_image_sizes(std::unordered_map &sizes, std::vector> &indices); - /// offset at which the raw image will be stored in a shared memory segment + /// offset at which the raw image will be stored in a shared memory segment; + /// for use in local cache mode std::unordered_map m_image_offsets; + /// fills in m_image_offsets for use in local cache mode void compute_image_offsets(std::unordered_map &sizes, std::vector> &indices); + /// for use in local cache mode void allocate_shared_segment(std::unordered_map &sizes, std::vector> &indices); + /// for use in local cache mode void read_files(std::vector &work, std::unordered_map &sizes, std::vector &indices); + /// for use in local cache mode void build_conduit_nodes(std::unordered_map &sizes); + /// for use in local cache mode void exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices); - void fillin_shared_images(const std::vector &images, const std::unordered_map &image_sizes, const std::vector &indices); - - void *m_mem_seg = 0; + /// for use in local cache mode + void fillin_shared_images(const std::vector &images, int offset); - const std::string m_seg_name = "our_town"; + /// for use in local cache mode + char *m_mem_seg = 0; + size_t m_mem_seg_length = 0; + std::string m_seg_name; }; } // namespace lbann diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index 126bd085737..f8a3071bc11 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -246,19 +246,16 @@ void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node & read_raw_data(filename, data); node[LBANN_DATA_ID_STR(data_id) + "/label"].set(label); node[LBANN_DATA_ID_STR(data_id) + "/buffer"].set(data); - //node[LBANN_DATA_ID_STR(data_id) + "/buffer"].set_char_ptr(data.data(), data.size()); node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"] = data.size(); } void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node &node) { node.reset(); const std::string filename = get_file_dir() + m_image_list[data_id].first; int label = m_image_list[data_id].second; - //std::vector data; std::vector data; read_raw_data(filename, data); node[LBANN_DATA_ID_STR(data_id) + "/label"].set(label); node[LBANN_DATA_ID_STR(data_id) + "/buffer"].set(data); - //node[LBANN_DATA_ID_STR(data_id) + "/buffer"].set_char_ptr(data.data(), data.size()); node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"] = data.size(); } diff --git a/src/data_readers/data_reader_imagenet.cpp b/src/data_readers/data_reader_imagenet.cpp index 426da295d1d..873fd7c85c4 100644 --- a/src/data_readers/data_reader_imagenet.cpp +++ b/src/data_readers/data_reader_imagenet.cpp @@ -99,8 +99,10 @@ bool imagenet_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) { El::Matrix encoded_image(size, 1, reinterpret_cast(buf), size); decode_image(encoded_image, image, dims); } - } else { - // Data store is not being used. + } + + // this block fires if not using data store + else { load_image(image_path, image, dims); } diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index cd9629c817c..e5c5a5377bb 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -95,19 +95,23 @@ data_store_conduit::data_store_conduit( ss << "debug_" << m_reader->get_role() << "." 
<< m_comm->get_rank_in_world(); m_output.open(ss.str().c_str()); if (m_world_master) { - std::cout << "opened " << ss.str() << " for writing\n"; + std::cerr << "opened " << ss.str() << " for writing\n"; } } m_is_local_cache = opts->get_bool("data_store_cache"); + m_preload = opts->get_bool("preload_data_store"); + if (m_is_local_cache && !m_preload) { + LBANN_ERROR("data_store_cache is currently only implemented for preload mode; this will change in the future. For now, please pass both flags: --data_store_cache and --preload_data_store"); + } if (m_world_master) { if (m_is_local_cache) { - std::cout << "data_store_conduit is running in local_cache mode\n"; + std::cerr << "data_store_conduit is running in local_cache mode\n"; } else if (m_super_node) { - std::cout << "data_store_conduit is running in super_node mode\n"; + std::cerr << "data_store_conduit is running in super_node mode\n"; } else { - std::cout << "data_store_conduit is running in multi-message mode\n"; + std::cerr << "data_store_conduit is running in multi-message mode\n"; } } } @@ -224,11 +228,11 @@ void data_store_conduit::setup(int mini_batch_size) { double tm1 = get_time(); if (m_world_master) { - std::cout << "starting data_store_conduit::setup() for role: " << m_reader->get_role() << "\n"; + std::cerr << "starting data_store_conduit::setup() for role: " << m_reader->get_role() << "\n"; if (m_super_node) { - std::cout << "data store mode: exchange_data via super nodes\n"; + std::cerr << "data store mode: exchange_data via super nodes\n"; } else { - std::cout << "data store mode: exchange_data via individual samples\n"; + std::cerr << "data store mode: exchange_data via individual samples\n"; } } @@ -240,12 +244,12 @@ m_is_setup = true; - if (m_is_local_cache) { + if (m_is_local_cache && m_preload) { preload_local_cache(); } if (m_world_master && !m_preload) { - std::cout << "TIME for data_store_conduit setup: " << get_time() - tm1 << "\n"; + std::cerr << "TIME for data_store_conduit setup: " << get_time() - tm1 << "\n"; } } @@ -425,6 +429,9 @@ void data_store_conduit::error_check_compacted_node(const conduit::Node &nd, int void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool already_have) { + if (m_is_local_cache && m_preload) { + LBANN_ERROR("you called data_store_conduit::set_conduit_node, but you're running in local cache mode with preloading; something is broken; please contact Dave Hysom"); + } m_mutex.lock(); if (already_have == false && m_data.find(data_id) != m_data.end()) { LBANN_ERROR("duplicate data_id: " + std::to_string(data_id) + " in data_store_conduit::set_conduit_node"); @@ -741,7 +748,7 @@ void data_store_conduit::build_preloaded_owner_map(const std::vector<int>& per_r } void data_store_conduit::build_owner_map(int mini_batch_size) { - if (m_world_master) std::cout << "starting data_store_conduit::build_owner_map for role: " << m_reader->get_role() << " with mini_batch_size: " << mini_batch_size << " num indices: " << m_shuffled_indices->size() << "\n"; + if (m_world_master) std::cerr << "starting data_store_conduit::build_owner_map for role: " << m_reader->get_role() << " with mini_batch_size: " << mini_batch_size << " num indices: " << m_shuffled_indices->size() << "\n"; if (mini_batch_size == 0) { LBANN_ERROR("mini_batch_size == 0; can't build owner_map"); } @@ -957,7 +964,7 @@ void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string double mem_this_proc = bytes_per_sample * my_sample_count; double
mem_this_node = mem_this_proc * procs_per_node; - std::cout + std::cerr << "\n" << "==============================================================\n" << "Estimated memory requirements for JAG samples:\n" @@ -968,12 +975,12 @@ void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string << "Total mem for all ranks on a node: " << mem_this_node << " kB\n" << "Available memory: " << a_mem << " kB (RAM only; not virtual)\n"; if (mem_this_node > static_cast(a_mem)) { - std::cout << "\nYOU DO NOT HAVE ENOUGH MEMORY\n" + std::cerr << "\nYOU DO NOT HAVE ENOUGH MEMORY\n" << "==============================================================\n\n"; LBANN_ERROR("insufficient memory to load data\n"); } else { double m = 100 * mem_this_node / a_mem; - std::cout << "Estimate that data will consume at least " << m << " % of memory\n" + std::cerr << "Estimate that data will consume at least " << m << " % of memory\n" << "==============================================================\n\n"; } } @@ -1071,13 +1078,17 @@ void data_store_conduit::get_image_sizes(std::unordered_map &file_sizes std::vector counts(m_np_in_trainer); m_comm->all_gather(&my_count, 1, counts.data(), 1, m_comm->get_trainer_comm()); - std::vector work(image_list.size()*2); + //counts[h*2] contains the image index + //counts[h*2+1] contains the image sizee + + //fill in displacement vector for gathering the actual image sizes std::vector disp(m_np_in_trainer + 1); disp[0] = 0; for (size_t h=0; h work(image_list.size()*2); m_comm->trainer_all_gather(my_image_sizes, work, counts, disp); indices.resize(m_np_in_trainer); for (int h=0; h &size int offset = 0; for (size_t p=0; p &si for (auto &&t : sizes) { size += t.second; } + m_mem_seg_length = size; + + //need to ensure name is unique across all data readers + m_seg_name = "our_town_" + m_reader->get_role(); int node_id = m_comm->get_rank_in_node(); if (node_id == 0) { - int shm_fd = shm_open(m_seg_name.c_str(), O_CREAT | O_RDWR, 0666); + int shm_fd = shm_open(m_seg_name.c_str(), O_CREAT | O_RDWR | O_EXCL, 0666); if (shm_fd == -1) { LBANN_ERROR("shm_open failed"); } @@ -1122,34 +1140,34 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map &si if (v != 0) { LBANN_ERROR("ftruncate failed"); } - m_mem_seg = mmap(0, size, PROT_WRITE, MAP_SHARED, shm_fd, 0); - if (*(int*)m_mem_seg == -1) { + void *m = mmap(0, size, PROT_WRITE, MAP_SHARED, shm_fd, 0); + if (*static_cast(m) == -1) { LBANN_ERROR("mmap failed"); } + m_mem_seg = reinterpret_cast(m); } m_comm->barrier(m_comm->get_node_comm()); if (node_id != 0) { - int shm_fd = shm_open(m_seg_name.c_str(), O_RDONLY, 0666); + int shm_fd = shm_open(m_seg_name.c_str(), O_RDONLY | O_EXCL, 0666); if (shm_fd == -1) { LBANN_ERROR("shm_open failed"); } - #if 0 - int v = ftruncate(shm_fd, size); - if (v != 0) { - struct stat b; - int sanity = fstat(shm_fd, &b); - if (sanity != 0) { - LBANN_ERROR("ftruncate failed, and fstat failed"); - } - LBANN_ERROR("ftruncate failed; file size: " + std::to_string(b.st_size) + " bytes; requested size: " + std::to_string(size)); - } - #endif - m_mem_seg = mmap(0, size, PROT_READ, MAP_SHARED, shm_fd, 0); - if (*(int*)m_mem_seg == -1) { + void *m = mmap(0, size, PROT_WRITE, MAP_SHARED, shm_fd, 0); + if (*static_cast(m) == -1) { LBANN_ERROR("mmap failed"); } + m_mem_seg = reinterpret_cast(m); + + struct stat b; + int sanity = fstat(shm_fd, &b); + if (sanity == -1) { + LBANN_ERROR("fstat failed"); + } + if (b.st_size != size) { + LBANN_ERROR("b.st_size= " + std::to_string(b.st_size) + " should be 
equal to " + std::to_string(size)); + } } } @@ -1158,31 +1176,56 @@ void data_store_conduit::preload_local_cache() { std::vector> indices; get_image_sizes(file_sizes, indices); - //debug block; will go away if (m_world_master) { - for (int h=0; h &idx = indices[h]; - std::cout << "P_"< work; - if (m_world_master) { std::cerr << "calling read_files\n"; } read_files(work, file_sizes, indices[m_rank_in_trainer]); - if (m_world_master) { std::cerr << "calling allocate_shared_segment\n"; } allocate_shared_segment(file_sizes, indices); - if (m_world_master) { std::cerr << "calling compute_image_offsets\n"; } compute_image_offsets(file_sizes, indices); - if (m_output) m_output << "calling exchange_images\n"; - if (m_world_master) { std::cerr << "calling exchange_images\n"; } exchange_images(work, file_sizes, indices); - if (m_world_master) { std::cerr << "calling build_conduit_nodes\n"; } + +#if 0 + if (m_world_master) { + //verify that images in shared segment are correct + image_data_reader *image_reader = dynamic_cast(m_reader); + const std::vector &image_list = image_reader->get_image_list(); + for (size_t h=0; hget_file_dir() + '/' + image_list[h].first; + std::cerr << "\nXX checking data_id " << h << " file: " << fn << "\n"; + std::ifstream in(fn, std::ios::in | std::ios::binary); + in.seekg(0, std::ios::end); + int n = in.tellg(); + in.seekg(0, std::ios::beg); + std::cerr << " XX file size: " << n << " from sizes map: " << file_sizes[h] << "\n"; + if (n != file_sizes[h]) { + LBANN_ERROR("n != sizes[h]"); + } + char *c = m_mem_seg + m_image_offsets[h]; + std::vector w(n); + in.read(w.data(), n); + in.close(); + for (int i=0; i &sizes) for (size_t idx=0; idx &images, const std::unordered_map &image_sizes, const std::vector &indices) { +void data_store_conduit::fillin_shared_images(const std::vector &images, int offset) { + if (m_world_master) std::cerr << "YY data_store_conduit::fillin_shared_images; offest: " << offset << " seg size: " << m_mem_seg_length << std::endl; + memcpy(m_mem_seg+offset, reinterpret_cast(images.data()), images.size()); } void data_store_conduit::exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices) { std::vector work2; int node_rank = m_comm->get_rank_in_node(); + int offset = 0; for (int p=0; ptrainer_broadcast(p, work.data(), work.size()); - std::cerr << "data_store_conduit::exchange_images, P_" + std::to_string(m_rank_in_trainer) + " is bcasting size: " + std::to_string(work.size()) << std::endl; if (node_rank == 0) { - fillin_shared_images(work, image_sizes, indices[p]); + fillin_shared_images(work, offset); } } else { int sz = 0; @@ -1240,13 +1285,30 @@ void data_store_conduit::exchange_images(std::vector &work, std::unordered sz += image_sizes[idx]; } work2.resize(sz); - std::cerr << "data_store_conduit::exchange_images, P_" + std::to_string(m_rank_in_trainer) + " is receiving bcast of size: " + std::to_string(sz) << std::endl; m_comm->trainer_broadcast(p, work2.data(), sz); if (node_rank == 0) { - fillin_shared_images(work2, image_sizes, indices[p]); + fillin_shared_images(work2, offset); } } + for (size_t r=0; rbarrier(m_comm->get_node_comm()); + +if (m_world_master) { + if (m_reader->get_role() == "train") { + FILE *f = fopen("xyz", "w"); + for (size_t j=0; j Date: Fri, 28 Jun 2019 09:46:08 -0700 Subject: [PATCH 111/634] final working version (I hope) --- .../lbann/data_store/data_store_conduit.hpp | 65 +++++----- src/data_readers/data_reader_imagenet.cpp | 2 +- src/data_store/data_store_conduit.cpp | 115 
+++++++++++------- 3 files changed, 105 insertions(+), 77 deletions(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 1033e638532..6f651dfc793 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -163,41 +163,52 @@ class data_store_conduit { protected : /// records the number of times exchange_mini_batch_data has been called - int m_n; + int m_n = 0; - bool m_is_setup; - - generic_data_reader *m_reader; - - lbann_comm *m_comm; - - /// rank in the trainer; convenience handle - int m_rank_in_trainer; - - /// number of procs in the trainer; convenience handle - int m_np_in_trainer; - - /// convenience handle - bool m_world_master; - - /// convenience handle - bool m_trainer_master; + bool m_is_setup = false; /// set to true if data_store is preloaded - bool m_preload; + bool m_preload = false; /// set to true if data_store is being explicitly loaded - bool m_explicit_loading; + //VBE: please explain what this means! + bool m_explicit_loading = false; /// The size of the mini-batch that was used to calculate ownership /// of samples when building the owner map. This size has to be /// used consistently when computing the indices that will be sent /// and received. - int m_owner_map_mb_size; + int m_owner_map_mb_size = 0; /// if true, use exchange_data_by_super_node, else use /// exchange_data_by_sample; default if false - bool m_super_node; + bool m_super_node = false; + + /// size of a compacted conduit::Node that contains a single sample + int m_compacted_sample_size = 0; + + bool m_is_local_cache = false; + + bool m_node_sizes_vary = false; + + /// used in exchange_data_by_sample, when sample sizes are non-uniform + bool m_have_sample_sizes = false; + + generic_data_reader *m_reader; + + lbann_comm *m_comm; + + /// convenience handle + bool m_world_master; + + /// convenience handle + bool m_trainer_master; + + /// rank in the trainer; convenience handle + int m_rank_in_trainer; + + /// number of procs in the trainer; convenience handle + int m_np_in_trainer; /// maps an index to the processor that owns the associated data mutable std::unordered_map m_owner; @@ -231,9 +242,6 @@ protected : std::vector m_outgoing_msg_sizes; std::vector m_incoming_msg_sizes; - /// size of a compacted conduit::Node that contains a single sample - int m_compacted_sample_size; - /// used in exchange_data_by_super_node(); contains the super_nodes, /// after they have been converted from compacted format std::vector m_reconstituted; @@ -269,19 +277,12 @@ protected : void error_check_compacted_node(const conduit::Node &nd, int data_id); - bool m_is_local_cache; - - bool m_node_sizes_vary; - /// for use when conduit Nodes have non-uniform size, e.g, imagenet std::unordered_map m_sample_sizes; /// used in set_conduit_node(...) std::mutex m_mutex; - /// used in exchange_data_by_sample, when sample sizes are non-uniform - bool m_have_sample_sizes; - /// Currently only used for imagenet. 
On return, 'sizes' maps a sample_id to image size, and indices[p] contains the sample_ids that P_p owns /// for use in local cache mode void get_image_sizes(std::unordered_map &sizes, std::vector> &indices); diff --git a/src/data_readers/data_reader_imagenet.cpp b/src/data_readers/data_reader_imagenet.cpp index 873fd7c85c4..0e3e9e55c73 100644 --- a/src/data_readers/data_reader_imagenet.cpp +++ b/src/data_readers/data_reader_imagenet.cpp @@ -87,8 +87,8 @@ bool imagenet_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) { if (is_master()) { LBANN_WARNING("m_data_store != nullptr, but we are not retrivieving a node from the store; role: " + get_role() + "; this is probably OK for test mode, but may be an error for train or validate modes"); } - m_issue_warning = false; } + m_issue_warning = false; load_image(image_path, image, dims); have_node = false; } diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index e5c5a5377bb..2e78887c1a3 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -37,6 +37,7 @@ #include #include #include +#include namespace lbann { @@ -66,17 +67,8 @@ namespace lbann { data_store_conduit::data_store_conduit( generic_data_reader *reader) : - m_n(0), - m_is_setup(false), - m_reader(reader), - m_preload(false), - m_explicit_loading(false), - m_owner_map_mb_size(0), - m_super_node(false), - m_compacted_sample_size(0), - m_is_local_cache(false), - m_node_sizes_vary(false), - m_have_sample_sizes(false) { + m_reader(reader) { + m_comm = m_reader->get_comm(); if (m_comm == nullptr) { LBANN_ERROR(" m_comm is nullptr"); @@ -121,7 +113,14 @@ data_store_conduit::~data_store_conduit() { m_output.close(); } if (m_is_local_cache && m_mem_seg) { - shm_unlink(m_seg_name.c_str()); + int sanity = shm_unlink(m_seg_name.c_str()); + if (sanity != 0) { + std::cerr << "\nWARNING: shm_unlink failed in data_store_conduit::~data_store_conduit()\n"; + } + sanity = munmap(reinterpret_cast(m_mem_seg), m_mem_seg_length); + if (sanity != 0) { + std::cerr << "\nWARNING: munmap failed in data_store_conduit::~data_store_conduit()\n"; + } } } @@ -153,21 +152,22 @@ void data_store_conduit::set_role(const std::string role) { void data_store_conduit::copy_members(const data_store_conduit& rhs, const std::vector& ds_sample_move_list) { m_n = rhs.m_n; m_is_setup = rhs.m_is_setup; - m_reader = rhs.m_reader; - m_comm = rhs.m_comm; - m_rank_in_trainer = rhs.m_rank_in_trainer; - m_np_in_trainer = rhs.m_np_in_trainer; - m_world_master = rhs.m_world_master; - m_trainer_master = rhs.m_trainer_master; m_preload = rhs.m_preload; m_explicit_loading = rhs.m_explicit_loading; - m_owner = rhs.m_owner; - m_shuffled_indices = rhs.m_shuffled_indices; m_owner_map_mb_size = rhs.m_owner_map_mb_size; m_super_node = rhs.m_super_node; m_compacted_sample_size = rhs.m_compacted_sample_size; m_is_local_cache = rhs.m_is_local_cache; m_node_sizes_vary = rhs.m_node_sizes_vary; + m_have_sample_sizes = rhs.m_have_sample_sizes; + m_reader = rhs.m_reader; + m_comm = rhs.m_comm; + m_world_master = rhs.m_world_master; + m_trainer_master = rhs.m_trainer_master; + m_rank_in_trainer = rhs.m_rank_in_trainer; + m_np_in_trainer = rhs.m_np_in_trainer; + m_owner = rhs.m_owner; + m_shuffled_indices = rhs.m_shuffled_indices; m_sample_sizes = rhs.m_sample_sizes; /// This block needed when carving a validation set from the training set @@ -177,9 +177,15 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: } if(ds_sample_move_list.size() 
== 0) { + if (m_trainer_master) { + std::cout << "data_store_conduit::copy_members; ds_sample_move_list.size = 0; copying all entries in m_data\n"; + } m_data = rhs.m_data; } else { /// Move indices on the list from the data and owner maps in the RHS data store to the new data store + if (m_trainer_master) { + std::cout << "data_store_conduit::copy_members; ds_sample_move_list.size != 0; copying ONLY SOME entries in m_data\n"; + } for(auto&& i : ds_sample_move_list) { if(rhs.m_data.find(i) != rhs.m_data.end()){ @@ -641,12 +647,13 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s for (int p=0; p &indices = m_indices_to_recv[p]; + int sanity = 0; for (auto index : indices) { - + ++sanity; int sz = m_compacted_sample_size; if (m_node_sizes_vary) { if (m_sample_sizes.find(index) == m_sample_sizes.end()) { - LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: " + std::to_string(index) + "; m_sample_sizes.size(): " + std::to_string(m_sample_sizes.size()) + " role: " + m_reader->get_role()); + LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: " + std::to_string(index) + "; m_sample_sizes.size(): " + std::to_string(m_sample_sizes.size()) + " role: " + m_reader->get_role() + " for index: " + std::to_string(sanity) + " of " + std::to_string(indices.size())); } sz = m_sample_sizes[index]; } @@ -1128,11 +1135,39 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map &si m_mem_seg_length = size; //need to ensure name is unique across all data readers - m_seg_name = "our_town_" + m_reader->get_role(); + m_seg_name = "/our_town_" + m_reader->get_role(); + //in case a previous run was aborted, attempt to remove the file, which + //may or may not exist int node_id = m_comm->get_rank_in_node(); if (node_id == 0) { - int shm_fd = shm_open(m_seg_name.c_str(), O_CREAT | O_RDWR | O_EXCL, 0666); + std::stringstream s; + s << "rm -rf /dev/shm/" << m_seg_name; + system(s.str().c_str()); + } + + #if 0 + debug block; may go away + for (int i=0; iget_role() << "; m_rank_in_trainer: " << m_rank_in_trainer << std::endl; + system("ls -l /dev/shm"); + s << "rm -rf /dev/shm/" << m_seg_name; + system(s.str().c_str()); + std::cerr << "\nls -l /dev/shm; AFTER rm -rf; role: " << m_reader->get_role() << "; m_rank_in_trainer: " << m_rank_in_trainer << std::endl; + system("ls -l /dev/shm"); + } + } + m_comm->trainer_barrier(); + } + #endif + + int shm_fd; + + if (node_id == 0) { + shm_fd = shm_open(m_seg_name.c_str(), O_CREAT | O_RDWR | O_EXCL, 0666); if (shm_fd == -1) { LBANN_ERROR("shm_open failed"); } @@ -1140,22 +1175,27 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map &si if (v != 0) { LBANN_ERROR("ftruncate failed"); } - void *m = mmap(0, size, PROT_WRITE, MAP_SHARED, shm_fd, 0); - if (*static_cast(m) == -1) { + void *m = mmap(0, size, PROT_WRITE | PROT_READ, MAP_SHARED, shm_fd, 0); + if (m == MAP_FAILED) { LBANN_ERROR("mmap failed"); } m_mem_seg = reinterpret_cast(m); + std::fill_n(m_mem_seg, m_mem_seg_length, 1); + int sanity = msync(static_cast(m_mem_seg), m_mem_seg_length, MS_SYNC); + if (sanity != 0) { + LBANN_ERROR("msync failed"); + } } m_comm->barrier(m_comm->get_node_comm()); if (node_id != 0) { - int shm_fd = shm_open(m_seg_name.c_str(), O_RDONLY | O_EXCL, 0666); + shm_fd = shm_open(m_seg_name.c_str(), O_RDONLY, 0666); if (shm_fd == -1) { - LBANN_ERROR("shm_open failed"); + LBANN_ERROR("shm_open failed for filename: " + m_seg_name); } - void *m = mmap(0, size, PROT_WRITE, MAP_SHARED, shm_fd, 0); 
- if (*static_cast(m) == -1) { + void *m = mmap(0, size, PROT_READ, MAP_SHARED, shm_fd, 0); + if (m == MAP_FAILED) { LBANN_ERROR("mmap failed"); } m_mem_seg = reinterpret_cast(m); @@ -1169,6 +1209,7 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map &si LBANN_ERROR("b.st_size= " + std::to_string(b.st_size) + " should be equal to " + std::to_string(size)); } } + close(shm_fd); } void data_store_conduit::preload_local_cache() { @@ -1265,7 +1306,6 @@ void data_store_conduit::build_conduit_nodes(std::unordered_map &sizes) } void data_store_conduit::fillin_shared_images(const std::vector &images, int offset) { - if (m_world_master) std::cerr << "YY data_store_conduit::fillin_shared_images; offest: " << offset << " seg size: " << m_mem_seg_length << std::endl; memcpy(m_mem_seg+offset, reinterpret_cast(images.data()), images.size()); } @@ -1293,22 +1333,9 @@ void data_store_conduit::exchange_images(std::vector &work, std::unordered for (size_t r=0; rbarrier(m_comm->get_node_comm()); - -if (m_world_master) { - if (m_reader->get_role() == "train") { - FILE *f = fopen("xyz", "w"); - for (size_t j=0; j Date: Fri, 28 Jun 2019 09:51:30 -0700 Subject: [PATCH 112/634] modified copy_members --- src/data_store/data_store_conduit.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 2e78887c1a3..57915d5687c 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -169,6 +169,10 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: m_owner = rhs.m_owner; m_shuffled_indices = rhs.m_shuffled_indices; m_sample_sizes = rhs.m_sample_sizes; + m_mem_seg = rhs.m_mem_seg; + m_mem_seg_length = rhs.m_mem_seg_length; + m_seg_name = rhs.m_seg_name; + m_image_offsets = rhs.m_image_offsets; /// This block needed when carving a validation set from the training set if (options::get()->get_bool("debug") && !m_output) { From 4fbd23b41855607b275edb1146aeb974bf0b52a9 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 28 Jun 2019 10:31:31 -0700 Subject: [PATCH 113/634] Refactoring Python utility classes (#1100) * Launcher in Python frontend can accept `None` as optimizer. * Refactoring Python utility classes. `session` (formerly `manager`) is mostly accessed with static functions and the singleton instance is used mostly internally. `object` will take GIL whenever it calls Python API. * Documenting Python utility classes. * Moving Python utility classes to utility header. * Minor tweaks to Python utility documentation. 
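
A minimal sketch of how the refactored pieces fit together (illustrative
only; "my_module", "answer", and the helper function are hypothetical and
not part of this patch):

    #include "lbann/utils/python.hpp"

    long get_answer_from_python() {
      // Start the embedded interpreter if it isn't running yet (idempotent),
      // then hold the GIL for the rest of this scope.
      lbann::python::session::start_once();
      lbann::python::global_interpreter_lock gil;
      // Each python::object steals the new reference returned by the C API
      // call and decrements it automatically when it goes out of scope. Its
      // constructor surfaces any Python error as an LBANN exception.
      lbann::python::object module = PyImport_ImportModule("my_module");
      lbann::python::object func = PyObject_GetAttrString(module, "answer");
      lbann::python::object result = PyObject_CallObject(func, nullptr);
      long answer = PyLong_AsLong(result);
      // Defensive check in case the conversion above raised in Python.
      lbann::python::session::check_error();
      return answer;
    }
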
--- .../lbann/data_readers/data_reader_python.hpp | 102 +------- include/lbann/utils/CMakeLists.txt | 1 + include/lbann/utils/python.hpp | 175 ++++++++++++++ python/lbann/launcher/__init__.py | 10 +- python/lbann/proto.py | 31 ++- src/data_readers/data_reader_python.cpp | 188 ++------------- src/utils/CMakeLists.txt | 1 + src/utils/python.cpp | 221 ++++++++++++++++++ 8 files changed, 447 insertions(+), 282 deletions(-) create mode 100644 include/lbann/utils/python.hpp create mode 100644 src/utils/python.cpp diff --git a/include/lbann/data_readers/data_reader_python.hpp b/include/lbann/data_readers/data_reader_python.hpp index ca289ff4770..9d31503c648 100644 --- a/include/lbann/data_readers/data_reader_python.hpp +++ b/include/lbann/data_readers/data_reader_python.hpp @@ -29,106 +29,10 @@ #include "data_reader.hpp" #ifdef LBANN_HAS_PYTHON -#include +#include "lbann/utils/python.hpp" namespace lbann { -namespace python { - -/** @brief Singleton class to manage embedded Python session. - * - * This is very experimental. Be warned. - */ -class manager { -public: - - /** @brief Get singleton instance. */ - static manager& get_instance(); - /** @brief Construct singleton instance. - * @details If there is already an instance, it is destroyed. - */ - static void create(); - /** Destroy singleton instance. */ - static void destroy(); - - /** @brief Check if a Python error has occurred. - * - * Throw an exception if an error is detected. - * - * @param force_error Whether to force an exception to be thrown. - */ - void check_error(bool force_error = false) const; - - ~manager(); - -private: - - /** @brief Singleton instance. */ - static std::unique_ptr m_instance; - - /** @brief State on main Python thread. */ - PyThreadState* m_thread_state = nullptr; - - // Lifetime functions - manager(); - manager(const manager&) = delete; - manager& operator=(const manager&) = delete; - -}; - -/** @brief RAII wrapper for Python GIL. - * - * The Python interpreter is not thread-safe, so it uses the "global - * interpreter lock" to ensure only one thread is executing at a - * time. Multithreading is achieved by periodically transferring - * control of the GIL between threads. This makes it hard to get - * meaningful speedups from simple multithreading. Certain - * operations, e.g. I/O and numerical kernels in NumPy, can be - * efficiently parallelized because they yield control of the GIL - * while working. - * - * This is very experimental. Be warned. - */ -class global_interpreter_lock { -public: - - global_interpreter_lock(const manager&); - ~global_interpreter_lock(); - -private: - - global_interpreter_lock(const global_interpreter_lock&) = delete; - global_interpreter_lock& operator=(const global_interpreter_lock&) = delete; - - PyGILState_STATE m_gil_state; - -}; - -/** @brief Convenience wrapper around @c PyObject pointer. - * - * This is very experimental. Be warned. 
- */ -class object { -public: - object(PyObject* obj = nullptr); - object(std::string val); - object(El::Int val); - object(DataType val); - object(const object& other); - object& operator=(const object& other); - object(object&& other); - object& operator=(object&& other); - ~object(); - inline PyObject* get() { return m_ptr; } - inline const PyObject* get() const { return m_ptr; } - inline operator PyObject*() { return get(); } - inline operator const PyObject*() const { return get(); } -private: - PyObject* m_ptr; -}; - -} // namespace python - class python_reader : public generic_data_reader { public: python_reader(std::string module, @@ -180,8 +84,6 @@ class python_reader : public generic_data_reader { * This function will be executed on worker processes (see @c * m_process_pool). It will obtain a data sample from @c * m_sample_function and copy it into a @c m_shared_memory_array. - * - * @todo Performance optimizations for NumPy data. */ python::object m_sample_function_wrapper; @@ -193,7 +95,7 @@ class python_reader : public generic_data_reader { /** @brief Shared memory array. * - * @c RawArray the Python @c multiprocessing module. + * @c RawArray from the Python @c multiprocessing module. */ python::object m_shared_memory_array; diff --git a/include/lbann/utils/CMakeLists.txt b/include/lbann/utils/CMakeLists.txt index 6dbd75433f1..c510021d3ff 100644 --- a/include/lbann/utils/CMakeLists.txt +++ b/include/lbann/utils/CMakeLists.txt @@ -24,6 +24,7 @@ set_full_path(THIS_DIR_HEADERS options.hpp profiling.hpp prototext.hpp + python.hpp random.hpp statistics.hpp summary.hpp diff --git a/include/lbann/utils/python.hpp b/include/lbann/utils/python.hpp new file mode 100644 index 00000000000..1bab09a2346 --- /dev/null +++ b/include/lbann/utils/python.hpp @@ -0,0 +1,175 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_UTILS_PYTHON_HPP_INCLUDED +#define LBANN_UTILS_PYTHON_HPP_INCLUDED + +#include "lbann/base.hpp" +#ifdef LBANN_HAS_PYTHON +#include +#include + +namespace lbann { +namespace python { + +/** @brief Singleton class to manage embedded Python session. + * + * This mostly manages the initialization and finalization of the + * Python session. It is rarely necessary to interact with the + * singleton instance directly. + * + * All static member functions are thread-safe. 
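+ *
+ * Typical call pattern (illustrative sketch only, not part of the
+ * interface):
+ * @code
+ * session::start_once();  // safe from any thread, any number of times
+ * if (session::is_active()) { session::check_error(); }
+ * @endcode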
+ */ +class session { +public: + + /** @brief Start embedded Python session if not already running. + * @details Does nothing if Python has already been started. + */ + static void start_once(); + + /** @brief Check if embedded Python session is running. */ + static bool is_active() noexcept; + + /** @brief Check if a Python error has occurred. + * + * Throws an exception if a Python error is detected. + * + * @param force_error Whether to force an exception to be thrown. + */ + static void check_error(bool force_error = false); + + /** @brief Get singleton instance. + * + * Initializes an embedded Python session the first time it is + * called. + */ + static session& get(); + + ~session(); + +private: + + /** @brief State on main Python thread. */ + PyThreadState* m_thread_state = nullptr; + + // Lifetime functions + session(); + session(const session&) = delete; + session& operator=(const session&) = delete; + +}; + +/** @brief RAII wrapper for Python GIL. + * + * The Python interpreter is not thread-safe, so it uses the "global + * interpreter lock" to ensure only one thread is executing at a + * time. Make sure to acquire the GIL before calling Python C API + * functions. The GIL can be acquired recursively, i.e. you can + * acquire the GIL even if you already control it. + */ +class global_interpreter_lock { +public: + global_interpreter_lock(); + ~global_interpreter_lock(); +private: + global_interpreter_lock(const global_interpreter_lock&) = delete; + global_interpreter_lock& operator=(const global_interpreter_lock&) = delete; + PyGILState_STATE m_gil_state; +}; + +/** @brief Wrapper around a Python object pointer. + * + * Manages the reference count for a @c PyObject pointer and is + * implicitly convertible to the pointer. This is especially + * convenient for interacting with Python C API functions that @a + * borrow references and return @a new references (this is the most + * common kind). + * + * This class is @a not thread-safe. However, it's best practice to + * acquire the GIL before doing any Python operations, so access will + * typically be serialized. + * + * Handling reference counts is a tricky part of the Python C API. Be + * especially careful with functions that @a steal references or + * return @a borrowed references. See + * + * https://docs.python.org/3.7/c-api/intro.html#reference-counts + * + * for an explanation of reference counts. + */ +class object { +public: + + /** @brief Take ownership of a Python object pointer. + * @details @a Steals the reference. + */ + object(PyObject* ptr); + + /** @brief Create a Python string. */ + object(const std::string& val); + /** @brief Create a Python integer. */ + object(long val); + /** @brief Create a Python floating point number. */ + object(double val); + + object() {} + /** @details @a Borrows the reference. */ + object(const object& other); + /** @details @a Borrows the reference. */ + object& operator=(const object& other); + /** @details @a Steals the reference. */ + object(object&& other) noexcept; + /** @details @a Steals the reference. */ + object& operator=(object&& other); + ~object(); + + /** @returns @a Borrowed reference. */ + inline PyObject* get() noexcept { return m_ptr; } + /** @returns @a Borrowed reference. */ + inline const PyObject* get() const noexcept { return m_ptr; } + /** @returns @a Borrowed reference. */ + inline operator PyObject*() noexcept { return get(); } + /** @returns @a Borrowed reference. 
*/ + inline operator const PyObject*() const noexcept { return get(); } + + /** @brief Release ownership of Python object pointer. + * @returns @a New reference. + */ + PyObject* release() noexcept; + +private: + + /** Python object pointer. */ + PyObject* m_ptr = nullptr; + +}; + +} // namespace python +} // namespace lbann + +#endif // LBANN_HAS_PYTHON +#endif // LBANN_UTILS_PYTHON_HPP_INCLUDED diff --git a/python/lbann/launcher/__init__.py b/python/lbann/launcher/__init__.py index a3579d71ba0..b42f7311c99 100644 --- a/python/lbann/launcher/__init__.py +++ b/python/lbann/launcher/__init__.py @@ -42,8 +42,8 @@ def run(model, data_reader, optimizer, model (lbann.model.Model or lbann_pb2.Model): Neural network model. data_reader (lbann_pb2.DataReader): Data reader. - optimizer (lbann.model.Model or lbann_pb2.Optimizer): Default - optimizer for model. + optimizer (lbann.model.Optimizer or lbann_pb2.Optimizer): + Default optimizer for model. lbann_exe (str, optional): LBANN executable. lbann_args (str, optional): Command-line arguments to LBANN executable. @@ -94,9 +94,9 @@ def run(model, data_reader, optimizer, # Create experiment prototext file prototext_file = os.path.join(experiment_dir, 'experiment.prototext') lbann.proto.save_prototext(prototext_file, - model = model, - data_reader = data_reader, - optimizer = optimizer) + model=model, + data_reader=data_reader, + optimizer=optimizer) lbann_args += ' --prototext=' + prototext_file # Run experiment diff --git a/python/lbann/proto.py b/python/lbann/proto.py index 2dcac9fb77e..f1166851a85 100644 --- a/python/lbann/proto.py +++ b/python/lbann/proto.py @@ -13,12 +13,33 @@ def save_prototext(filename, **kwargs): """ # Construct protobuf message - for key, value in kwargs.items(): - if not isinstance(value, google.protobuf.message.Message): - kwargs[key] = value.export_proto() - pb = lbann_pb2.LbannPB(**kwargs) + message = lbann_pb2.LbannPB() + field_names = message.DESCRIPTOR.fields_by_name.keys() + + # Make sure keyword arguments are valid + for key, val in kwargs.items(): + if key not in field_names: + raise TypeError("'{}' is an invalid keyword " + "argument for this function".format(key)) + if val is not None: + field = getattr(message, key) + if isinstance(val, google.protobuf.message.Message): + field.CopyFrom(val) + else: + field.CopyFrom(val.export_proto()) + field.SetInParent() + + # Make sure default optimizer is set + # TODO: This is a hack that should be removed when possible. LBANN + # requires the prototext file to provide a default optimizer. It + # would be better if LBANN used no optimizer if one isn't + # provided. 
+ if not message.HasField('optimizer'): + from lbann import Optimizer + message.optimizer.CopyFrom(Optimizer().export_proto()) + message.optimizer.SetInParent() # Write to file with open(filename, 'wb') as f: f.write(google.protobuf.text_format.MessageToString( - pb, use_index_order=True).encode()) + message, use_index_order=True).encode()) diff --git a/src/data_readers/data_reader_python.cpp b/src/data_readers/data_reader_python.cpp index ffa1c7d16e4..65a9c109a4c 100644 --- a/src/data_readers/data_reader_python.cpp +++ b/src/data_readers/data_reader_python.cpp @@ -25,170 +25,15 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/data_readers/data_reader_python.hpp" -#include "lbann/models/model.hpp" #ifdef LBANN_HAS_PYTHON #include #include #include +#include "lbann/models/model.hpp" +#include "lbann/utils/python.hpp" namespace lbann { -namespace python { - -// Static variables -std::unique_ptr manager::m_instance; - -manager& manager::get_instance() { - if (m_instance == nullptr) { create(); } - return *m_instance; -} - -void manager::create() { - m_instance.reset(new manager()); -} - -void manager::destroy() { - m_instance.reset(nullptr); -} - -manager::manager() { - if (!Py_IsInitialized()) { - - // Hack to display output from Python - // Note: Python outputs didn't appear because MPI intercepts - // stdout and stderr. See - // https://stackoverflow.com/questions/29352485/python-print-not-working-when-embedded-into-mpi-program - Py_UnbufferedStdioFlag = 1; - - // Initialize embedded Python session - Py_Initialize(); - PyEval_InitThreads(); - - // Release GIL - m_thread_state = PyEval_SaveThread(); - - } - if (!Py_IsInitialized()) { - LBANN_ERROR("error creating embedded Python session"); - } -} - -manager::~manager() { - if (Py_IsInitialized()) { - if (m_thread_state != nullptr) { - PyEval_RestoreThread(m_thread_state); - } - Py_Finalize(); - } -} - -void manager::check_error(bool force_error) const { - global_interpreter_lock gil(*this); - if (force_error || PyErr_Occurred()) { - - // Get error information from Python session - PyObject *type, *value, *traceback; - PyErr_Fetch(&type, &value, &traceback); - - // Construct error message - std::ostringstream err; - err << "detected Python error"; - if (value != nullptr) { - auto msg = PyObject_Repr(value); - auto msg_str = PyUnicode_AsEncodedString(msg, "utf-8", "Error -"); - err << " (" << PyBytes_AS_STRING(msg_str) << ")"; - Py_XDECREF(msg_str); - Py_XDECREF(msg); - } - - // Print Python traceback if available - if (traceback != nullptr) { - - // Format traceback - auto module = PyImport_ImportModule("traceback"); - auto func = PyObject_GetAttrString(module, "format_tb"); - auto args = PyTuple_Pack(1, traceback); - auto message = PyObject_CallObject(func, args); - - // Print traceback - err << "\n\n" << "Python traceback:"; - auto iter = PyObject_GetIter(message); - for (auto line = PyIter_Next(iter); - line != nullptr; - line = PyIter_Next(iter)) { - const char* line_ = PyUnicode_AsUTF8(line); - err << "\n" << (line_ ? 
line_ : ""); - Py_DECREF(line); - } - - // Clean up - Py_XDECREF(iter); - Py_XDECREF(message); - Py_XDECREF(args); - Py_XDECREF(func); - Py_XDECREF(module); - - } - - // Clean up and throw exception - Py_XDECREF(type); - Py_XDECREF(value); - Py_XDECREF(traceback); - LBANN_ERROR(err.str()); - - } -} - -global_interpreter_lock::global_interpreter_lock(const manager&) - : m_gil_state(PyGILState_Ensure()) {} - -global_interpreter_lock::~global_interpreter_lock() { - if (Py_IsInitialized()) { - PyGILState_Release(m_gil_state); - } -} - -object::object(PyObject* ptr) : m_ptr(ptr) { - if (Py_IsInitialized() && PyErr_Occurred()) { - manager::get_instance().check_error(); - } -} - -object::object(std::string val) - : object(PyUnicode_FromStringAndSize(val.c_str(), val.size())) {} -object::object(El::Int val) : object(PyLong_FromLong(val)) {} -object::object(DataType val) : object(PyFloat_FromDouble(val)) {} - -object::object(const object& other) : m_ptr(other.m_ptr) { - Py_XINCREF(m_ptr); -} - -object& object::operator=(const object& other) { - Py_XDECREF(m_ptr); - m_ptr = other.m_ptr; - Py_XINCREF(m_ptr); - return *this; -} - -object::object(object&& other) : m_ptr(other.m_ptr) { - other.m_ptr = nullptr; -} - -object& object::operator=(object&& other) { - Py_XDECREF(m_ptr); - m_ptr = other.m_ptr; - other.m_ptr = nullptr; - return *this; -} - -object::~object() { - if (Py_IsInitialized()) { - Py_XDECREF(m_ptr); - } -} - -} // namespace python - python_reader::python_reader(std::string module, std::string module_dir, std::string sample_function, @@ -196,15 +41,15 @@ python_reader::python_reader(std::string module, std::string sample_dims_function) : generic_data_reader(true) { - // Acquire Python GIL - auto& manager = python::manager::get_instance(); - python::global_interpreter_lock gil(manager); + // Make sure Python is running and acquire GIL + python::session::start_once(); + python::global_interpreter_lock gil; // Import Python module for data if (!module_dir.empty()) { auto path = PySys_GetObject("path"); // Borrowed reference PyList_Append(path, python::object(module_dir)); - manager.check_error(); + python::session::check_error(); } python::object data_module = PyImport_ImportModule(module.c_str()); @@ -213,7 +58,7 @@ python_reader::python_reader(std::string module, = PyObject_GetAttrString(data_module, num_samples_function.c_str()); python::object num = PyObject_CallObject(num_func, nullptr); m_num_samples = PyLong_AsLong(num); - manager.check_error(); + python::session::check_error(); // Get sample dimensions python::object dims_func @@ -224,7 +69,7 @@ python_reader::python_reader(std::string module, m_sample_dims.push_back(PyLong_AsLong(d)); Py_DECREF(d); } - manager.check_error(); + python::session::check_error(); // Get sample access function m_sample_function = PyObject_GetAttrString(data_module, @@ -233,9 +78,10 @@ python_reader::python_reader(std::string module, } python_reader::~python_reader() { - if (Py_IsInitialized() && m_process_pool != nullptr) { - python::global_interpreter_lock gil(python::manager::get_instance()); + if (python::session::is_active() && m_process_pool != nullptr) { + python::global_interpreter_lock gil; PyObject_CallMethod(m_process_pool, "terminate", nullptr); + PyObject_CallMethod(m_process_pool, "join", nullptr); } } @@ -266,8 +112,7 @@ bool python_reader::fetch_data_block(CPUMat& X, // Acquire Python GIL on first IO thread // Note: Do nothing on other IO threads. 
if (thread_id != 0) { return true; } - auto& manager = python::manager::get_instance(); - python::global_interpreter_lock gil(manager); + python::global_interpreter_lock gil; // Check that shared memory array is large enough const El::Int sample_size = get_linearized_data_size(); @@ -319,8 +164,7 @@ void python_reader::setup(int num_io_threads, generic_data_reader::setup(num_io_threads, io_thread_pool); // Acquire Python GIL - auto& manager = python::manager::get_instance(); - python::global_interpreter_lock gil(manager); + python::global_interpreter_lock gil; // Import modules python::object main_module = PyImport_ImportModule("__main__"); @@ -372,14 +216,14 @@ void python_reader::setup(int num_io_threads, PyObject_SetAttrString(main_module, sample_func_name.c_str(), m_sample_function); - manager.check_error(); + python::session::check_error(); const std::string shared_array_name = ("_DATA_READER_PYTHON_CPP_shared_memory_array" + std::to_string(instance_id)); PyObject_SetAttrString(main_module, shared_array_name.c_str(), m_shared_memory_array); - manager.check_error(); + python::session::check_error(); // Create wrapper around sample function // Note: We attempt accessing the sample with the buffer protocol @@ -429,7 +273,7 @@ def @wrapper_func@(sample_index, array_offset): std::regex("\\@datatype_typecode\\@"), datatype_typecode); PyRun_SimpleString(wrapper_func_def.c_str()); - manager.check_error(); + python::session::check_error(); m_sample_function_wrapper = PyObject_GetAttrString(main_module, wrapper_func_name.c_str()); diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt index 9b19b996dd6..7531ba58a38 100644 --- a/src/utils/CMakeLists.txt +++ b/src/utils/CMakeLists.txt @@ -14,6 +14,7 @@ set_full_path(THIS_DIR_SOURCES options.cpp profiling.cpp protobuf_utils.cpp + python.cpp random.cpp stack_profiler.cpp stack_trace.cpp diff --git a/src/utils/python.cpp b/src/utils/python.cpp new file mode 100644 index 00000000000..b191a1ef9f0 --- /dev/null +++ b/src/utils/python.cpp @@ -0,0 +1,221 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/utils/python.hpp" +#ifdef LBANN_HAS_PYTHON +#include +#include "lbann/utils/exception.hpp" + +namespace lbann { +namespace python { + +// --------------------------------------------- +// session class +// --------------------------------------------- + +void session::start_once() { get(); } + +bool session::is_active() noexcept { return Py_IsInitialized(); } + +void session::check_error(bool force_error) { + start_once(); + if (!is_active()) { + LBANN_ERROR("embedded Python session has terminated unexpectedly"); + } + global_interpreter_lock gil; + if (force_error || PyErr_Occurred()) { + + // Get error information from Python session + PyObject *type_ptr, *value_ptr, *traceback_ptr; + PyErr_Fetch(&type_ptr, &value_ptr, &traceback_ptr); + object type(type_ptr), value(value_ptr), traceback(traceback_ptr); + + // Construct error message + std::ostringstream err; + err << "detected Python error"; + if (value != nullptr) { + object msg = PyObject_Repr(value); + msg = PyUnicode_AsEncodedString(msg, "utf-8", "Error -"); + err << " (" << PyBytes_AS_STRING(msg.get()) << ")"; + } + + // Print Python traceback if available + if (traceback != nullptr) { + + // Format traceback + object module = PyImport_ImportModule("traceback"); + object func = PyObject_GetAttrString(module, "format_tb"); + object message = PyObject_CallMethod(module, + "format_tb", + "(O)", + traceback.get()); + + // Print traceback + err << "\n\n" << "Python traceback:"; + object iter = PyObject_GetIter(message); + for (object line = PyIter_Next(iter); + line != nullptr; + line = PyIter_Next(iter)) { + const char* line_ = PyUnicode_AsUTF8(line); + err << "\n" << (line_ ? line_ : ""); + } + + } + + // Clean up and throw exception + PyErr_Restore(type.release(), value.release(), traceback.release()); + LBANN_ERROR(err.str()); + + } +} + +session& session::get() { + // Initializing static local variables is thread-safe as of C++11 + static session instance; + return instance; +} + +session::session() { + if (!is_active()) { + + // Hack to display output from Python + // Note: Python outputs didn't appear because MPI intercepts + // stdout and stderr. 
See + // https://stackoverflow.com/questions/29352485/python-print-not-working-when-embedded-into-mpi-program + Py_UnbufferedStdioFlag = 1; + + // Initialize embedded Python session + Py_Initialize(); + PyEval_InitThreads(); + + // Release GIL + m_thread_state = PyEval_SaveThread(); + + } + if (!is_active()) { + LBANN_ERROR("error initializing embedded Python session"); + } +} + +session::~session() { + if (is_active()) { + if (m_thread_state != nullptr) { + PyEval_RestoreThread(m_thread_state); + } + Py_Finalize(); + } + if (is_active()) { + LBANN_WARNING("error finalizing embedded Python session"); + } +} + +// --------------------------------------------- +// global_interpreter_lock class +// --------------------------------------------- + +global_interpreter_lock::global_interpreter_lock() { + session::start_once(); + if (!session::is_active()) { + LBANN_ERROR("embedded Python session has terminated unexpectedly"); + } + m_gil_state = PyGILState_Ensure(); +} + +global_interpreter_lock::~global_interpreter_lock() { + if (session::is_active()) { + PyGILState_Release(m_gil_state); + } +} + +// --------------------------------------------- +// object class +// --------------------------------------------- + +object::object(PyObject* ptr) : m_ptr(ptr) { + session::check_error(); +} +object::object(const std::string& val) { + global_interpreter_lock gil; + m_ptr = PyUnicode_FromStringAndSize(val.c_str(), val.size()); + session::check_error(); +} +object::object(long val) { + global_interpreter_lock gil; + m_ptr = PyLong_FromLong(val); + session::check_error(); +} +object::object(double val) { + global_interpreter_lock gil; + m_ptr = PyFloat_FromDouble(val); + session::check_error(); +} + +object::object(const object& other) : m_ptr(other.m_ptr) { + global_interpreter_lock gil; + m_ptr = other.m_ptr; + Py_XINCREF(m_ptr); + session::check_error(); +} + +object& object::operator=(const object& other) { + global_interpreter_lock gil; + Py_XDECREF(m_ptr); + m_ptr = other.m_ptr; + Py_XINCREF(m_ptr); + session::check_error(); + return *this; +} + +object::object(object&& other) noexcept : m_ptr(other.m_ptr) { + other.m_ptr = nullptr; +} + +object& object::operator=(object&& other) { + global_interpreter_lock gil; + Py_XDECREF(m_ptr); + m_ptr = other.m_ptr; + other.m_ptr = nullptr; + session::check_error(); + return *this; +} + +object::~object() { + if (session::is_active()) { + global_interpreter_lock gil; + Py_XDECREF(m_ptr); + } +} + +PyObject* object::release() noexcept { + auto old_ptr = m_ptr; + m_ptr = nullptr; + return old_ptr; +} + +} // namespace python +} // namespace lbann + +#endif // LBANN_HAS_PYTHON From e3e550b9b4ea87d04cdb32e15893056ffd47c84c Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 28 Jun 2019 10:59:31 -0700 Subject: [PATCH 114/634] Update checkpoint logs --- bamboo/clean.sh | 1 + bamboo/unit_tests/test_unit_checkpoint.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/bamboo/clean.sh b/bamboo/clean.sh index 254930cb247..2864324c3ca 100755 --- a/bamboo/clean.sh +++ b/bamboo/clean.sh @@ -21,6 +21,7 @@ rm -f ${LBANN_DIR}/bamboo/integration_tests/error/*.txt rm -f ${LBANN_DIR}/bamboo/integration_tests/output/*.txt # Unit Tests +rm -rf ${LBANN_DIR}/bamboo/unit_tests/ckpt_* rm -f ${LBANN_DIR}/bamboo/unit_tests/*.prototext* rm -f ${LBANN_DIR}/bamboo/unit_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/unit_tests/__pycache__ diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index 
be468dccbfc..4d44d02348a 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -27,7 +27,8 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, if return_code_nockpt != 0: sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') sys.exit(1) - os.system('mv ckpt ckpt_baseline') + ckpt_pre = 'ckpt_pre_lenet_shared_{c}'.format(c=compiler_name) + os.system('mv ckpt {c}'.format(c=ckpt_pre)) # Run to checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_checkpoint_%s_output.txt' % (dir_name, compiler_name) @@ -59,8 +60,8 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') sys.exit(1) - diff_test = os.system('diff -rq ckpt ckpt_baseline') - os.system('rm -rf ckpt*') + diff_test = os.system('diff -rq ckpt {c}'.format(c=ckpt_pre)) + os.system('mv ckpt ckpt_post_lenet_shared_{c}'.format(c=compiler_name)) assert diff_test == 0 @@ -86,7 +87,8 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, if return_code_nockpt != 0: sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') sys.exit(1) - os.system('mv ckpt ckpt_baseline') + ckpt_pre = 'ckpt_pre_lenet_distributed_{c}'.format(c=compiler_name) + os.system('mv ckpt {c}'.format(c=ckpt_pre)) # Run to checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_checkpoint_%s_output.txt' % (dir_name, compiler_name) @@ -118,8 +120,8 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') sys.exit(1) - diff_test = os.system('diff -rq ckpt ckpt_baseline') - os.system('rm -rf ckpt*') + diff_test = os.system('diff -rq ckpt {c}'.format(c=ckpt_pre)) + os.system('mv ckpt ckpt_post_lenet_distributed_{c}'.format(c=compiler_name)) assert diff_test == 0 From 93e676c2b0e5afbbae1bfe460d462d6e4befd7a9 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Sun, 7 Jul 2019 22:14:01 -0500 Subject: [PATCH 115/634] LSF interface generates nodes.txt correctly It now correctly runs one instance per node. --- python/lbann/launcher/lsf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lbann/launcher/lsf.py b/python/lbann/launcher/lsf.py index cac6830e7a7..c105944dbd1 100644 --- a/python/lbann/launcher/lsf.py +++ b/python/lbann/launcher/lsf.py @@ -94,7 +94,7 @@ def run(command, # Time and node list. s += '\n# ==== Useful info ====\n' s += 'date\n' - s += 'jsrun -n {} -a 1 hostname > {}\n'.format(nodes, nodes_file) + s += 'jsrun -n {} -a 1 -r 1 hostname > {}\n'.format(nodes, nodes_file) s += 'sort --unique --output={0} {0}\n'.format(nodes_file) # Run experiment. From b0242c2c4d1a940d0b7633b85a5da4fc6d0e8ae4 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 9 Jul 2019 10:15:47 -0700 Subject: [PATCH 116/634] Added option to specify device for layers in Python frontend. 
(#1107) --- python/lbann/layer.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/python/lbann/layer.py b/python/lbann/layer.py index b553cba52c2..14083b47be2 100644 --- a/python/lbann/layer.py +++ b/python/lbann/layer.py @@ -9,9 +9,14 @@ class Layer(abc.ABC): global_count = 0 # Static counter, used for default names - def __init__(self, parents = [], children = [], weights = [], - name = None, data_layout = 'data_parallel', - hint_layer = None): + def __init__(self, + parents=[], + children=[], + weights=[], + name=None, + device=None, + data_layout=None, + hint_layer=None): """Constructor. Args: @@ -23,6 +28,7 @@ def __init__(self, parents = [], children = [], weights = [], parameters. name (str, optional): Unique identifier (default is 'layer'). + device (str, optional): Device to use, e.g. CPU or GPU. data_layout (str, optional): Data distribution scheme. hint_layer (Layer, optional): Hint for output dimensions. @@ -32,6 +38,7 @@ def __init__(self, parents = [], children = [], weights = [], self.children = [] self.weights = [] self.name = name if name else 'layer{0}'.format(Layer.global_count) + self.device = device self.data_layout = data_layout self.hint_layer = hint_layer @@ -50,8 +57,12 @@ def export_proto(self): proto.children = ' '.join([l.name for l in self.children]) proto.weights = ' '.join([w.name for w in self.weights]) proto.name = self.name - proto.data_layout = self.data_layout - proto.hint_layer = self.hint_layer.name if self.hint_layer else '' + if self.device: + proto.device_allocation = self.device + if self.data_layout: + proto.data_layout = self.data_layout + if self.hint_layer: + proto.hint_layer = self.hint_layer.name return proto def add_parent(self, parent): @@ -90,7 +101,7 @@ def __call__(self, parent): base_class = Layer, base_kwargs = set([ 'parents', 'children', 'weights', - 'name', 'data_layout', 'hint_layer']), + 'name', 'device', 'data_layout', 'hint_layer']), base_has_export_proto = True) for c in classes: globals()[c.__name__] = c From 8d8bb0596935e968d6822e68204502f6ae23605f Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 9 Jul 2019 15:59:55 -0700 Subject: [PATCH 117/634] Initial implementation of embedding layer. (#1106) --- include/lbann/layers/learning/CMakeLists.txt | 1 + include/lbann/layers/learning/embedding.hpp | 88 ++++++++++++ include/lbann/lbann.hpp | 1 + src/layers/learning/CMakeLists.txt | 1 + src/layers/learning/embedding.cpp | 142 +++++++++++++++++++ src/proto/factories/layer_factory.cpp | 13 ++ src/proto/lbann.proto | 6 + 7 files changed, 252 insertions(+) create mode 100644 include/lbann/layers/learning/embedding.hpp create mode 100644 src/layers/learning/embedding.cpp diff --git a/include/lbann/layers/learning/CMakeLists.txt b/include/lbann/layers/learning/CMakeLists.txt index ac855e21023..e5f7e6337f4 100644 --- a/include/lbann/layers/learning/CMakeLists.txt +++ b/include/lbann/layers/learning/CMakeLists.txt @@ -3,6 +3,7 @@ set_full_path(THIS_DIR_HEADERS base_convolution.hpp convolution.hpp deconvolution.hpp + embedding.hpp fully_connected.hpp fully_connected_cuda.hpp learning.hpp diff --git a/include/lbann/layers/learning/embedding.hpp b/include/lbann/layers/learning/embedding.hpp new file mode 100644 index 00000000000..67708af9e82 --- /dev/null +++ b/include/lbann/layers/learning/embedding.hpp @@ -0,0 +1,88 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. 
+// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED +#define LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +template +class embedding_layer : public Layer { +public: + + embedding_layer(lbann_comm* comm, + El::Int dictionary_size, + El::Int embedding_size) + : Layer(comm), + m_dictionary_size{dictionary_size}, + m_embedding_size{embedding_size} { + static_assert(Layout == data_layout::DATA_PARALLEL, + "embedding layer only supports data parallel layout"); + static_assert(Device == El::Device::CPU, + "embedding layer only supports CPU"); + } + + embedding_layer(const embedding_layer& other) = default; + embedding_layer& operator=(const embedding_layer& other) = default; + ~embedding_layer() = default; + + embedding_layer* copy() const override { + return new embedding_layer(*this); + } + + std::string get_type() const override { return "embedding"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + description get_description() const override { + auto&& desc = Layer::get_description(); + desc.add("Dictionary size", m_dictionary_size); + desc.add("Embedding size", m_embedding_size); + return desc; + } + +protected: + + void setup_matrices(const El::Grid& grid) override; + void setup_dims() override; + void setup_data() override; + + void fp_compute() override; + void bp_compute() override; + +private: + + El::Int m_dictionary_size; + El::Int m_embedding_size; + StarMat m_dictionary_gradient; + +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index aacbdafb1d6..870580b7690 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -45,6 +45,7 @@ #include "lbann/layers/learning/fully_connected.hpp" #include "lbann/layers/learning/convolution.hpp" #include "lbann/layers/learning/deconvolution.hpp" +#include "lbann/layers/learning/embedding.hpp" /// Loss layers #include "lbann/layers/loss/categorical_accuracy.hpp" diff --git a/src/layers/learning/CMakeLists.txt b/src/layers/learning/CMakeLists.txt index f89b0827617..317f968d510 100644 --- a/src/layers/learning/CMakeLists.txt +++ b/src/layers/learning/CMakeLists.txt @@ -1,5 +1,6 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES + embedding.cpp fully_connected.cpp ) diff --git a/src/layers/learning/embedding.cpp 
b/src/layers/learning/embedding.cpp new file mode 100644 index 00000000000..c87e33aaa90 --- /dev/null +++ b/src/layers/learning/embedding.cpp @@ -0,0 +1,142 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/learning/embedding.hpp" +#include "lbann/models/model.hpp" + +namespace lbann { + +template <> +void embedding_layer::setup_matrices(const El::Grid& grid) { + Layer::setup_matrices(grid); + m_dictionary_gradient = StarMat(grid); +} + +template <> +void embedding_layer::setup_dims() { + Layer::setup_dims(); + + // Make sure input dimensions are valid + if (this->get_input_size() != 1) { + const auto& input_dims = this->get_input_dims(); + std::ostringstream err; + err << get_type() << " layer \"" << get_name() << "\" " + << "recieved an input tensor with invalid dimensions " + << "(expected 1, got "; + for (size_t i = 0; i < input_dims.size(); ++i) { + err << (i > 0 ? "x" : "") << input_dims[i]; + } + err << ")"; + LBANN_ERROR(err.str()); + } + + // Output is size of embedding vector + this->set_output_dims({static_cast(m_embedding_size)}); + +} + +template <> +void embedding_layer::setup_data() { + Layer::setup_data(); + + // Make sure layer has weights for dictionary + if (this->m_weights.size() != 1) { + std::ostringstream err; + err << "attempted to setup " + << this->get_type() << " layer \"" << this->get_name() << "\" " + << "with an invalid number of weights " + << "(expected 1, " + << "found " << this->m_weights.size() << ")"; + LBANN_ERROR(err.str()); + } + + // Initialize dictionary + auto& dict = *m_weights[0]; + auto matrix_dist = get_prev_activations().DistData(); + matrix_dist.colDist = El::STAR; + matrix_dist.rowDist = El::STAR; + dict.set_dims({static_cast(m_embedding_size)}, + {static_cast(m_dictionary_size)}); + dict.set_matrix_distribution(matrix_dist); + + // Initialize gradient w.r.t. 
dictionary + m_dictionary_gradient.Resize(m_embedding_size, m_dictionary_size); + +} + +template <> +void embedding_layer::fp_compute() { + + // Local data + const auto& local_dict = m_weights[0]->get_values().LockedMatrix(); + const auto& local_input = get_local_prev_activations(); + auto& local_output = get_local_activations(); + const auto& local_width = local_input.Width(); + + // Populate output matrix with appropriate columns of dictionary + CPUMat dict_v, output_v; + for (El::Int col = 0; col < local_width; ++ col) { + const El::Int ind = static_cast(local_input(0, col)); + El::LockedView(dict_v, local_dict, El::ALL, El::IR(ind)); + El::View(output_v, local_output, El::ALL, El::IR(col)); + El::Copy(dict_v, output_v); + } + +} + +template <> +void embedding_layer::bp_compute() { + + // Embedding layer is not differentiable w.r.t. inputs + El::Zero(get_error_signals()); + + // Nothing to be done if dictionary is not being optimized + if (m_weights[0]->get_optimizer() == nullptr) { return; } + auto& opt = *m_weights[0]->get_optimizer(); + + // Local data + const auto& local_input = get_local_prev_activations(); + auto& local_dict_grad = m_dictionary_gradient.Matrix(); + const auto& local_output_grad = get_local_prev_error_signals(); + const auto& local_width = local_input.Width(); + const auto& mini_batch_size = this->m_model->get_effective_mini_batch_size(); + + // Update appropriate columns of gradient w.r.t. dictionary + El::Zero(local_dict_grad); + CPUMat dict_grad_v, output_grad_v; + for (El::Int col = 0; col < local_width; ++ col) { + const El::Int ind = static_cast(local_input(0, col)); + El::View(dict_grad_v, local_dict_grad, El::ALL, El::IR(ind)); + El::LockedView(output_grad_v, local_output_grad, El::ALL, El::IR(col)); + El::Axpy(DataType{1}, output_grad_v, dict_grad_v); + } + opt.add_to_gradient(m_dictionary_gradient, + DataType{1} / mini_batch_size, + true); + +} + +} // namespace lbann diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 8f73ac23c2e..7c946758719 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -198,6 +198,19 @@ std::unique_ptr construct_layer( } } + // Embedding layer + if (proto_layer.has_embedding()) { + const auto& params = proto_layer.embedding(); + if (Layout == data_layout::DATA_PARALLEL + && Device == El::Device::CPU) { + return lbann::make_unique>( + comm, params.dictionary_size(), params.embedding_size()); + } else { + LBANN_ERROR("embedding layer is only supported with " + "data-parallel data layout and on CPU"); + } + } + // Transform layers if (proto_layer.has_reshape()) { const auto& params = proto_layer.reshape(); diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index ac6b8ee1092..25478afb88c 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -856,6 +856,7 @@ message Layer { FullyConnected fully_connected = 11; Convolution convolution = 13; Deconvolution deconvolution = 305; + Embedding embedding = 328; // Loss layers CrossEntropy cross_entropy = 60; @@ -1282,6 +1283,11 @@ message Deconvolution { double l2_regularization_factor = 12; //default: 0 } +message Embedding { + int64 dictionary_size = 1; + int64 embedding_size = 2; +} + ////////////////// // Image layers // ////////////////// From f6d8a9d61d24dd9a8d1950a7226122f569d880bc Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Tue, 9 Jul 2019 18:03:13 -0700 Subject: [PATCH 118/634] Remove dropout layer from unit test --- .../prototext/jag_single_layer_ae.prototext | 11 
+---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/bamboo/unit_tests/prototext/jag_single_layer_ae.prototext b/bamboo/unit_tests/prototext/jag_single_layer_ae.prototext index c171bbdfbb9..dbdc59990e1 100644 --- a/bamboo/unit_tests/prototext/jag_single_layer_ae.prototext +++ b/bamboo/unit_tests/prototext/jag_single_layer_ae.prototext @@ -99,18 +99,9 @@ model { data_layout: "data_parallel" elu {} } - layer { - parents: "encodeelu" - name: "encodedropout" - data_layout: "data_parallel" - dropout { - keep_prob: 0.9 - } - } - #Y'(reconstructed images and scalar) layer { - parents: "encodedropout" + parents: "encodeelu" name: "decode" data_layout: "data_parallel" fully_connected { From b7a3d4502eacf397b9c10d7fb67fe5d2571eda92 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Wed, 10 Jul 2019 10:38:46 -0700 Subject: [PATCH 119/634] fix exception handling warnings; minor exception cleanup --- include/lbann/utils/opencv.hpp | 8 ++++---- .../jag_utils/check_for_duplicate_samples.cpp | 8 ++------ model_zoo/jag_utils/check_images.cpp | 14 ++++---------- model_zoo/jag_utils/extract_random_samples.cpp | 8 ++++---- model_zoo/jag_utils/generate_corrupt_samples.cpp | 2 +- model_zoo/jag_utils/load_balance.cpp | 15 +++++++-------- 6 files changed, 22 insertions(+), 33 deletions(-) diff --git a/include/lbann/utils/opencv.hpp b/include/lbann/utils/opencv.hpp index f14208ca7b4..bf0b360316a 100644 --- a/include/lbann/utils/opencv.hpp +++ b/include/lbann/utils/opencv.hpp @@ -38,7 +38,7 @@ namespace utils { * Check whether data is an image. * Currently requires data to be a uint8_t CPUMat, with 3 dimensions, the first * (channel) being 1 or 3. - * + * * @param data The data to check. * @param dims The dimensions associated with data. */ @@ -48,7 +48,7 @@ inline bool check_is_image(const utils::type_erased_matrix& data, // Check if we can do the conversion. const auto& unused = data.template get(); (void) unused; - } catch (utils::bad_any_cast) { + } catch (const utils::bad_any_cast&) { return false; } if (dims.size() != 3 || (dims[0] != 1 && dims[0] != 3)) { @@ -62,7 +62,7 @@ inline bool check_is_image(const utils::type_erased_matrix& data, * Currently requires data to be a uint8_t CPUMat, with 3 dimensions, the first * (channel) being 1 or 3. * Also throws an error if OpenCV is not supported. - * + * * @param data The data to check. * @param dims The dimensions associated with data. */ @@ -72,7 +72,7 @@ inline void assert_is_image(const utils::type_erased_matrix& data, // Check if we can do the conversion. 
const auto& unused = data.template get(); (void) unused; - } catch (utils::bad_any_cast) { + } catch (const utils::bad_any_cast&) { LBANN_ERROR("Data is not an image: not uint8_t."); } if (dims.size() != 3 || (dims[0] != 1 && dims[0] != 3)) { diff --git a/model_zoo/jag_utils/check_for_duplicate_samples.cpp b/model_zoo/jag_utils/check_for_duplicate_samples.cpp index fc33898a401..0cbf170d36a 100644 --- a/model_zoo/jag_utils/check_for_duplicate_samples.cpp +++ b/model_zoo/jag_utils/check_for_duplicate_samples.cpp @@ -97,7 +97,7 @@ int main(int argc, char *argv[]) { std::vector cnames; try { conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", cnames); - } catch (std::exception e) { + } catch (const std::exception&) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: hdf5_group_list_child_names failed; " + files[j]); } @@ -137,10 +137,7 @@ int main(int argc, char *argv[]) { testme.insert(the_test); } } - } catch (exception const &e) { - El::ReportException(e); - return EXIT_FAILURE; - } catch (std::exception const &e) { + } catch (const std::exception& e) { El::ReportException(e); return EXIT_FAILURE; } @@ -156,4 +153,3 @@ void get_input_names(std::unordered_set &s) { s.insert("shape_model_initial_modes:(2,1)"); s.insert("shape_model_initial_modes:(1,0)"); } - diff --git a/model_zoo/jag_utils/check_images.cpp b/model_zoo/jag_utils/check_images.cpp index 436df059b7e..1b31eb7f8b9 100644 --- a/model_zoo/jag_utils/check_images.cpp +++ b/model_zoo/jag_utils/check_images.cpp @@ -85,9 +85,6 @@ int main(int argc, char *argv[]) { if (h % 10 == 0) std::cout << rank << " :: processed " << h << " files\n"; try { hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( files[j] ); - } catch (std::exception e) { - std::cerr << rank << " :: exception hdf5_open_file_for_read: " << files[j] << "\n"; - continue; } catch (...) 
{ std::cerr << rank << " :: exception hdf5_open_file_for_read: " << files[j] << "\n"; continue; @@ -96,7 +93,7 @@ int main(int argc, char *argv[]) { std::vector cnames; try { conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", cnames); - } catch (std::exception e) { + } catch (const std::exception&) { std::cerr << rank << " :: exception hdf5_group_list_child_names: " << files[j] << "\n"; continue; } @@ -106,7 +103,7 @@ int main(int argc, char *argv[]) { key = "/" + cnames[i] + "/performance/success"; try { conduit::relay::io::hdf5_read(hdf5_file_hnd, key, n_ok); - } catch (exception const &e) { + } catch (const exception& e) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: caught exception reading success flag for child " + std::to_string(i) + " of " + std::to_string(cnames.size()) + "; " + e.what()); } int success = n_ok.to_int64(); @@ -116,17 +113,14 @@ int main(int argc, char *argv[]) { std::vector image_names; try { conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, key, image_names); - } catch (std::exception const &e) { + } catch (const std::exception&) { std::cerr << rank << " :: exception :hdf5_group_list_child_names for images: " << files[j] << "\n"; continue; } } } } - } catch (exception const &e) { - El::ReportException(e); - return EXIT_FAILURE; - } catch (std::exception const &e) { + } catch (const std::exception& e) { El::ReportException(e); return EXIT_FAILURE; } diff --git a/model_zoo/jag_utils/extract_random_samples.cpp b/model_zoo/jag_utils/extract_random_samples.cpp index ef636db2741..de0f5115796 100644 --- a/model_zoo/jag_utils/extract_random_samples.cpp +++ b/model_zoo/jag_utils/extract_random_samples.cpp @@ -148,11 +148,11 @@ int main(int argc, char *argv[]) { extract_samples(comm.get(), rank, np, conduit_filenames, samples); - } catch (exception& e) { + } catch (const exception& e) { std::cerr << "\n\n" << rank << " ::::: caught exception, outer try/catch: " << e.what() << "\n\n"; El::ReportException(e); return EXIT_FAILURE; - } catch (std::exception& e) { + } catch (const std::exception& e) { El::ReportException(e); return EXIT_FAILURE; } @@ -413,7 +413,7 @@ std::cerr << rank << " samples.size: " << samples.size() << " np: " << np << "\n try { conduit::relay::io::hdf5_close_file( hdf5_file_hnd ); - } catch (exception e) { + } catch (const exception& e) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: exception hdf5_close_file; " + filenames[j] + "; " + e.what()); } @@ -432,7 +432,7 @@ std::cerr << rank << " samples.size: " << samples.size() << " np: " << np << "\n << "_" << file_id++ << ".bundle"; try { conduit::relay::io::save(save_me, fn.str(), "hdf5"); - } catch (exception e) { + } catch (const exception& e) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: exception conduit::relay::save(); what: " + e.what()); } } diff --git a/model_zoo/jag_utils/generate_corrupt_samples.cpp b/model_zoo/jag_utils/generate_corrupt_samples.cpp index e145e1cfb40..524f3de7d3f 100644 --- a/model_zoo/jag_utils/generate_corrupt_samples.cpp +++ b/model_zoo/jag_utils/generate_corrupt_samples.cpp @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) { std::vector cnames; try { hndl.list_child_names(cnames); - } catch (std::exception e) { + } catch (const std::exception&) { err << "list_child_names failed for this file: " << files[j]; LBANN_ERROR(err.str()); } diff --git a/model_zoo/jag_utils/load_balance.cpp 
b/model_zoo/jag_utils/load_balance.cpp index c025c918bce..df0372198d0 100644 --- a/model_zoo/jag_utils/load_balance.cpp +++ b/model_zoo/jag_utils/load_balance.cpp @@ -135,7 +135,7 @@ int main(int argc, char *argv[]) { try { hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read( files[j].c_str() ); - } catch (std::exception e) { + } catch (const std::exception&) { std::cerr << rank << " :: exception hdf5_open_file_for_read: " << files[j] << "\n"; continue; } @@ -143,7 +143,7 @@ int main(int argc, char *argv[]) { std::vector cnames; try { conduit::relay::io::hdf5_group_list_child_names(hdf5_file_hnd, "/", cnames); - } catch (std::exception e) { + } catch (const std::exception&) { std::cerr << rank << " :: exception hdf5_group_list_child_names; " << files[j] << "\n"; continue; } @@ -154,7 +154,7 @@ int main(int argc, char *argv[]) { key = "/" + cnames[i] + "/performance/success"; try { conduit::relay::io::hdf5_read(hdf5_file_hnd, key, n_ok); - } catch (std::exception e) { + } catch (const std::exception&) { std::cerr << rank << " :: exception reading success flag: " << files[j] << "\n"; continue; } @@ -166,7 +166,7 @@ int main(int argc, char *argv[]) { conduit::relay::io::hdf5_read(hdf5_file_hnd, key, node); save_me["/" + cnames[i]] = node; - } catch (std::exception e) { + } catch (const std::exception&) { throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: rank " + std::to_string(rank) + " :: " + "exception reading sample: " + cnames[i] + " which is " + std::to_string(i) + " of " + std::to_string(cnames[i].size()) + "; " + files[j]); } @@ -174,7 +174,7 @@ int main(int argc, char *argv[]) { if (sample_count == samples_per_file) { try { conduit::relay::io::save(save_me, output_fn, "hdf5"); - } catch (exception const &e) { + } catch (const std::exception& e) { std::cerr << rank << " :: exception: failed to save conduit node to disk; what: " << e.what() << "\n"; continue; } catch (...) { @@ -195,14 +195,14 @@ int main(int argc, char *argv[]) { if (sample_count) { try { conduit::relay::io::save(save_me, output_fn, "hdf5"); - } catch (exception const &e) { + } catch (exception const& e) { std::cerr << rank << " :: exception: failed to save conduit node to disk; what: " << e.what() << "\n"; } catch (...) { std::cerr << rank << " :: exception: failed to save conduit node to disk; FINAL FILE\n"; } } - } catch (std::exception const &e) { + } catch (std::exception const& e) { El::ReportException(e); return EXIT_FAILURE; } @@ -210,4 +210,3 @@ int main(int argc, char *argv[]) { // Clean up return EXIT_SUCCESS; } - From 79a6f6a666ce344a4005b7730614fcac5aab3c93 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Thu, 11 Jul 2019 15:55:43 -0700 Subject: [PATCH 120/634] removed: void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node &node, std::vector &data) because it is no longer needed ( --- include/lbann/data_readers/data_reader_image.hpp | 1 - src/data_readers/data_reader_image.cpp | 10 ---------- 2 files changed, 11 deletions(-) diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index 3ac80fcadb2..3c9095af07c 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -115,7 +115,6 @@ class image_data_reader : public generic_data_reader { int m_num_labels; ///< number of labels void load_conduit_node_from_file(int data_id, conduit::Node &node); - void load_conduit_node_from_file(int data_id, conduit::Node &node, std::vector &data); }; diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index f8a3071bc11..bff16dbd652 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -238,16 +238,6 @@ std::vector image_data_reader::get_image_list_of_cu return ret; } - -void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node &node, std::vector &data) { - node.reset(); - const std::string filename = get_file_dir() + m_image_list[data_id].first; - int label = m_image_list[data_id].second; - read_raw_data(filename, data); - node[LBANN_DATA_ID_STR(data_id) + "/label"].set(label); - node[LBANN_DATA_ID_STR(data_id) + "/buffer"].set(data); - node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"] = data.size(); -} void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node &node) { node.reset(); const std::string filename = get_file_dir() + m_image_list[data_id].first; From 24420be064e1dc5ce61e010cf9204ff933dea11e Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 12 Jul 2019 11:00:23 -0700 Subject: [PATCH 121/634] Moving "check gradients" callback to end of test phase. (#1108) This allows it to be used together with the "check metric" callback. --- .../callbacks/callback_check_gradients.hpp | 21 ++-- src/callbacks/callback_check_gradients.cpp | 97 +++++++++++++------ 2 files changed, 76 insertions(+), 42 deletions(-) diff --git a/include/lbann/callbacks/callback_check_gradients.hpp b/include/lbann/callbacks/callback_check_gradients.hpp index 8433a00d5f1..fb5d531ff83 100644 --- a/include/lbann/callbacks/callback_check_gradients.hpp +++ b/include/lbann/callbacks/callback_check_gradients.hpp @@ -31,10 +31,11 @@ namespace lbann { -/** Gradient checking callback. - * Gradient checking is performed at the beginning of the test - * phase. Using a fourth-order finite difference scheme, a numerical - * partial derivative is computed for every weight parameter. If the +/** @brief Gradient checking callback. + * + * Gradient checking is performed at the end of the test phase. Using + * a fourth-order finite difference scheme, a numerical partial + * derivative is computed for every weight parameter. If the * numerical derivative differs signifcantly from the analytical * derivative computed during backprop, the gradient check has * failed. @@ -42,10 +43,10 @@ namespace lbann { class lbann_callback_check_gradients : public lbann_callback { public: - /** Constructor. 
+ /** * @param step_size Step size for numerical * differentiation (with a step size of - zero, the step size is chosen to + zero, the step size is estimated to * minimize the numerical error). * @param verbose Whether to print results for each * parameter. @@ -58,15 +59,9 @@ class lbann_callback_check_gradients : public lbann_callback { lbann_callback_check_gradients* copy() const override { return new lbann_callback_check_gradients(*this); } - void on_test_begin(model *m) override; + void on_test_end(model *m) override; std::string name() const override { return "check gradients"; } - /** Compute objective function value. - * It is assumed that input data has already been loaded into the - * activations of the first layer. - */ - DataType compute_objective_function(model *m); - private: /** Step size for numerical differentiation. */ diff --git a/src/callbacks/callback_check_gradients.cpp b/src/callbacks/callback_check_gradients.cpp index 9f133f467da..266a32b3ddb 100644 --- a/src/callbacks/callback_check_gradients.cpp +++ b/src/callbacks/callback_check_gradients.cpp @@ -25,9 +25,36 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_check_gradients.hpp" +#include "lbann/layers/io/input/generic_input_layer.hpp" +#include "lbann/data_readers/data_reader.hpp" namespace lbann { +namespace { + +/** @details Forward prop is applied to all layers, except input + * layers. It is assumed that input layers have already loaded data. + */ +DataType compute_objective_function(model& m) { + + // Forward prop, skipping input layers + for (auto&& l : m.get_layers()) { + if (dynamic_cast<generic_input_layer*>(l) == nullptr) { + l->forward_prop(); + } + } + + // Get objective function value + auto&& obj = m.get_objective_function(); + const auto mode = m.get_execution_mode(); + const auto mini_batch_size = m.get_current_mini_batch_size(); + obj->start_evaluation(mode, mini_batch_size); + return obj->finish_evaluation(mode, mini_batch_size); + +} + +} // namespace + lbann_callback_check_gradients ::lbann_callback_check_gradients(DataType step_size, bool verbose, @@ -36,21 +63,32 @@ lbann_callback_check_gradients m_verbose(verbose), m_error_on_failure(error_on_failure) {} -void lbann_callback_check_gradients::on_test_begin(model *m) { +void lbann_callback_check_gradients::on_test_end(model *m) { - // Get model members + // Get objects from model lbann_comm *comm = m->get_comm(); - const std::vector<Layer*>& layers = m->get_layers(); + auto mode = m->get_execution_mode(); + const auto& layers = m->get_layers(); - // Initialize network for testing + // Reset statistics and gradients + m->get_objective_function()->reset_statistics(mode); + for (auto&& met : m->get_metrics()) { + met->reset_statistics(mode); + } for (auto&& w : m->get_weights()) { auto&& opt = w->get_optimizer(); if (opt != nullptr) { opt->clear_gradient(); } } - layers[0]->forward_prop(); + + // Load data in input layers + for (auto&& l : m->get_layers()) { + if (dynamic_cast<generic_input_layer*>(l) != nullptr) { + l->forward_prop(); + } + } // Compute objective function - const DataType objective = compute_objective_function(m); + const DataType objective = compute_objective_function(*m); // Choose finite difference step // Note: Consider a central difference scheme: @@ -80,11 +118,11 @@ void lbann_callback_check_gradients::on_test_begin(model *m) { // Print objective function value if (comm->am_world_master()) { std::cout << "--------------------------------------------------------------------------------" << std::endl
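  // Sketch of the fourth-order central difference named in the header
  // comment (it matches the f_2h/f_h/f_nh/f_n2h evaluations below):
  //   df/dw ~= ( f(w-2h) - 8 f(w-h) + 8 f(w+h) - f(w+2h) ) / (12 h)
  // Its truncation error is O(h^4), while floating-point roundoff
  // contributes roughly eps/h, which is why a nonzero step size is
  // chosen to balance the two error sources.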
- << "Gradient checking..." << std::endl - << " Objective function value = " << objective << std::endl - << " Step size = " << step_size << std::endl - << " Expected gradient error = " << expected_error << std::endl; + std::cout << "----------------------------------------------------------------\n" + << "Gradient checking...\n" + << " Objective function value = " << objective << "\n" + << " Step size = " << step_size << "\n" + << " Expected gradient error = " << expected_error << "\n"; } for (weights *w : m->get_weights()) { @@ -118,13 +156,13 @@ void lbann_callback_check_gradients::on_test_begin(model *m) { // Note: matrix entry is reset after computing objective // function values w->set_value(initial_weight + 2 * step_size, row, col); - const DataType f_2h = compute_objective_function(m); + const DataType f_2h = compute_objective_function(*m); w->set_value(initial_weight + step_size, row, col); - const DataType f_h = compute_objective_function(m); + const DataType f_h = compute_objective_function(*m); w->set_value(initial_weight - step_size, row, col); - const DataType f_nh = compute_objective_function(m); + const DataType f_nh = compute_objective_function(*m); w->set_value(initial_weight - 2 * step_size, row, col); - const DataType f_n2h = compute_objective_function(m); + const DataType f_n2h = compute_objective_function(*m); w->set_value(initial_weight, row, col); // Compute relative error in gradient. @@ -168,23 +206,24 @@ void lbann_callback_check_gradients::on_test_begin(model *m) { } } - if (comm->am_world_master()) { - std::cout << "--------------------------------------------------------------------------------" << std::endl; + std::cout << "----------------------------------------------------------------\n"; } -} - -DataType lbann_callback_check_gradients::compute_objective_function(model *m) { - const std::vector& layers = m->get_layers(); - objective_function* obj_fn = m->get_objective_function(); - for (size_t l = 1; l < layers.size(); l++) { - layers[l]->forward_prop(); + // Clean up + /// @todo tym: I'm not sure if data readers are properly reset + for (auto&& l : m->get_layers()) { + auto&& input = dynamic_cast(l); + if (input != nullptr) { + auto&& reader = input->get_data_reader(mode); + reader->set_initial_position(); + } } - obj_fn->start_evaluation(m->get_execution_mode(), - m->get_current_mini_batch_size()); - return obj_fn->finish_evaluation(m->get_execution_mode(), - m->get_current_mini_batch_size()); + m->get_objective_function()->reset_statistics(mode); + for (auto&& met : m->get_metrics()) { + met->reset_statistics(mode); + } + } -} // namespace lbann +} // namespace lbann From 969d17cbe4c046f68f50e4f04d89590dd2c2cfdd Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Sun, 14 Jul 2019 08:51:31 -0700 Subject: [PATCH 122/634] Fuse batchnorm stats allreduces in FP/BP. --- .../regularizers/batch_normalization.hpp | 84 ++++++++++++------- .../regularizers/batch_normalization.cpp | 43 +++++----- .../regularizers/batch_normalization.cu | 43 +++++----- 3 files changed, 96 insertions(+), 74 deletions(-) diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index 2f896fcf081..b439c0e7ded 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -71,14 +71,24 @@ class batch_normalization_layer : public regularizer_layer { */ std::unordered_map m_num_per_sum_cache; - /** Current minibatch means. 
*/ - std::unique_ptr m_mean; - /** Current minibatch standard deviations. */ - std::unique_ptr m_var; - /** Gradient w.r.t. means. */ - std::unique_ptr m_mean_gradient; - /** Gradient w.r.t. standard deviations. */ - std::unique_ptr m_var_gradient; + /** @brief Current minibatch means and standard deviations. + * + * These are fused for performance when doing non-local batchnorm. + */ + std::unique_ptr m_mean_and_var; + /** View of current mini-batch means. */ + std::unique_ptr m_mean_v; + /** View of current mini-batch standard deviations. */ + std::unique_ptr m_var_v; + /** @brief Gradients w.r.t. means and standard deviations. + * + * These are fused for performance when doing non-local batchnorm. + */ + std::unique_ptr m_mean_and_var_gradient; + /** View of gradient w.r.t. means. */ + std::unique_ptr m_mean_gradient_v; + /** View of gradient w.r.t. standard deviations. */ + std::unique_ptr m_var_gradient_v; /** Gradient w.r.t. scaling terms. */ std::unique_ptr m_scale_gradient; /** Gradient w.r.t. bias terms. */ @@ -116,12 +126,16 @@ class batch_normalization_layer : public regularizer_layer { m_epsilon(other.m_epsilon), m_stats_aggregation(other.m_stats_aggregation), m_num_per_sum_cache(other.m_num_per_sum_cache), - m_mean(other.m_mean ? other.m_mean->Copy() : nullptr), - m_var(other.m_var ? other.m_var->Copy() : nullptr), - m_mean_gradient(other.m_mean_gradient ? - other.m_mean_gradient->Copy() : nullptr), - m_var_gradient(other.m_var_gradient ? - other.m_var_gradient->Copy() : nullptr), + m_mean_and_var(other.m_mean_and_var ? + other.m_mean_and_var->Copy() : nullptr), + m_mean_v(other.m_mean_v ? other.m_mean_v->Copy() : nullptr), + m_var_v(other.m_var_v ? other.m_var_v->Copy() : nullptr), + m_mean_and_var_gradient(other.m_mean_and_var_gradient ? + other.m_mean_and_var_gradient->Copy() : nullptr), + m_mean_gradient_v(other.m_mean_gradient_v ? + other.m_mean_gradient_v->Copy() : nullptr), + m_var_gradient_v(other.m_var_gradient_v ? + other.m_var_gradient_v->Copy() : nullptr), m_scale_gradient(other.m_scale_gradient ? other.m_scale_gradient->Copy() : nullptr), m_bias_gradient(other.m_bias_gradient ? @@ -135,12 +149,18 @@ class batch_normalization_layer : public regularizer_layer { m_num_per_sum_cache = other.m_num_per_sum_cache; // Deep copy matrices - m_mean.reset(other.m_mean ? other.m_mean->Copy() : nullptr); - m_var.reset(other.m_var ? other.m_var->Copy() : nullptr); - m_mean_gradient.reset(other.m_mean_gradient ? - other.m_mean_gradient->Copy() : nullptr); - m_var_gradient.reset(other.m_var_gradient ? - other.m_var_gradient->Copy() : nullptr); + m_mean_and_var.reset(other.m_mean_and_var ? + other.m_mean_and_var->Copy() : nullptr); + m_mean_v.reset(other.m_mean_v ? + other.m_mean_v->Copy() : nullptr); + m_var_v.reset(other.m_var_v ? + other.m_var_v->Copy() : nullptr); + m_mean_and_var_gradient.reset(other.m_mean_and_var_gradient ? + other.m_mean_and_var_gradient->Copy() : nullptr); + m_mean_gradient_v.reset(other.m_mean_gradient_v ? + other.m_mean_gradient_v->Copy() : nullptr); + m_var_gradient_v.reset(other.m_var_gradient_v ? + other.m_var_gradient_v->Copy() : nullptr); m_scale_gradient.reset(other.m_scale_gradient ? other.m_scale_gradient->Copy() : nullptr); m_bias_gradient.reset(other.m_bias_gradient ? 
@@ -176,10 +196,12 @@ class batch_normalization_layer : public regularizer_layer { void setup_matrices(const El::Grid& grid) override { regularizer_layer::setup_matrices(grid); - m_mean.reset(new StarMat(grid)); - m_var.reset(new StarMat(grid)); - m_mean_gradient.reset(new StarMat(grid)); - m_var_gradient.reset(new StarMat(grid)); + m_mean_and_var.reset(new StarMat(grid)); + m_mean_v.reset(new StarMat(grid)); + m_var_v.reset(new StarMat(grid)); + m_mean_and_var_gradient.reset(new StarMat(grid)); + m_mean_gradient_v.reset(new StarMat(grid)); + m_var_gradient_v.reset(new StarMat(grid)); m_scale_gradient.reset(new StarMat(grid)); m_bias_gradient.reset(new StarMat(grid)); } @@ -285,13 +307,19 @@ class batch_normalization_layer : public regularizer_layer { } // Initialize matrices - El::Zeros(*m_mean, num_channels, 1); - El::Zeros(*m_var, num_channels, 1); - El::Zeros(*m_mean_gradient, num_channels, 1); - El::Zeros(*m_var_gradient, num_channels, 1); + El::Zeros(*m_mean_and_var, num_channels, 2); + El::Zeros(*m_mean_and_var_gradient, num_channels, 2); El::Zeros(*m_scale_gradient, num_channels, 1); El::Zeros(*m_bias_gradient, num_channels, 1); + // Initialize views. + El::View(*m_mean_v, *m_mean_and_var, El::ALL, El::IR(0, 1)); + El::View(*m_var_v, *m_mean_and_var, El::ALL, El::IR(1, 2)); + El::View(*m_mean_gradient_v, *m_mean_and_var_gradient, + El::ALL, El::IR(0, 1)); + El::View(*m_var_gradient_v, *m_mean_and_var_gradient, + El::ALL, El::IR(1, 2)); + // Initialize freeze state for (auto&& w : this->m_weights) { if (m_frozen) { diff --git a/src/layers/regularizers/batch_normalization.cpp b/src/layers/regularizers/batch_normalization.cpp index 5d1535f01c7..051617e6850 100644 --- a/src/layers/regularizers/batch_normalization.cpp +++ b/src/layers/regularizers/batch_normalization.cpp @@ -50,8 +50,8 @@ void batch_normalization_layer::fp_ if (is_training) { // Local matrices - auto& local_mean = m_mean->Matrix(); - auto& local_var = m_var->Matrix(); + auto& local_mean = m_mean_v->Matrix(); + auto& local_var = m_var_v->Matrix(); auto& local_running_mean = this->m_weights[2]->get_values().Matrix(); auto& local_running_var = this->m_weights[3]->get_values().Matrix(); @@ -75,13 +75,14 @@ void batch_normalization_layer::fp_ El::Int num_per_sum; switch (m_stats_aggregation) { case batch_normalization_stats_aggregation::global: - m_comm->allreduce(*m_mean, m_mean->RedundantComm(), El::mpi::SUM); - m_comm->allreduce(*m_var, m_var->RedundantComm(), El::mpi::SUM); + // Allreduce on fused buffer. + m_comm->allreduce(*m_mean_and_var, m_mean_and_var->RedundantComm(), + El::mpi::SUM); num_per_sum = channel_size * width; break; case batch_normalization_stats_aggregation::node_local: - m_comm->allreduce(*m_mean, m_comm->get_node_comm(), El::mpi::SUM); - m_comm->allreduce(*m_var, m_comm->get_node_comm(), El::mpi::SUM); + // Allreduce on fused buffer. + m_comm->allreduce(*m_mean_and_var, m_comm->get_node_comm(), El::mpi::SUM); if (m_num_per_sum_cache.count(width) == 0) { num_per_sum = channel_size * local_width; num_per_sum = m_comm->allreduce(num_per_sum, m_comm->get_node_comm()); @@ -122,10 +123,10 @@ void batch_normalization_layer::fp_ const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix(); const auto& local_bias = this->m_weights[1]->get_values().LockedMatrix(); const auto& local_mean = (is_training ? - m_mean->LockedMatrix() : + m_mean_v->LockedMatrix() : this->m_weights[2]->get_values().LockedMatrix()); const auto& local_var = (is_training ? 
- m_var->LockedMatrix() : + m_var_v->LockedMatrix() : this->m_weights[3]->get_values().LockedMatrix()); // Iterate through channels @@ -163,17 +164,17 @@ void batch_normalization_layer::bp_ // Matrices const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix(); const auto& local_mean = (is_training ? - m_mean->LockedMatrix() : + m_mean_v->LockedMatrix() : this->m_weights[2]->get_values().LockedMatrix()); const auto& local_var = (is_training ? - m_var->LockedMatrix() : + m_var_v->LockedMatrix() : this->m_weights[3]->get_values().LockedMatrix()); const auto& input = get_prev_activations(); const auto& local_input = input.LockedMatrix(); const auto& local_gradient_wrt_output = get_local_prev_error_signals(); auto& local_gradient_wrt_input = get_local_error_signals(); - auto& local_mean_gradient = m_mean_gradient->Matrix(); - auto& local_var_gradient = m_var_gradient->Matrix(); + auto& local_mean_gradient = m_mean_gradient_v->Matrix(); + auto& local_var_gradient = m_var_gradient_v->Matrix(); auto& local_scale_gradient = m_scale_gradient->Matrix(); auto& local_bias_gradient = m_bias_gradient->Matrix(); @@ -225,23 +226,19 @@ void batch_normalization_layer::bp_ // Accumulate gradients if (is_training) { if (m_stats_aggregation == batch_normalization_stats_aggregation::global) { - m_comm->allreduce(*m_mean_gradient, - m_mean_gradient->RedundantComm(), - El::mpi::SUM); - m_comm->allreduce(*m_var_gradient, - m_var_gradient->RedundantComm(), + // Allreduce on fused buffer. + m_comm->allreduce(*m_mean_and_var_gradient, + m_mean_and_var_gradient->RedundantComm(), El::mpi::SUM); } else if (m_stats_aggregation == batch_normalization_stats_aggregation::node_local) { - m_comm->allreduce(*m_mean_gradient, - m_comm->get_node_comm(), - El::mpi::SUM); - m_comm->allreduce(*m_var_gradient, + // Allreduce on fused buffer. + m_comm->allreduce(*m_mean_and_var_gradient, m_comm->get_node_comm(), El::mpi::SUM); } } else { - El::Zero(*m_mean_gradient); - El::Zero(*m_var_gradient); + // Zero fused buffer. + El::Zero(*m_mean_and_var_gradient); } optimizer* scale_optimizer = m_weights[0]->get_optimizer(); if (scale_optimizer != nullptr) { diff --git a/src/layers/regularizers/batch_normalization.cu b/src/layers/regularizers/batch_normalization.cu index ae679ec1c64..b9ab2cedbfd 100644 --- a/src/layers/regularizers/batch_normalization.cu +++ b/src/layers/regularizers/batch_normalization.cu @@ -318,8 +318,8 @@ void batch_normalization_layer::fp_ if (is_training) { // Local matrices - auto& local_mean = m_mean->Matrix(); - auto& local_var = m_var->Matrix(); + auto& local_mean = m_mean_v->Matrix(); + auto& local_var = m_var_v->Matrix(); auto& local_running_mean = this->m_weights[2]->get_values().Matrix(); auto& local_running_var = this->m_weights[3]->get_values().Matrix(); @@ -341,13 +341,14 @@ void batch_normalization_layer::fp_ El::Int num_per_sum; switch (m_stats_aggregation) { case batch_normalization_stats_aggregation::global: - m_comm->allreduce(*m_mean, m_mean->RedundantComm(), El::mpi::SUM); - m_comm->allreduce(*m_var, m_var->RedundantComm(), El::mpi::SUM); + // Allreduce on fused buffer. + m_comm->allreduce(*m_mean_and_var, m_mean_and_var->RedundantComm(), + El::mpi::SUM); num_per_sum = channel_size * width; break; case batch_normalization_stats_aggregation::node_local: - m_comm->allreduce(*m_mean, m_comm->get_node_comm(), El::mpi::SUM); - m_comm->allreduce(*m_var, m_comm->get_node_comm(), El::mpi::SUM); + // Allreduce on fused buffer. 
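+      // (The means live in column 0 and the variances in column 1 of
+      // m_mean_and_var -- see the m_mean_v/m_var_v column views set up in
+      // the header -- so this one allreduce on the num_channels x 2 buffer
+      // moves both statistics in a single message.)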
+ m_comm->allreduce(*m_mean_and_var, m_comm->get_node_comm(), El::mpi::SUM); if (m_num_per_sum_cache.count(width) == 0) { num_per_sum = channel_size * local_width; num_per_sum = m_comm->allreduce(num_per_sum, m_comm->get_node_comm()); @@ -382,10 +383,10 @@ void batch_normalization_layer::fp_ const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix(); const auto& local_bias = this->m_weights[1]->get_values().LockedMatrix(); const auto& local_mean = (is_training ? - m_mean->LockedMatrix() : + m_mean_v->LockedMatrix() : this->m_weights[2]->get_values().LockedMatrix()); const auto& local_var = (is_training ? - m_var->LockedMatrix() : + m_var_v->LockedMatrix() : this->m_weights[3]->get_values().LockedMatrix()); if (!local_input.IsEmpty()) { const El::Int block_size = 256; @@ -416,17 +417,17 @@ void batch_normalization_layer::bp_ // Matrices const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix(); const auto& local_mean = (is_training ? - m_mean->LockedMatrix() : + m_mean_v->LockedMatrix() : this->m_weights[2]->get_values().LockedMatrix()); const auto& local_var = (is_training ? - m_var->LockedMatrix() : + m_var_v->LockedMatrix() : this->m_weights[3]->get_values().LockedMatrix()); const auto& input = get_prev_activations(); const auto& local_input = input.LockedMatrix(); const auto& local_gradient_wrt_output = get_local_prev_error_signals(); auto& local_gradient_wrt_input = get_local_error_signals(); - auto& local_mean_gradient = m_mean_gradient->Matrix(); - auto& local_var_gradient = m_var_gradient->Matrix(); + auto& local_mean_gradient = m_mean_gradient_v->Matrix(); + auto& local_var_gradient = m_var_gradient_v->Matrix(); auto& local_scale_gradient = m_scale_gradient->Matrix(); auto& local_bias_gradient = m_bias_gradient->Matrix(); @@ -464,23 +465,19 @@ void batch_normalization_layer::bp_ // Accumulate gradients if (is_training) { if (m_stats_aggregation == batch_normalization_stats_aggregation::global) { - m_comm->allreduce(*m_mean_gradient, - m_mean_gradient->RedundantComm(), - El::mpi::SUM); - m_comm->allreduce(*m_var_gradient, - m_var_gradient->RedundantComm(), + // Allreduce on fused buffer. + m_comm->allreduce(*m_mean_and_var_gradient, + m_mean_and_var_gradient->RedundantComm(), El::mpi::SUM); } else if (m_stats_aggregation == batch_normalization_stats_aggregation::node_local) { - m_comm->allreduce(*m_mean_gradient, - m_comm->get_node_comm(), - El::mpi::SUM); - m_comm->allreduce(*m_var_gradient, + // Allreduce on fused buffer. + m_comm->allreduce(*m_mean_and_var_gradient, m_comm->get_node_comm(), El::mpi::SUM); } } else { - El::Zero(*m_mean_gradient); - El::Zero(*m_var_gradient); + // Zero fused buffer. + El::Zero(*m_mean_and_var_gradient); } optimizer* scale_optimizer = m_weights[0]->get_optimizer(); if (scale_optimizer != nullptr) { From 9ad8e889c032c9232912d64c178d08e82c0f5f41 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 15 Jul 2019 11:09:56 -0700 Subject: [PATCH 123/634] LSTM module takes previous state as argument (#1110) * Fixing incorrect weight initialization in LSTM module. * LSTM module takes previous LSTM state as argument. --- python/lbann/modules.py | 59 +++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/python/lbann/modules.py b/python/lbann/modules.py index dd674f78e6c..f9ea6fca7ad 100644 --- a/python/lbann/modules.py +++ b/python/lbann/modules.py @@ -18,32 +18,32 @@ def _str_list(l): class Module: """Base class for neural network modules. 
- A module is a pattern of operations that may be applied to a set - of input layers, obtaining a set of output layers. + A module is a pattern of layers that can be added to a layer + graph, possibly multiple times. The pattern typically takes a set + of input layers and obtains a set of output layers. """ def __init__(self): pass - def forward(self, input): - """Apply module pattern to `input`. + def forward(self, *args, **kwargs): + """Apply module pattern. - `input` is a `Layer` or a sequence of `Layer`s. The module - pattern is added to the layer graph and the output layer(s) - are returned. + A module pattern typically takes a set of `Layer`s as input + and returns a set of `Layer`s. """ # Should be overridden in all sub-classes raise NotImplementedError - def __call__(self, input): + def __call__(self, *args, **kwargs): """Apply module pattern to `input`. Syntactic sugar around `forward` function. """ - return self.forward(input) + return self.forward(*args, **kwargs) class FullyConnectedModule(Module): """Basic block for fully-connected neural networks. @@ -275,14 +275,6 @@ def __init__(self, size, bias = True, else 'lstmcell{0}'.format(LSTMCell.global_count)) self.data_layout = data_layout - # Initial state - self.last_output = lbann.Constant(value=0.0, num_neurons=str(size), - name=self.name + '_init_output', - data_layout=self.data_layout) - self.last_cell = lbann.Constant(value=0.0, num_neurons=str(size), - name=self.name + '_init_cell', - data_layout=self.data_layout) - # Weights self.weights = list(make_iterable(weights)) if len(self.weights) > 2: @@ -291,13 +283,13 @@ def __init__(self, size, bias = True, if len(self.weights) == 0: self.weights.append( lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), - max=-1/sqrt(self.size)), + max=1/sqrt(self.size)), name=self.name+'_matrix')) if len(self.weights) == 1: self.weights.append( lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), - max=-1/sqrt(self.size)), - name=self.name+'_bias')) + max=1/sqrt(self.size)), + name=self.name+'_bias')) # Linearity self.fc = FullyConnectedModule(4*size, bias=bias, @@ -305,17 +297,28 @@ def __init__(self, size, bias = True, name=self.name + '_fc', data_layout=self.data_layout) - def forward(self, x): - """Perform LSTM step. + def forward(self, x, prev_state): + """Apply LSTM step. - State from previous steps is used to compute output. + Args: + x (Layer): Input. + prev_state (tuple with two `Layer`s): State from previous + LSTM step. Comprised of LSTM output and cell state. + + Returns: + (Layer, (Layer, Layer)): The output and state (the output + and cell state). The state can be passed directly into + the next LSTM step.
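+
+        A minimal unrolling sketch (`lstm`, `xs`, and `size` are assumed
+        to be defined by the caller; the zero state mirrors the constants
+        this module previously built internally):
+
+            state = (lbann.Constant(value=0.0, num_neurons=str(size)),
+                     lbann.Constant(value=0.0, num_neurons=str(size)))
+            for x in xs:
+                y, state = lstm(x, state)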
""" self.step += 1 name = '{0}_step{1}'.format(self.name, self.step) + # Get output and cell state from previous step + prev_output, prev_cell = prev_state + # Apply linearity - input_concat = lbann.Concatenation([x, self.last_output], + input_concat = lbann.Concatenation([x, prev_output], name=name + '_input', data_layout=self.data_layout) fc = self.fc(input_concat) @@ -343,7 +346,7 @@ def forward(self, x): data_layout=self.data_layout) # Cell state - cell_forget = lbann.Multiply([f, self.last_cell], + cell_forget = lbann.Multiply([f, prev_cell], name=name + '_cell_forget', data_layout=self.data_layout) cell_input = lbann.Multiply([i, cell_update], @@ -358,7 +361,5 @@ def forward(self, x): output = lbann.Multiply([o, cell_act], name=name, data_layout=self.data_layout) - # Update state and return output - self.last_cell = cell - self.last_output = output - return output + # Return output and state + return output, (output, cell) From 715e0f04e0e96dab77dea618d736775f7ea57ccf Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 24 Jun 2019 13:09:09 -0700 Subject: [PATCH 124/634] Set up Corona --- bamboo/allocate_and_run.sh | 4 ++-- bamboo/common_python/tools.py | 8 +++---- bamboo/compiler_tests/test_compiler.py | 2 +- ...toencoder_imagenet_objective_functions.csv | 21 +++++++++++++++++++ ..._autoencoder_mnist_objective_functions.csv | 6 ++++++ .../corona/gcc7/expected_performance.csv | 5 +++++ bamboo/run.sh | 2 +- 7 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv create mode 100644 bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv create mode 100644 bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index 23a01bc1ad8..9054983fe52 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -31,7 +31,7 @@ if [ "${CLUSTER}" = 'pascal' ]; then fi if [ ${WEEKLY} -ne 0 ]; then - salloc -N16 -t 900 ./run.sh --weekly + salloc -N16 --partition=pbatch -t 900 ./run.sh --weekly if [ "${CLUSTER}" = 'catalyst' ]; then cd integration_tests python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=alexnet_clang6_results.xml @@ -40,5 +40,5 @@ if [ ${WEEKLY} -ne 0 ]; then cd .. 
fi else - salloc -N16 -t 900 ./run.sh + salloc -N16 --partition=pbatch -t 900 ./run.sh fi diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index e209f1e1c3e..365c8eafe32 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -65,7 +65,7 @@ def get_command(cluster, process_executable_existence(executable, skip_no_exe) # Determine scheduler - if cluster in ['catalyst', 'pascal']: + if cluster in ['catalyst', 'corona', 'pascal']: scheduler = 'slurm' elif cluster == 'ray': scheduler = 'lsf' @@ -255,14 +255,14 @@ def get_command(cluster, # Determine data file paths # If there is no regex match, then re.sub keeps the original string if data_filedir_default is not None: - if cluster in ['catalyst', 'pascal',]: + if cluster in ['catalyst', 'corona', 'pascal',]: # option_data_filedir = data_filedir_default # lscratchh, presumably pass # No need to pass in a parameter elif cluster == 'ray': option_data_filedir = ' --data_filedir=%s' % re.sub( '[a-z]scratch[a-z]', 'gscratchr', data_filedir_default) elif None not in data_file_parameters: - if cluster in ['catalyst', 'pascal']: + if cluster in ['catalyst', 'corona', 'pascal']: # option_data_filedir_train = data_filedir_train_default # option_data_filename_train = data_filename_train_default # option_data_filedir_test = data_filedir_test_default @@ -396,7 +396,7 @@ def get_default_exes(default_dirname, cluster): default_exes = {} default_exes['default'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if cluster in ['catalyst', 'pascal']: + if cluster in ['catalyst', 'corona', 'pascal']: # x86_cpu - catalyst # x86_gpu_pascal - pascal default_exes['clang6'] = exes['clang6'] diff --git a/bamboo/compiler_tests/test_compiler.py b/bamboo/compiler_tests/test_compiler.py index 5f637519901..0bc8ea24938 100644 --- a/bamboo/compiler_tests/test_compiler.py +++ b/bamboo/compiler_tests/test_compiler.py @@ -6,7 +6,7 @@ def test_compiler_build_script(cluster, dirname): - if cluster in ['pascal']: + if cluster in ['corona', 'pascal']: output_file_name = '%s/bamboo/compiler_tests/output/build_script_output.txt' % (dirname) error_file_name = '%s/bamboo/compiler_tests/error/build_script_error.txt' % (dirname) command = '%s/bamboo/compiler_tests/build_script.sh > %s 2> %s' % ( diff --git a/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv new file mode 100644 index 00000000000..d1fec964160 --- /dev/null +++ b/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv @@ -0,0 +1,21 @@ +Epoch_number, training_objective_function_nightly, training_objective_function_weekly +0, 0.983936, 0.608574 +1, 0.908194, 0.590008 +2, 0.900910, 0.587484 +3, 0.899583, 0.586305 +4, 0.897652, 0.585585 +5, 0.889670, 0.585036 +6, 0.890061, 0.584688 +7, 0.888348, 0.584348 +8, 0.888921, 0.584041 +9, 0.883034, 0.583865 +10, 0.888236, 0.583665 +11, 0.881798, 0.583521 +12, 0.884866, 0.583303 +13, 0.883757, 0.58328 +14, 0.881703, 0.5832 +15, 0.883718, 0.583134 +16, 0.875670, 0.583052 +17, 0.877554, 0.583039 +18, 0.882443, 0.582954 +19, 0.881577, 0.582936 diff --git a/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv new 
file mode 100644 index 00000000000..8bcf25bb71d --- /dev/null +++ b/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv @@ -0,0 +1,6 @@ +Epoch_number, training_objective_function +0, 0.207514 +1, 0.194710 +2, 0.193221 +3, 0.192864 +4, 0.192755 diff --git a/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv new file mode 100644 index 00000000000..42b575664c9 --- /dev/null +++ b/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv @@ -0,0 +1,5 @@ +Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy +alexnet_nightly, 55.00, 1.03, 1.90, 0.80, 0.21, 100.00 +alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +lenet_mnist, 385.00, 0.50, 2.00, 0.51, 0.80, 98.40 diff --git a/bamboo/run.sh b/bamboo/run.sh index 234e950137d..dac097ff0bd 100755 --- a/bamboo/run.sh +++ b/bamboo/run.sh @@ -5,7 +5,7 @@ CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') echo "run.sh CLUSTER=" echo $CLUSTER -if [ "${CLUSTER}" = 'catalyst' ]; then +if [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ]; then PYTHON=python fi From cb78c6d1d12dd940a5ec2c112964909529d62c8b Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 16 Jul 2019 09:46:12 -0700 Subject: [PATCH 125/634] Update time limits --- bamboo/allocate_and_run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index 9054983fe52..3e43bb23452 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -31,7 +31,7 @@ if [ "${CLUSTER}" = 'pascal' ]; then fi if [ ${WEEKLY} -ne 0 ]; then - salloc -N16 --partition=pbatch -t 900 ./run.sh --weekly + salloc -N16 --partition=pbatch -t 1440 ./run.sh --weekly if [ "${CLUSTER}" = 'catalyst' ]; then cd integration_tests python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=alexnet_clang6_results.xml @@ -40,5 +40,5 @@ if [ ${WEEKLY} -ne 0 ]; then cd .. 
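    # (the -t value passed to salloc above is in minutes, so 1440 allows
    # the weekly allocation to run for a full 24 hours)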
fi else - salloc -N16 --partition=pbatch -t 900 ./run.sh + salloc -N16 --partition=pbatch -t 1440 ./run.sh fi From 6c77d223e28a60b6120da614557545f75e1edc1b Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Wed, 12 Jun 2019 14:32:21 -0700 Subject: [PATCH 126/634] Add JAG reconstruction loss tests --- bamboo/common_python/tools.py | 8 +- .../prototext/jag_100M_metadata.prototext | 119 ------------------ .../test_unit_reconstruction_loss.py | 52 ++++++++ .../data_readers/data_reader_jag.prototext | 0 .../tests/model_jag_single_layer_ae.prototext | 19 ++- 5 files changed, 73 insertions(+), 125 deletions(-) delete mode 100644 bamboo/unit_tests/prototext/jag_100M_metadata.prototext create mode 100644 bamboo/unit_tests/test_unit_reconstruction_loss.py rename bamboo/unit_tests/prototext/jag_reader.prototext => model_zoo/data_readers/data_reader_jag.prototext (100%) rename bamboo/unit_tests/prototext/jag_single_layer_ae.prototext => model_zoo/tests/model_jag_single_layer_ae.prototext (76%) diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 365c8eafe32..7908189b29e 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -27,6 +27,7 @@ def get_command(cluster, data_reader_path=None, data_reader_percent=None, exit_after_setup=False, + metadata=None, mini_batch_size=None, model_folder=None, model_name=None, @@ -196,6 +197,7 @@ def get_command(cluster, option_data_reader = '' option_data_reader_percent = '' option_exit_after_setup = '' + option_metadata = '' option_mini_batch_size = '' option_model = '' option_num_epochs = '' @@ -313,6 +315,8 @@ def get_command(cluster, option_data_reader_percent = ' --data_reader_percent=%f' % data_reader_percent if exit_after_setup: option_exit_after_setup = ' --exit_after_setup' + if metadata is not None: + option_metadata = ' --metadata={d}/{m}'.format(d=dir_name, m=metadata) if mini_batch_size is not None: option_mini_batch_size = ' --mini_batch_size=%d' % mini_batch_size if num_epochs is not None: @@ -324,12 +328,12 @@ def get_command(cluster, if lbann_errors != []: print('lbann_errors={lbann_errors}.'.format(lbann_errors=lbann_errors)) raise Exception('Invalid Usage: ' + ' , '.join(lbann_errors)) - command_lbann = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % ( + command_lbann = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % ( executable, option_ckpt_dir, option_data_filedir, option_data_filedir_train, option_data_filename_train, option_data_filedir_test, option_data_filename_test, option_data_reader, option_data_reader_percent, - option_exit_after_setup, option_mini_batch_size, + option_exit_after_setup, option_metadata, option_mini_batch_size, option_model, option_num_epochs, option_optimizer, option_processes_per_model) diff --git a/bamboo/unit_tests/prototext/jag_100M_metadata.prototext b/bamboo/unit_tests/prototext/jag_100M_metadata.prototext deleted file mode 100644 index d76f3155959..00000000000 --- a/bamboo/unit_tests/prototext/jag_100M_metadata.prototext +++ /dev/null @@ -1,119 +0,0 @@ -######################################################################## -# The JAG normalization values were computed over the 10M + 1MA + 1MB random -# pulls from the 100M data set. The image normalization values were updated -# on 1/30/2019 using the per-channel average of the pixel values -# across all views. 
-# They are valid for the directories: -# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B) -# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) -# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B -# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) -######################################################################## - -data_set_metadata { - schema { - split_jag_image_channels: true - - # JAG_Image, JAG_Scalar, JAG_Input - independent: [ { pieces: [ JAG_Image, JAG_Scalar ] }, { pieces: [ JAG_Input ] } ] - dependent: [ { pieces: [ JAG_Input ] } ] - - image_prefix: "/outputs/images/" - - image_width: 64 - image_height: 64 - image_num_channels: 4 - - jag_image_keys: ["(0.0, 0.0)/0.0/emi", "(90.0, 0.0)/0.0/emi", "(90.0, 78.0)/0.0/emi"] - - scalar_prefix: "/outputs/scalars/" - - # An empty list indicates to use all - # The commented out variables are not on the Jim's original list but used in the numpy-based format - jag_scalar_keys: - [ "BWx", - "BT", - "tMAXt", # absent in Jim's list - "BWn", - "MAXpressure", - #"BAte", - #"MAXtion", - "tMAXpressure", - "BAt", # absent in Jim's list - "Yn", - "Ye", - "Yx", - #"tMAXte", # absent in Jim's list - #"BAtion", - #"MAXte", - #"tMAXtion", # absent in Jim's list - "BTx", - "MAXt", # absent in Jim's list - #"BTn", - "BApressure", - "tMINradius", - "MINradius" # absent in Jim's list - ] - - # When using all the keys without explicit selection, key filters can be used - # to explicitly exclude the particular variables with keys that matches a filter. - # 'jag_scalar_filters' and 'jag_input_filters' rely on exact key string matching. - # 'jag_scalar_prefix_filters' and 'jag_input_prefix_filters' define a filter as - # the pair of a prefix substring and the minimum key length. - # For example, with the example below, any key that has a length no shorter - # than 26 and starts with the substring "image_(" is excluded. 
- - jag_scalar_prefix_filters: [ { key_prefix: "image_(" min_len: 26} ] - jag_scalar_filters: [ "iBT" ] - - input_prefix: "/inputs/" - - jag_input_keys: ["shape_model_initial_modes:(4,3)", - "betti_prl15_trans_u", - "betti_prl15_trans_v", - "shape_model_initial_modes:(2,1)", - "shape_model_initial_modes:(1,0)"]; - } - - normalization { - jag_scalar_normalization_params: [ - { scale: 7.610738e+00 bias: -4.075375e-01 }, #BWx - { scale: 1.459875e+00 bias: -3.427656e+00 }, #BT - { scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXt - { scale: 4.375123e+01 bias: -1.593477e+00 }, #BWn - { scale: 1.685576e-06 bias: -5.330971e-01 }, #MAXpressure - #{ scale: 2.636422e-01 bias: -9.762907e-01 }, #BAte - #{ scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXtion - { scale: 1.430615e+00 bias: -3.351173e+00 }, #tMAXpressure - { scale: 2.636422e-01 bias: -9.762907e-01 }, #BAt - { scale: 7.154074e-18 bias: -1.864709e-02 }, #Yn - { scale: 3.166824e-03 bias: -1.864709e-02 }, #Ye - { scale: 2.102178e-02 bias: -3.071955e-01 }, #Yx - #{ scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXte - #{ scale: 2.636422e-01 bias: -9.762907e-01 }, #BAtion - #{ scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXte - #{ scale: 1.490713e+00 bias: -3.495498e+00 }, #tMAXtion - { scale: 1.346439e+00 bias: -3.118446e+00 }, #BTx - { scale: 2.419509e-01 bias: -9.853402e-01 }, #MAXt - #{ scale: 1.459875e+00 bias: -3.427656e+00 }, #BTn - { scale: 2.061877e-06 bias: -5.213394e-01 }, #BApressure - { scale: 1.392544e+00 bias: -3.239921e+00 }, #tMINradius - { scale: 6.266253e-02 bias: -1.384504e+00 } #MINradius - ] - - jag_input_normalization_params: [ - { scale: 1.666672e+00 bias: 5.000000e-01 }, #shape_model_initial_modes:(4,3) - { scale: 1.000002e+00 bias: -1.603483e-07 }, #betti_prl15_trans_u - { scale: 1.000001e+00 bias: -1.406672e-06 }, #betti_prl15_trans_v - { scale: 1.666675e+00 bias: 4.999992e-01 }, #shape_model_initial_modes:(2,1) - { scale: 1.666669e+00 bias: 5.000008e-01 } #shape_model_initial_modes:(1,0) - ] - - jag_image_normalization_params: [ - { scale: 2.9258502e+01 bias: 0.0e+00 }, # avg = 0.0341781 - { scale: 8.5826596e+02 bias: 0.0e+00 }, # avg = 0.00116514 - { scale: 1.0004872e+05 bias: 0.0e+00 }, # avg = 9.99513e-06 - { scale: 4.8072070e+06 bias: 0.0e+00 } # avg = 2.08021e-07 - ] - } -} diff --git a/bamboo/unit_tests/test_unit_reconstruction_loss.py b/bamboo/unit_tests/test_unit_reconstruction_loss.py new file mode 100644 index 00000000000..85825a02d88 --- /dev/null +++ b/bamboo/unit_tests/test_unit_reconstruction_loss.py @@ -0,0 +1,52 @@ +import sys +sys.path.insert(0, '../common_python') +import os +import pytest +import tools + + +def skeleton_jag_reconstruction_loss(cluster, executables, dir_name, compiler_name): + if compiler_name not in executables: + e = 'skeleton_jag_reconstruction_loss: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) + output_file_name = '%s/bamboo/unit_tests/output/jag_reconstruction_loss_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/jag_reconstruction_loss_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, + executable=executables[compiler_name], + num_nodes=16, + num_processes=32, + dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='jag', + metadata='model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext', + model_folder='tests', + model_name='jag_single_layer_ae', + optimizer_name='adam', + 
output_file_name=output_file_name, + error_file_name=error_file_name) + return_code = os.system(command) + assert return_code == 0 + + +def test_unit_jag_reconstruction_loss_clang6(cluster, exes, dirname): + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'clang6') + + +def test_unit_jag_reconstruction_loss_gcc7(cluster, exes, dirname): + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'gcc7') + + +def test_unit_jag_reconstruction_loss_intel19(cluster, exes, dirname): + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'intel19') + + +# Run with python -m pytest -s test_unit_reconstruction_loss.py -k 'test_unit_jag_reconstruction_loss_exe' --exe= +def test_unit_jag_reconstruction_loss_exe(cluster, dirname, exe): + if exe is None: + e = 'test_unit_jag_reconstruction_loss_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/prototext/jag_reader.prototext b/model_zoo/data_readers/data_reader_jag.prototext similarity index 100% rename from bamboo/unit_tests/prototext/jag_reader.prototext rename to model_zoo/data_readers/data_reader_jag.prototext diff --git a/bamboo/unit_tests/prototext/jag_single_layer_ae.prototext b/model_zoo/tests/model_jag_single_layer_ae.prototext similarity index 76% rename from bamboo/unit_tests/prototext/jag_single_layer_ae.prototext rename to model_zoo/tests/model_jag_single_layer_ae.prototext index dbdc59990e1..572017c8366 100644 --- a/bamboo/unit_tests/prototext/jag_single_layer_ae.prototext +++ b/model_zoo/tests/model_jag_single_layer_ae.prototext @@ -1,7 +1,7 @@ -#Unit test for JAG model and (particularly) data reader -#Run time for this example is about 2s per epoch on 16 nodes (32 tasks) -#Example on how to run: -#srun --nodes=16 --ntasks=32 build/gnu.Release.catalyst.llnl.gov/lbann/build/model_zoo/lbann --model=bamboo/unit_tests/prototext/jag_single_layer_ae.prototext --optimizer=model_zoo/optimizers/opt_adam.prototext --reader=bamboo/unit_tests/prototext/jag_reader.prototext --metadata=model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext +# Unit test for JAG model and (particularly) data reader +# Run time for this example is about 2s per epoch on 16 nodes (32 tasks) +# Example on how to run: +# srun --nodes=16 --ntasks=32 build/gnu.Release.catalyst.llnl.gov/lbann/build/model_zoo/lbann --model=model_zoo/tests/model_jag_single_layer_ae.prototext --optimizer=model_zoo/optimizers/opt_adam.prototext --reader=model_zoo/data_readers/data_reader_jag.prototext --metadata=model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext model { name: "ae_model" shareable_training_data_reader:false @@ -43,6 +43,17 @@ model { } } callback { timer {} } + # See lbann/src/proto/lbann.proto CallbackCheckMetric + # See lbann/src/callbacks/callback_check_metric.cpp + callback { + check_metric { + metric: "reconstr_loss", + lower_bound: 20.3956, + upper_bound: 22.3956, + error_on_failure: true, + execution_modes: "testing" + } + } ################################################### # start of layers From 4369e1eaf7630c2bfc6810abf0862bf7775f1b8b Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 18 Jul 2019 10:23:26 -0700 Subject: [PATCH 127/634] Update test flags --- bamboo/allocate_and_run.sh | 10 ++++++++-- bamboo/common_python/tools.py | 20 ++++++++++++++++---- bamboo/run.sh | 8 ++++---- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index
3e43bb23452..dda1dc69d14 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -31,7 +31,8 @@ if [ "${CLUSTER}" = 'pascal' ]; then fi if [ ${WEEKLY} -ne 0 ]; then - salloc -N16 --partition=pbatch -t 1440 ./run.sh --weekly + ALLOCATION_TIME_LIMIT=720 + timeout 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh --weekly if [ "${CLUSTER}" = 'catalyst' ]; then cd integration_tests python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=alexnet_clang6_results.xml @@ -40,5 +41,10 @@ if [ ${WEEKLY} -ne 0 ]; then cd .. fi else - salloc -N16 --partition=pbatch -t 1440 ./run.sh + if [ "${CLUSTER}" = 'catalyst' ]; then + ALLOCATION_TIME_LIMIT=240 + elif [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then + ALLOCATION_TIME_LIMIT=660 + fi + timeout 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh fi diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 7908189b29e..1480a6d9017 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -73,6 +73,7 @@ def get_command(cluster, else: raise Exception('Unsupported Cluster: %s' % cluster) + MAX_TIME = 60 # Description of command line options are from the appropriate command's # man pages if scheduler == 'slurm': @@ -112,9 +113,15 @@ def get_command(cluster, # Create run command if command_allocate == '': - command_run = 'srun --mpibind=off' + space = '' + # If nodes have already been allocated, + # then an individual test should not take longer than MAX_TIME. + if time_limit > MAX_TIME: + time_limit = MAX_TIME else: - command_run = ' srun --mpibind=off' + space = ' ' + command_run = '{s}srun --mpibind=off --time={t}'.format( + s=space, t=time_limit) option_num_processes = '' if num_processes is not None: # --ntasks => Specify the number of tasks to run. @@ -170,9 +177,14 @@ def get_command(cluster, # Create run command if command_allocate == '': - command_run = 'mpirun' + space = '' + # If nodes have already been allocated, + # then an individual test should not take longer than MAX_TIME. + if time_limit > MAX_TIME: + time_limit = MAX_TIME else: - command_run = ' mpirun' + space = ' ' + command_run = '{s}mpirun --timeout {t}'.format(s=space, t=time_limit) option_num_processes = '' option_processes_per_node = '' if num_processes is not None: diff --git a/bamboo/run.sh b/bamboo/run.sh index dac097ff0bd..22b9256e314 100755 --- a/bamboo/run.sh +++ b/bamboo/run.sh @@ -42,22 +42,22 @@ echo "Task: Cleaning" echo "Task: Compiler Tests" cd compiler_tests module load cmake/3.9.2 -$PYTHON -m pytest -s --junitxml=results.xml +$PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml cd .. echo "Task: Integration Tests" cd integration_tests if [ ${WEEKLY} -ne 0 ]; then - $PYTHON -m pytest -s --weekly --junitxml=results.xml + $PYTHON -m pytest -s -vv --durations=0 --weekly --junitxml=results.xml else - $PYTHON -m pytest -s --junitxml=results.xml + $PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml fi cd .. echo "Task: Unit Tests" cd unit_tests -$PYTHON -m pytest -s --junitxml=results.xml +$PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml cd .. 
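# (-vv prints each test name as it runs and --durations=0 makes pytest
# report the runtime of every test, which helps spot hangs and slow
# tests under the new per-test srun/mpirun time limits)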
echo "Task: Finished" From 90093c9dd81e115063911577a16d02f9448cc0ec Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 25 Mar 2019 16:21:08 -0700 Subject: [PATCH 128/634] Set up Lassen testing --- bamboo/allocate_and_run.sh | 41 +++--- bamboo/common_python/test_tools.py | 119 +++++++++++++----- bamboo/common_python/tools.py | 110 +++++++++++----- bamboo/compiler_tests/test_compiler.py | 100 +++++++-------- bamboo/integration_tests/common_code.py | 38 ++++-- ...toencoder_imagenet_objective_functions.csv | 21 ++++ ..._autoencoder_mnist_objective_functions.csv | 6 + .../lassen/gcc7/expected_performance.csv | 5 + .../test_integration_autoencoders.py | 2 +- bamboo/run.sh | 2 +- bamboo/unit_tests/test_unit_layer_clamp.py | 2 +- .../unit_tests/test_unit_layer_covariance.py | 2 +- bamboo/unit_tests/test_unit_layer_elu.py | 2 +- bamboo/unit_tests/test_unit_layer_identity.py | 2 +- bamboo/unit_tests/test_unit_layer_l1_norm.py | 2 +- bamboo/unit_tests/test_unit_layer_l2_norm2.py | 2 +- .../unit_tests/test_unit_layer_leaky_relu.py | 2 +- .../unit_tests/test_unit_layer_log_sigmoid.py | 2 +- .../unit_tests/test_unit_layer_log_softmax.py | 2 +- .../test_unit_layer_mean_absolute_error.py | 2 +- bamboo/unit_tests/test_unit_layer_relu.py | 2 +- bamboo/unit_tests/test_unit_layer_selu.py | 2 +- bamboo/unit_tests/test_unit_layer_sigmoid.py | 2 +- bamboo/unit_tests/test_unit_layer_softmax.py | 2 +- bamboo/unit_tests/test_unit_layer_softplus.py | 2 +- bamboo/unit_tests/test_unit_layer_softsign.py | 2 +- .../test_unit_layer_squared_difference.py | 2 +- .../unit_tests/test_unit_layer_tessellate.py | 2 +- bamboo/unit_tests/test_unit_layer_variance.py | 2 +- bamboo/unit_tests/test_unit_lbann2_reload.py | 2 +- .../unit_tests/test_unit_lbann_invocation.py | 3 +- docs/continuous_integration.rst | 22 ++-- .../data_reader_imagenet_lassen.prototext | 65 ++++++++++ 33 files changed, 397 insertions(+), 177 deletions(-) create mode 100644 bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv create mode 100644 bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv create mode 100644 bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv create mode 100644 model_zoo/data_readers/data_reader_imagenet_lassen.prototext diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index dda1dc69d14..b7f14295461 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -1,3 +1,5 @@ +#!/bin/bash -l + CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') echo "allocate_and_run.sh CLUSTER=" @@ -30,21 +32,30 @@ if [ "${CLUSTER}" = 'pascal' ]; then export MV2_USE_CUDA=1 fi -if [ ${WEEKLY} -ne 0 ]; then - ALLOCATION_TIME_LIMIT=720 - timeout 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh --weekly - if [ "${CLUSTER}" = 'catalyst' ]; then - cd integration_tests - python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=alexnet_clang6_results.xml - python -m pytest -s test_integration_performance_full_alexnet_gcc7 --weekly --run --junitxml=alexnet_gcc7_results.xml - # python -m pytest -s test_integration_performance_full_alexnet_intel19 --weekly --run --junitxml=alexnet_intel19_results.xml - cd .. 
+if [ "${CLUSTER}" = 'lassen' ]; then + ALLOCATION_TIME_LIMIT=600 + if [ ${WEEKLY} -ne 0 ]; then + timeout 24h bsub -G guests -Is -q pbatch -nnodes 16 -W $ALLOCATION_TIME_LIMIT ./run.sh --weekly + else + timeout 24h bsub -G guests -Is -q pbatch -nnodes 16 -W $ALLOCATION_TIME_LIMIT ./run.sh fi -else - if [ "${CLUSTER}" = 'catalyst' ]; then - ALLOCATION_TIME_LIMIT=240 - elif [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then - ALLOCATION_TIME_LIMIT=660 +elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then + if [ ${WEEKLY} -ne 0 ]; then + ALLOCATION_TIME_LIMIT=720 + timeout 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh --weekly + if [ "${CLUSTER}" = 'catalyst' ]; then + cd integration_tests + python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=alexnet_clang6_results.xml + python -m pytest -s test_integration_performance_full_alexnet_gcc7 --weekly --run --junitxml=alexnet_gcc7_results.xml + # python -m pytest -s test_integration_performance_full_alexnet_intel19 --weekly --run --junitxml=alexnet_intel19_results.xml + cd .. + fi + else + if [ "${CLUSTER}" = 'catalyst' ]; then + ALLOCATION_TIME_LIMIT=240 + elif [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then + ALLOCATION_TIME_LIMIT=660 + fi + timeout 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh fi - timeout 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh fi diff --git a/bamboo/common_python/test_tools.py b/bamboo/common_python/test_tools.py index c787b9976c1..8d2ec02ff5d 100644 --- a/bamboo/common_python/test_tools.py +++ b/bamboo/common_python/test_tools.py @@ -1,7 +1,9 @@ import pytest +import subprocess import tools -# This test isn't in a directory to be run from Bamboo + +# This test file isn't in a directory to be run from Bamboo # Run locally with python -m pytest -s d = dict( @@ -28,27 +30,36 @@ def test_command_catalyst(): actual = tools.get_command(cluster='catalyst', **d) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + assert actual == expected + + +def test_command_lassen(): + actual = tools.get_command(cluster='lassen', **d) + expected = 'bsub -G guests -Is -q pdebug -nnodes 20 -W 30 jsrun -b "packed:10" -c 40 -g 4 -d packed -n 16 -r 1 -a 4 exe --data_filedir=gpfs1/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' assert actual == expected def test_command_pascal(): actual = 
tools.get_command(cluster='pascal', **d) - expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --mpibind=off --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' assert actual == expected - + def test_command_ray(): actual = tools.get_command(cluster='ray', **d) - expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun --timeout=30 -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' assert actual == expected + # Test error cases ############################################################ def test_blacklisted_substrings(): try: - tools.get_command('ray', 'exe', partition=';', optimizer_path='--model=new_model', check_executable_existence=False) + tools.get_command('ray', 'exe', partition=';', + optimizer_path='--model=new_model', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -68,7 +79,9 @@ def test_unsupported_cluster(): def test_bad_model_1(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_name='name', model_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', + model_name='name', model_path='path', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -78,7 +91,8 @@ def test_bad_model_1(): def test_bad_model_2(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', model_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', + model_path='path', check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -88,7 +102,8 @@ def test_bad_model_2(): def test_bad_model_3(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_name='name', model_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_name='name', + model_path='path', check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -98,7 +113,8 @@ def test_bad_model_3(): def 
test_bad_model_4(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_folder='folder', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -108,7 +124,8 @@ def test_bad_model_4(): def test_bad_model_5(): try: - tools.get_command('ray', 'exe', dir_name='dir', model_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', model_name='name', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -118,7 +135,9 @@ def test_bad_model_5(): def test_bad_data_reader(): try: - tools.get_command('catalyst', 'exe', dir_name='dir', data_reader_name='name', data_reader_path='path', check_executable_existence=False) + tools.get_command('catalyst', 'exe', dir_name='dir', + data_reader_name='name', data_reader_path='path', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -128,7 +147,9 @@ def test_bad_data_reader(): def test_bad_optimizer(): try: - tools.get_command('ray', 'exe', dir_name='dir', optimizer_name='name', optimizer_path='path', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', optimizer_name='name', + optimizer_path='path', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -138,7 +159,8 @@ def test_bad_optimizer(): def test_bad_dir_name_1(): try: - tools.get_command('ray', 'exe', dir_name='dir', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -148,7 +170,8 @@ def test_bad_dir_name_1(): def test_bad_dir_name_2(): try: - tools.get_command('ray', 'exe', model_folder='folder', check_executable_existence=False) + tools.get_command('ray', 'exe', model_folder='folder', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -158,7 +181,8 @@ def test_bad_dir_name_2(): def test_bad_dir_name_3(): try: - tools.get_command('ray', 'exe', model_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', model_name='name', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -168,7 +192,8 @@ def test_bad_dir_name_3(): def test_bad_dir_name_4(): try: - tools.get_command('catalyst', 'exe', data_reader_name='name', check_executable_existence=False) + tools.get_command('catalyst', 'exe', data_reader_name='name', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -178,7 +203,8 @@ def test_bad_dir_name_4(): def test_bad_dir_name_5(): try: - tools.get_command('ray', 'exe', optimizer_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', optimizer_name='name', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -188,7 +214,9 @@ def test_bad_dir_name_5(): def test_bad_data_filedir_1(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filedir_train_default='a', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filedir_train_default='a', check_executable_existence=False) assert False except Exception as e: @@ -199,7 +227,9 @@ def test_bad_data_filedir_1(): def test_bad_data_filedir_2(): try: - tools.get_command('ray', 'exe', 
dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filename_train_default='b', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filename_train_default='b', check_executable_existence=False) assert False except Exception as e: @@ -210,7 +240,9 @@ def test_bad_data_filedir_2(): def test_bad_data_filedir_3(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filedir_test_default='c', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filedir_test_default='c', check_executable_existence=False) assert False except Exception as e: @@ -221,7 +253,9 @@ def test_bad_data_filedir_3(): def test_bad_data_filedir_4(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', data_filedir_default='filedir', data_filename_test_default='d', + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + data_filedir_default='filedir', + data_filename_test_default='d', check_executable_existence=False) assert False except Exception as e: @@ -232,7 +266,10 @@ def test_bad_data_filedir_4(): def test_bad_data_filedir_5(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_train_default='e', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filedir_train_default='e', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -242,7 +279,10 @@ def test_bad_data_filedir_5(): def test_bad_data_filedir_6(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_train_default='f', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filename_train_default='f', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -252,7 +292,10 @@ def test_bad_data_filedir_6(): def test_bad_data_filedir_7(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filedir_test_default='g', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filedir_test_default='g', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -262,7 +305,10 @@ def test_bad_data_filedir_7(): def test_bad_data_filedir_8(): try: - tools.get_command('ray', 'exe', data_reader_path='path', data_filedir_default='filedir', data_filename_test_default='h', check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + data_filedir_default='filedir', + data_filename_test_default='h', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -272,7 +318,8 @@ def test_bad_data_filedir_8(): def test_bad_data_filedir_9(): try: - tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', check_executable_existence=False) + tools.get_command('ray', 'exe', dir_name='dir', data_reader_name='name', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -282,7 +329,8 @@ def test_bad_data_filedir_9(): def test_bad_data_filedir_10(): try: - tools.get_command('ray', 'exe', data_reader_path='path', 
check_executable_existence=False) + tools.get_command('ray', 'exe', data_reader_path='path', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -292,7 +340,8 @@ def test_bad_data_filedir_10(): def test_bad_data_filedir_11(): try: - tools.get_command('ray', 'exe', data_filedir_default='filedir', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filedir_default='filedir', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -302,7 +351,8 @@ def test_bad_data_filedir_11(): def test_bad_data_filedir_12(): try: - tools.get_command('ray', 'exe', data_filedir_train_default='a', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filedir_train_default='a', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -312,7 +362,8 @@ def test_bad_data_filedir_12(): def test_bad_data_filedir_13(): try: - tools.get_command('ray', 'exe', data_filename_train_default='b', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filename_train_default='b', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -322,7 +373,8 @@ def test_bad_data_filedir_13(): def test_bad_data_filedir_14(): try: - tools.get_command('ray', 'exe', data_filedir_test_default='c', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filedir_test_default='c', + check_executable_existence=False) assert False except Exception as e: actual = str(e) @@ -332,7 +384,8 @@ def test_bad_data_filedir_14(): def test_bad_data_filedir_15(): try: - tools.get_command('ray', 'exe', data_filename_test_default='e', check_executable_existence=False) + tools.get_command('ray', 'exe', data_filename_test_default='e', + check_executable_existence=False) assert False except Exception as e: actual = str(e) diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 1480a6d9017..5fe16d4be4e 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -1,4 +1,3 @@ -import pytest import math, os, re @@ -68,7 +67,7 @@ def get_command(cluster, # Determine scheduler if cluster in ['catalyst', 'corona', 'pascal']: scheduler = 'slurm' - elif cluster == 'ray': + elif cluster in ['lassen', 'ray']: scheduler = 'lsf' else: raise Exception('Unsupported Cluster: %s' % cluster) @@ -133,23 +132,28 @@ def get_command(cluster, # Create allocate command command_allocate = '' # Allocate nodes only if we don't already have an allocation. - if os.getenv('LSB_HOSTS') is None: + if (os.getenv('LSB_HOSTS') is None) and (os.getenv('LSB_JOBID') is None): print('Allocating lsf nodes.') command_allocate = 'bsub' - # x => Puts the host running your job into exclusive execution - # mode. - option_exclusive = ' -x' + option_exclusive = '' + if cluster != 'lassen': + # x => Puts the host running your job into exclusive execution + # mode. + option_exclusive = ' -x' # G=> For fairshare scheduling. Associates the job with the # specified group. option_group = ' -G guests' # Is => Submits an interactive job and creates a pseudo-terminal # with shell mode when the job starts. 
             option_interactive = ' -Is'
+            option_num_nodes = ''
             option_num_processes = ''
             option_partition = ''
             option_processes_per_node = ''
             option_time_limit = ''
-            if num_processes is not None:
+            if cluster == 'lassen':
+                option_num_nodes = ' -nnodes {n}'.format(n=num_nodes)  # nnodes => number of whole nodes
+            elif num_processes is not None:
                 # n => Submits a parallel job and specifies the number of
                 # tasks in the job.
                 option_num_processes = ' -n %d' % num_processes
@@ -157,7 +161,7 @@ def get_command(cluster,
                     # R => Runs the job on a host that meets the specified
                     # resource requirements.
                     option_processes_per_node = ' -R "span[ptile=%d]"' % int(
-                        math.ceil(float(num_processes)/num_nodes))
+                        math.ceil(float(num_processes) / num_nodes))
             if partition is not None:
                 # q => Submits the job to one of the specified queues.
                 option_partition = ' -q %s' % partition
@@ -168,10 +172,10 @@ def get_command(cluster,
                     time_limit = max_ray_time
                 # W => Sets the runtime limit of the job.
                 option_time_limit = ' -W %d' % time_limit
-            command_allocate = '%s%s%s%s%s%s%s%s' % (
-                command_allocate, option_exclusive, option_group,
-                option_interactive, option_num_processes, option_partition,
-                option_processes_per_node, option_time_limit)
+            command_allocate = '%s%s%s%s%s%s%s%s%s' % (
+                command_allocate, option_exclusive, option_group,
+                option_interactive, option_num_processes, option_partition,
+                option_num_nodes, option_processes_per_node, option_time_limit)
         else:
             print('lsf nodes already allocated.')
@@ -184,17 +188,43 @@ def get_command(cluster,
             time_limit = MAX_TIME
         else:
             space = ' '
-        command_run = '{s}mpirun --timeout {t}'.format(s=space, t=time_limit)
+        if cluster == 'lassen':
+            # Cannot specify time limit for jsrun.
+            command_run = '{s}jsrun'.format(s=space)
+        else:
+            command_run = '{s}mpirun --timeout={t}'.format(s=space, t=time_limit)
+        option_bind = ''
+        option_cpu_per_resource = ''
+        option_gpu_per_resource = ''
+        option_launch_distribution = ''
         option_num_processes = ''
         option_processes_per_node = ''
+        option_resources_per_host = ''
+        option_tasks_per_resource = ''
         if num_processes is not None:
-            # -np => Run this many copies of the program on the given nodes.
-            option_num_processes = ' -np %d' % num_processes
-            if (num_nodes is not None) and (num_nodes != 0):
-                option_processes_per_node = ' -N %d' % int(
-                    math.ceil(float(num_processes)/num_nodes))
-        command_run = '%s%s%s' % (
-            command_run, option_num_processes, option_processes_per_node)
+            if cluster == 'lassen':
+                option_bind = ' -b "packed:10"'  # b => bind each task to 10 cores
+                option_cpu_per_resource = ' -c 40'  # c => CPUs per resource set
+                option_gpu_per_resource = ' -g 4'  # g => GPUs per resource set
+                option_launch_distribution = ' -d packed'  # d => launch distribution
+                # Avoid `nrs (32) should not be greater than rs_per_host (1) * number of servers available (16).`
+                if num_processes > 16:
+                    num_processes = 16
+                option_num_processes = ' -n {n}'.format(n=num_processes)  # n => resource sets (jsrun launches n * a ranks)
+                option_resources_per_host = ' -r 1'  # r => resource sets per host
+                option_tasks_per_resource = ' -a 4'  # a => tasks per resource set
+            else:
+                # -np => Run this many copies of the program on the given nodes.
+ option_num_processes = ' -np %d' % num_processes + if (num_nodes is not None) and (num_nodes != 0): + processes_per_node = int( + math.ceil(float(num_processes)/num_nodes)) + option_processes_per_node = ' -N %d' % processes_per_node + command_run = '%s%s%s%s%s%s%s%s%s' % ( + command_run, option_bind, option_cpu_per_resource, + option_gpu_per_resource, option_launch_distribution, + option_num_processes, option_processes_per_node, + option_resources_per_host, option_tasks_per_resource) else: raise Exception('Unsupported Scheduler %s' % scheduler) @@ -272,16 +302,35 @@ def get_command(cluster, if cluster in ['catalyst', 'corona', 'pascal',]: # option_data_filedir = data_filedir_default # lscratchh, presumably pass # No need to pass in a parameter + elif cluster == 'lassen': + option_data_filedir = ' --data_filedir=%s' % re.sub( + '[a-z]scratch[a-z]', 'gpfs1', data_filedir_default) elif cluster == 'ray': option_data_filedir = ' --data_filedir=%s' % re.sub( '[a-z]scratch[a-z]', 'gscratchr', data_filedir_default) elif None not in data_file_parameters: + # Everything in data_file_parameters has a non-None value. if cluster in ['catalyst', 'corona', 'pascal']: # option_data_filedir_train = data_filedir_train_default # option_data_filename_train = data_filename_train_default # option_data_filedir_test = data_filedir_test_default # option_data_filename_train = data_filename_test_default pass # No need to pass in a parameter + elif cluster == 'lassen': + filename_train = re.sub( + '[a-z]scratch[a-z]', 'gpfs1', data_filename_train_default) + filename_train = re.sub( + 'labels', 'original/labels', filename_train) + print('filename_train={f}'.format(f=filename_train)) + filename_test = re.sub( + '[a-z]scratch[a-z]', 'gpfs1', data_filename_test_default) + filename_test = re.sub( + 'labels', 'original/labels', filename_test) + print('filename_test={f}'.format(f=filename_test)) + option_data_filedir_train = ' --data_filedir_train=%s' % re.sub('[a-z]scratch[a-z]', 'gpfs1', data_filedir_train_default) + option_data_filename_train = ' --data_filename_train=%s' % filename_train + option_data_filedir_test = ' --data_filedir_test=%s' % re.sub('[a-z]scratch[a-z]', 'gpfs1', data_filedir_test_default) + option_data_filename_test = ' --data_filename_test=%s' % filename_test elif cluster == 'ray': option_data_filedir_train = ' --data_filedir_train=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filedir_train_default) option_data_filename_train = ' --data_filename_train=%s' % re.sub('[a-z]scratch[a-z]', 'gscratchr', data_filename_train_default) @@ -300,15 +349,16 @@ def get_command(cluster, else: # if None in data_file_parameters: # If any are None if data_file_parameters == [None, None, None, None]: # If all are None - lbann_errors.append( - ('data_reader_name or data_reader_path is set but not' - ' data_filedir_default. If a data reader is provided,' - ' the default filedir must be set. This allows for' - ' determining what the filedir should be on each' - ' cluster. Alternatively, some or all of' - ' [data_filedir_train_default, data_filename_train' - '_default, data_filedir_test_default, data_filename' - '_test_default] can be set.')) + if data_reader_name != 'synthetic': + lbann_errors.append( + ('data_reader_name or data_reader_path is set but not' + ' data_filedir_default. If a data reader is provided,' + ' the default filedir must be set. This allows for' + ' determining what the filedir should be on each' + ' cluster. 
Alternatively, some or all of' + ' [data_filedir_train_default, data_filename_train' + '_default, data_filedir_test_default, data_filename' + '_test_default] can be set.')) # else: no data_file parameters are set else: if data_filedir_default is not None: @@ -374,6 +424,7 @@ def process_executable_existence(executable, skip_no_exe=True): if not executable_exists: error_string = 'Executable does not exist: %s' % executable if skip_no_exe: + import pytest pytest.skip(error_string) else: raise Exception(error_string) @@ -412,9 +463,11 @@ def get_default_exes(default_dirname, cluster): default_exes = {} default_exes['default'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if cluster in ['catalyst', 'corona', 'pascal']: + if cluster in ['catalyst', 'corona', 'lassen', 'pascal']: + # Define all compilers. # x86_cpu - catalyst # x86_gpu_pascal - pascal + # ppc64le_gpu_lassen - lassen default_exes['clang6'] = exes['clang6'] default_exes['gcc7'] = exes['gcc7'] default_exes['intel19'] = exes['intel19'] @@ -423,5 +476,6 @@ def get_default_exes(default_dirname, cluster): default_exes['gcc7_debug'] = exes['gcc7_debug'] default_exes['intel19_debug'] = exes['intel19_debug'] + print('default_exes={d}'.format(d=default_exes)) return default_exes diff --git a/bamboo/compiler_tests/test_compiler.py b/bamboo/compiler_tests/test_compiler.py index 0bc8ea24938..5c8be7bee7a 100644 --- a/bamboo/compiler_tests/test_compiler.py +++ b/bamboo/compiler_tests/test_compiler.py @@ -6,32 +6,27 @@ def test_compiler_build_script(cluster, dirname): - if cluster in ['corona', 'pascal']: - output_file_name = '%s/bamboo/compiler_tests/output/build_script_output.txt' % (dirname) - error_file_name = '%s/bamboo/compiler_tests/error/build_script_error.txt' % (dirname) - command = '%s/bamboo/compiler_tests/build_script.sh > %s 2> %s' % ( - dirname, output_file_name, error_file_name) - return_code = os.system(command) - if return_code != 0: - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) - assert return_code == 0 - else: + if cluster not in ['corona', 'lassen', 'pascal']: e = 'test_compiler_build_script: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) + output_file_name = '%s/bamboo/compiler_tests/output/build_script_output.txt' % (dirname) + error_file_name = '%s/bamboo/compiler_tests/error/build_script_error.txt' % (dirname) + command = '%s/bamboo/compiler_tests/build_script.sh > %s 2> %s' % ( + dirname, output_file_name, error_file_name) + return_code = os.system(command) + if return_code != 0: + output_file = open(output_file_name, 'r') + for line in output_file: + print('%s: %s' % (output_file_name, line)) + error_file = open(error_file_name, 'r') + for line in error_file: + print('%s: %s' % (error_file_name, line)) + assert return_code == 0 def test_compiler_clang6_release(cluster, dirname): - try: - skeleton_clang6(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'clang6', False) + skeleton_clang6(cluster, dirname, False) path = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/clang.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) @@ -39,11 +34,7 @@ def test_compiler_clang6_release(cluster, dirname): def test_compiler_clang6_debug(cluster, 
dirname): - try: - skeleton_clang6(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'clang6', True) + skeleton_clang6(cluster, dirname, True) path = '%s/bamboo/compiler_tests/builds/%s_clang-6.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/clang.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) @@ -51,11 +42,7 @@ def test_compiler_clang6_debug(cluster, dirname): def test_compiler_gcc7_release(cluster, dirname): - try: - skeleton_gcc7(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc7', False) + skeleton_gcc7(cluster, dirname, False) path = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_rel/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) @@ -63,11 +50,7 @@ def test_compiler_gcc7_release(cluster, dirname): def test_compiler_gcc7_debug(cluster, dirname): - try: - skeleton_gcc7(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'gcc7', True) + skeleton_gcc7(cluster, dirname, True) path = '%s/bamboo/compiler_tests/builds/%s_gcc-7.1.0_debug/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/gnu.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) @@ -75,11 +58,7 @@ def test_compiler_gcc7_debug(cluster, dirname): def test_compiler_intel19_release(cluster, dirname): - try: - skeleton_intel19(cluster, dirname, False) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'intel19', False) + skeleton_intel19(cluster, dirname, False) path = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_rel/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/intel.Release.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) @@ -87,11 +66,7 @@ def test_compiler_intel19_release(cluster, dirname): def test_compiler_intel19_debug(cluster, dirname): - try: - skeleton_intel19(cluster, dirname, True) - except AssertionError as e: - print(e) - build_script(cluster, dirname, 'intel19', True) + skeleton_intel19(cluster, dirname, True) path = '%s/bamboo/compiler_tests/builds/%s_intel-19.0.0_debug/build/model_zoo/lbann' % (dirname, cluster) if not os.path.exists(path): path = '%s/build/intel.Debug.%s.llnl.gov/install/bin/lbann' % (dirname, cluster) @@ -99,33 +74,44 @@ def test_compiler_intel19_debug(cluster, dirname): def skeleton_clang6(cluster, dir_name, debug, should_log=False): - if cluster in ['catalyst']: - spack_skeleton(dir_name, 'clang@6.0.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'clang@6.0.0', debug, should_log) - else: + if cluster not in ['catalyst']: e = 'skeleton_clang6: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) + try: + spack_skeleton(dir_name, 'clang@6.0.0', 'mvapich2@2.2', debug, + should_log) + build_skeleton(dir_name, 'clang@6.0.0', debug, should_log) + except AssertionError as e: + print(e) + build_script(cluster, dir_name, 'clang6', debug) def skeleton_gcc7(cluster, dir_name, debug, should_log=False): - if cluster in ['catalyst']: - spack_skeleton(dir_name, 'gcc@7.1.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'gcc@7.1.0', debug, should_log) - else: + if cluster not in ['catalyst', 'pascal']: e = 'skeleton_gcc7: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) + try: + spack_skeleton(dir_name, 
'gcc@7.1.0', 'mvapich2@2.2', debug, should_log) + build_skeleton(dir_name, 'gcc@7.1.0', debug, should_log) + except AssertionError as e: + print(e) + build_script(cluster, dir_name, 'gcc7', debug) def skeleton_intel19(cluster, dir_name, debug, should_log=False): - if cluster in []: # ['catalyst']: - spack_skeleton(dir_name, 'intel@19.0.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'intel@19.0.0', debug, should_log) - else: + if cluster not in []: # Taking out 'catalyst' e = 'skeleton_intel19: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) + try: + spack_skeleton(dir_name, 'intel@19.0.0', 'mvapich2@2.2', debug, + should_log) + build_skeleton(dir_name, 'intel@19.0.0', debug, should_log) + except AssertionError as e: + print(e) + build_script(cluster, dir_name, 'intel19', debug) def spack_skeleton(dir_name, compiler, mpi_lib, debug, should_log): diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index 76bb59257c6..a939f0effc4 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -12,18 +12,32 @@ def get_command(cluster, dir_name, model_folder, model_name, executable, # If doing weekly testing, increase data_reader_percent if weekly: data_reader_percent = 0.10 - command = tools.get_command( - cluster=cluster, executable=executable, num_nodes=16, - partition='pbatch', time_limit=600, num_processes=32, - dir_name=dir_name, - data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/', - data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt', - data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/', - data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt', - data_reader_name='imagenet', data_reader_percent=data_reader_percent, - model_folder=model_folder, model_name=model_name, num_epochs=20, - optimizer_name='adagrad', output_file_name=output_file_name, - error_file_name=error_file_name) + if cluster == 'lassen': + command = tools.get_command( + cluster=cluster, executable=executable, num_nodes=16, + partition='pbatch', time_limit=600, num_processes=32, + dir_name=dir_name, + data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/', + data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt', + data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/', + data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt', + data_reader_name='imagenet_lassen', data_reader_percent=data_reader_percent, + model_folder=model_folder, model_name=model_name, num_epochs=20, + optimizer_name='adagrad', output_file_name=output_file_name, + error_file_name=error_file_name) + else: + command = tools.get_command( + cluster=cluster, executable=executable, num_nodes=16, + partition='pbatch', time_limit=600, num_processes=32, + dir_name=dir_name, + data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/', + data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt', + data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/', + data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt', + data_reader_name='imagenet', data_reader_percent=data_reader_percent, + model_folder=model_folder, model_name=model_name, num_epochs=20, + 
optimizer_name='adagrad', output_file_name=output_file_name, + error_file_name=error_file_name) elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']: if (model_name == 'lenet_mnist') and \ (compiler_name in ['clang6', 'intel19']): diff --git a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv new file mode 100644 index 00000000000..003794fd557 --- /dev/null +++ b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv @@ -0,0 +1,21 @@ +Epoch_number, training_objective_function_nightly, training_objective_function_weekly +0, 0.675652, 0.608574 +1, 0.590008, 0.590008 +2, 0.587484, 0.587484 +3, 0.586305, 0.586305 +4, 0.585585, 0.585585 +5, 0.585036, 0.585036 +6, 0.584688, 0.584688 +7, 0.584348, 0.584348 +8, 0.584041, 0.584041 +9, 0.583865, 0.583865 +10, 0.583665, 0.583665 +11, 0.583521, 0.583521 +12, 0.583303, 0.583303 +13, 0.58328, 0.58328 +14, 0.5832, 0.5832 +15, 0.583134, 0.583134 +16, 0.583052, 0.583052 +17, 0.583039, 0.583039 +18, 0.582954, 0.582954 +19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv new file mode 100644 index 00000000000..8bcf25bb71d --- /dev/null +++ b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv @@ -0,0 +1,6 @@ +Epoch_number, training_objective_function +0, 0.207514 +1, 0.194710 +2, 0.193221 +3, 0.192864 +4, 0.192755 diff --git a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv new file mode 100644 index 00000000000..09dca6d2de5 --- /dev/null +++ b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv @@ -0,0 +1,5 @@ +Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy +alexnet_nightly, 23.00, 0.70, 10.30, 0.10, 1.20, 100.00 +alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +lenet_mnist, 10.10, 0.06, 5.30, 0.01, 0.60, 98.30 diff --git a/bamboo/integration_tests/test_integration_autoencoders.py b/bamboo/integration_tests/test_integration_autoencoders.py index b256b11c8df..25b02387f73 100644 --- a/bamboo/integration_tests/test_integration_autoencoders.py +++ b/bamboo/integration_tests/test_integration_autoencoders.py @@ -53,7 +53,7 @@ def run_tests(actual_objective_functions, model_name, dir_name, cluster, def skeleton_autoencoder_imagenet(cluster, dir_name, executables, compiler_name, weekly): - if cluster in ['pascal']: + if cluster in ['lassen', 'pascal']: e = 'skeleton_autoencoder_imagenet: does not run on GPU' print('Skip - ' + e) pytest.skip(e) diff --git a/bamboo/run.sh b/bamboo/run.sh index 22b9256e314..a671532e4dd 100755 --- a/bamboo/run.sh +++ b/bamboo/run.sh @@ -5,7 +5,7 @@ CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') echo "run.sh CLUSTER=" echo $CLUSTER -if [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ]; then +if [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'lassen' ]; then PYTHON=python fi diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py 
b/bamboo/unit_tests/test_unit_layer_clamp.py index 56b8ca50520..ea8182b8a56 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -15,7 +15,7 @@ def skeleton_layer_clamp(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='clamp', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py index 7b7de9b50ca..74d0b8da353 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -15,7 +15,7 @@ def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='covariance', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index 4e5abe80398..76332cebcec 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -15,7 +15,7 @@ def skeleton_layer_elu(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='elu', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py index 448531354d4..1ea2d742d22 100644 --- a/bamboo/unit_tests/test_unit_layer_identity.py +++ b/bamboo/unit_tests/test_unit_layer_identity.py @@ -15,7 +15,7 @@ def skeleton_layer_identity(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='identity', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index b1362658093..6f39cb5a242 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -15,7 +15,7 @@ def skeleton_layer_l1_norm(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='l1_norm', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py 
b/bamboo/unit_tests/test_unit_layer_l2_norm2.py index 2df17ef30e1..a90fa330eb0 100644 --- a/bamboo/unit_tests/test_unit_layer_l2_norm2.py +++ b/bamboo/unit_tests/test_unit_layer_l2_norm2.py @@ -15,7 +15,7 @@ def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='l2_norm2', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index d62d559bb3e..68b6d8d0fdd 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -15,7 +15,7 @@ def skeleton_layer_leaky_relu(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='leaky_relu', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index b2b53a46a86..93faa462298 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -15,7 +15,7 @@ def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='log_sigmoid', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 234b068d714..6fe031609c3 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -15,7 +15,7 @@ def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='log_softmax', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index 744c8e5cd89..7db9912503a 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -15,7 +15,7 @@ def skeleton_layer_mean_absolute_error(cluster, executables, dir_name, compiler_ command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='mean_absolute_error', optimizer_name='sgd', 
output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index 39d10030ab3..8136ad8f712 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -15,7 +15,7 @@ def skeleton_layer_relu(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='relu', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index f41f4d5ed57..c920297b2c5 100644 --- a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -15,7 +15,7 @@ def skeleton_layer_selu(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='selu', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index 7d75d32c1d0..ddb7306630d 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -15,7 +15,7 @@ def skeleton_layer_sigmoid(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='sigmoid', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index af1fa09ac17..ef80a96ce84 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -15,7 +15,7 @@ def skeleton_layer_softmax(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='softmax', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py index cfcccf9b694..a06a6291ec8 100644 --- a/bamboo/unit_tests/test_unit_layer_softplus.py +++ b/bamboo/unit_tests/test_unit_layer_softplus.py @@ -15,7 +15,7 @@ def skeleton_layer_softplus(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='softplus', optimizer_name='sgd', output_file_name=output_file_name, 
error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py index bff14d3e789..321e2f4b9d3 100644 --- a/bamboo/unit_tests/test_unit_layer_softsign.py +++ b/bamboo/unit_tests/test_unit_layer_softsign.py @@ -15,7 +15,7 @@ def skeleton_layer_softsign(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='softsign', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py b/bamboo/unit_tests/test_unit_layer_squared_difference.py index 6050310b4dc..4991552a6ed 100644 --- a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -15,7 +15,7 @@ def skeleton_layer_squared_difference(cluster, executables, dir_name, compiler_n command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='squared_difference', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index cef99ca567c..14857aab027 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -15,7 +15,7 @@ def skeleton_layer_tessellate(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='tessellate', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index b1bb6803707..8ca64bba063 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -15,7 +15,7 @@ def skeleton_layer_variance(cluster, executables, dir_name, compiler_name): command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, num_processes=2, dir_name=dir_name, - data_filedir_default='', data_reader_name='synthetic', + data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='variance', optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index f90b33ce62c..1ff1ef76635 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -125,7 +125,7 @@ def test_unit_lbann2_reload_clang6(cluster, exes, dirname): def test_unit_lbann2_reload_gcc7(cluster, exes, dirname): - if cluster in ['catalyst', 'pascal']: # STILL ERRORS + if cluster in ['catalyst', 'lassen', 'pascal']: # STILL ERRORS pytest.skip('FIXME') skeleton_lbann2_reload(cluster, exes, dirname, 'gcc7') diff --git 
a/bamboo/unit_tests/test_unit_lbann_invocation.py b/bamboo/unit_tests/test_unit_lbann_invocation.py index 55299c26f9d..8ff69b3cd84 100644 --- a/bamboo/unit_tests/test_unit_lbann_invocation.py +++ b/bamboo/unit_tests/test_unit_lbann_invocation.py @@ -74,7 +74,8 @@ def test_unit_bad_params(cluster, exes): exe = exes['gcc7'] sys.stderr.write('TESTING: run lbann with ill-formed param (missing -) lbann should throw exception\n') (command_allocate, command_run, _, _) = tools.get_command(cluster=cluster, executable=exe, return_tuple=True) - return_code = os.system('%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext --model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} --optimizer=prototext/opt_sgd.prototext' % (command_allocate, command_run, exe)) + command_string = '%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext --model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} --optimizer=prototext/opt_sgd.prototext' % (command_allocate, command_run, exe) + return_code = os.system(command_string) assert return_code != 0 diff --git a/docs/continuous_integration.rst b/docs/continuous_integration.rst index a3745596da9..396798bc532 100644 --- a/docs/continuous_integration.rst +++ b/docs/continuous_integration.rst @@ -154,15 +154,19 @@ Bamboo Agent Properties Bamboo agent properties are used to specify requirements for each job. -+--------------------------------+-------------+--------------+----------+------------------+---------------------+ -| Agents (jobs) | agent_owner | architecture | cluster | gpu_architecture | sys_type | -+================================+=============+==============+==========+==================+=====================+ -| Catalyst Agents (x86_cpu) | lbannusr | x86_64 | catalyst | none | toss_3_x86_64_ib | -+--------------------------------+-------------+--------------+----------+------------------+---------------------+ -| Pascal Agents (x86_gpu_pascal) | lbannusr | x86_64 | pascal | pascal | chaos_6_x86_64_ib | -+--------------------------------+-------------+--------------+----------+------------------+---------------------+ -| Ray Agents (ppc64le_gpu) | lbannusr | ppc64_le | ray | pascal | blueos_3_ppc64le_ib | -+--------------------------------+-------------+--------------+----------+------------------+---------------------+ ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Agents (jobs) | agent_owner | architecture | cluster | gpu_architecture | sys_type | ++================================+=============+==============+==========+==================+========================+ +| Catalyst Agents (x86_cpu) | lbannusr | x86_64 | catalyst | none | toss_3_x86_64_ib | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Corona Agents (x86_cpu_corona) | lbannusr | x86_64 | corona | none | toss_3_x86_64_ib | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Lassen Agents (ppc64le_gpu) | lbannusr | ppc64le | lassen | volta | blueos_3_ppc64le_ib_p9 | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Pascal Agents (x86_gpu_pascal) | lbannusr | x86_64 | pascal | pascal | chaos_6_x86_64_ib | 
++--------------------------------+-------------+--------------+----------+------------------+------------------------+ +| Ray Agents (ppc64le_gpu) | lbannusr | ppc64_le | ray | pascal | blueos_3_ppc64le_ib | ++--------------------------------+-------------+--------------+----------+------------------+------------------------+ Currently, "agent_owner", "architecture", and "gpu_architecture" are used to determine agents to run a job. diff --git a/model_zoo/data_readers/data_reader_imagenet_lassen.prototext b/model_zoo/data_readers/data_reader_imagenet_lassen.prototext new file mode 100644 index 00000000000..08ddf8b8161 --- /dev/null +++ b/model_zoo/data_readers/data_reader_imagenet_lassen.prototext @@ -0,0 +1,65 @@ +data_reader { + reader { + name: "imagenet" + role: "train" + shuffle: true + data_filedir: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/train/" + data_filename: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/labels/train.txt" + label_filename: "" + validation_percent: 0.0 + absolute_sample_count: 0 + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + random_resized_crop { + height: 224 + width: 224 + } + } + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } + + reader { + name: "imagenet" + role: "validate" + shuffle: true + data_filedir: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/val/" + data_filename: "/p/gpfs1/brainusr/datasets/ILSVRC2012/original/labels/val.txt" + label_filename: "" + absolute_sample_count: 0 + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + resized_center_crop { + height: 256 + width: 256 + crop_height: 224 + crop_width: 224 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } +} From 1a7d78c323ce4dff0b0885ea6f8d203e208a32e0 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Thu, 18 Jul 2019 16:42:51 -0700 Subject: [PATCH 129/634] fix almost all the clang warnings. 
dealing with log_msg separately --- include/lbann/callbacks/callback_checksmall.hpp | 2 +- include/lbann/callbacks/callback_perturb_adam.hpp | 4 ++-- .../lbann/callbacks/callback_perturb_dropout.hpp | 6 +++--- .../callbacks/callback_variable_minibatch.hpp | 8 +++++--- .../lbann/data_readers/data_reader_jag_conduit.hpp | 8 ++++---- .../data_readers/data_reader_numpy_npz_conduit.hpp | 2 +- include/lbann/layers/activations/elu.hpp | 2 +- include/lbann/layers/activations/leaky_relu.hpp | 2 +- .../lbann/layers/io/input/generic_input_layer.hpp | 2 +- include/lbann/layers/learning/base_convolution.hpp | 2 +- include/lbann/layers/learning/convolution.hpp | 2 +- include/lbann/layers/learning/deconvolution.hpp | 2 +- include/lbann/layers/learning/embedding.hpp | 2 +- include/lbann/layers/learning/fully_connected.hpp | 2 +- .../layers/loss/top_k_categorical_accuracy.hpp | 2 +- include/lbann/layers/math/clamp.hpp | 2 +- include/lbann/layers/misc/covariance.hpp | 2 +- include/lbann/layers/misc/variance.hpp | 2 +- .../layers/regularizers/batch_normalization.hpp | 2 +- include/lbann/layers/regularizers/dropout.hpp | 2 +- .../regularizers/local_response_normalization.hpp | 2 +- include/lbann/layers/transform/bernoulli.hpp | 2 +- include/lbann/layers/transform/concatenation.hpp | 2 +- include/lbann/layers/transform/constant.hpp | 2 +- include/lbann/layers/transform/gaussian.hpp | 2 +- include/lbann/layers/transform/in_top_k.hpp | 2 +- include/lbann/layers/transform/pooling.hpp | 2 +- include/lbann/layers/transform/reduction.hpp | 2 +- include/lbann/layers/transform/slice.hpp | 2 +- include/lbann/layers/transform/sort.hpp | 2 +- include/lbann/layers/transform/uniform.hpp | 2 +- include/lbann/layers/transform/weighted_sum.hpp | 2 +- include/lbann/transforms/normalize.hpp | 2 +- .../lbann/transforms/repack_HWC_to_CHW_layout.hpp | 2 +- .../vision/normalize_to_lbann_layout.hpp | 4 ++-- .../lbann/transforms/vision/to_lbann_layout.hpp | 2 +- include/lbann/weights/initializer.hpp | 14 +++++++------- .../weights/variance_scaling_initializers.hpp | 8 ++++---- src/data_readers/data_reader_jag_conduit.cpp | 2 +- src/optimizers/adagrad.cpp | 2 +- src/optimizers/adam.cpp | 2 +- src/optimizers/hypergradient_adam.cpp | 2 +- src/optimizers/rmsprop.cpp | 2 +- src/optimizers/sgd.cpp | 2 +- src/proto/init_image_data_readers.cpp | 4 ++-- src/proto/proto_common.cpp | 2 +- src/weights/initializer.cpp | 6 +++--- src/weights/variance_scaling_initializers.cpp | 2 +- 48 files changed, 71 insertions(+), 69 deletions(-) diff --git a/include/lbann/callbacks/callback_checksmall.hpp b/include/lbann/callbacks/callback_checksmall.hpp index 2f66a04d2d9..892ef3282f3 100644 --- a/include/lbann/callbacks/callback_checksmall.hpp +++ b/include/lbann/callbacks/callback_checksmall.hpp @@ -49,7 +49,7 @@ class lbann_callback_checksmall : public lbann_callback { lbann_callback_checksmall() : lbann_callback() {} lbann_callback_checksmall(const lbann_callback_checksmall&) = default; lbann_callback_checksmall& operator=( - const lbann_callback_checksmall&) = default; + const lbann_callback_checksmall&) = delete; lbann_callback_checksmall* copy() const override { return new lbann_callback_checksmall(*this); } diff --git a/include/lbann/callbacks/callback_perturb_adam.hpp b/include/lbann/callbacks/callback_perturb_adam.hpp index 6adf47dd83a..4a580da6ffc 100644 --- a/include/lbann/callbacks/callback_perturb_adam.hpp +++ b/include/lbann/callbacks/callback_perturb_adam.hpp @@ -77,8 +77,8 @@ class lbann_callback_perturb_adam : public lbann_callback { 
lbann_callback_perturb_adam* copy() const override { return new lbann_callback_perturb_adam(*this); } std::string name() const override { return "perturb Adam"; } - void setup(model* m); - void on_batch_begin(model* m); + void setup(model* m) override; + void on_batch_begin(model* m) override; private: diff --git a/include/lbann/callbacks/callback_perturb_dropout.hpp b/include/lbann/callbacks/callback_perturb_dropout.hpp index 05ef0402362..7564db07b3b 100644 --- a/include/lbann/callbacks/callback_perturb_dropout.hpp +++ b/include/lbann/callbacks/callback_perturb_dropout.hpp @@ -52,7 +52,7 @@ class lbann_callback_perturb_dropout : public lbann_callback { lbann_callback_perturb_dropout* copy() const override { return new lbann_callback_perturb_dropout(*this); } std::string name() const override { return "perturb dropout"; } - void setup(model* m); + void setup(model* m) override; private: @@ -67,9 +67,9 @@ class lbann_callback_perturb_dropout : public lbann_callback { * If empty, all dropout layers in the model will be perturbed. */ std::set m_layer_names; - + template - dropout* get_dropout_layer(Layer* l); + dropout* get_dropout_layer(Layer* l); /** Perturb dropout keep prob in model. */ void perturb(model& m); diff --git a/include/lbann/callbacks/callback_variable_minibatch.hpp b/include/lbann/callbacks/callback_variable_minibatch.hpp index 44d8c62f766..48184956432 100644 --- a/include/lbann/callbacks/callback_variable_minibatch.hpp +++ b/include/lbann/callbacks/callback_variable_minibatch.hpp @@ -44,7 +44,7 @@ class lbann_callback_variable_minibatch : public lbann_callback { lbann_callback_variable_minibatch( const lbann_callback_variable_minibatch&) = default; lbann_callback_variable_minibatch& operator=( - const lbann_callback_variable_minibatch&) = default; + const lbann_callback_variable_minibatch&) = delete; /// Set the initial mini-batch size. void on_train_begin(model *m) override; /// Potentially change the mini-batch size. @@ -69,6 +69,7 @@ class lbann_callback_variable_minibatch : public lbann_callback { void change_learning_rate(model *m, float new_lr) const; /// Get the current learning rate (assumes every layer has the same one). float get_current_learning_rate(model *m) const; + /// Initial mini-batch size. const int m_starting_mbsize; /** @@ -94,13 +95,14 @@ class lbann_callback_step_minibatch : public lbann_callback_variable_minibatch { int ramp_time = 0); lbann_callback_step_minibatch(const lbann_callback_step_minibatch&) = default; lbann_callback_step_minibatch& operator=( - const lbann_callback_step_minibatch&) = default; + const lbann_callback_step_minibatch&) = delete; lbann_callback_step_minibatch* copy() const override { return new lbann_callback_step_minibatch(*this); } std::string name() const override { return "step minibatch"; } protected: bool schedule(model *m, int& new_mbsize, float& new_lr, int& ramp_time) override; + /// Number of epochs between mini-batch size increases. int m_step; /// Number of steps to ramp the learning rate over. 
@@ -128,7 +130,7 @@ class lbann_callback_minibatch_schedule : public lbann_callback_variable_minibat lbann_callback_minibatch_schedule( const lbann_callback_minibatch_schedule&) = default; lbann_callback_minibatch_schedule& operator=( - const lbann_callback_minibatch_schedule&) = default; + const lbann_callback_minibatch_schedule&) = delete; lbann_callback_minibatch_schedule* copy() const override { return new lbann_callback_minibatch_schedule(*this); } diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index 2e09b555574..f178f5953f6 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -175,8 +175,8 @@ class data_reader_jag_conduit : public generic_data_reader { /// Set every reader instances in a model to have an independent index list void set_list_per_model(bool flag) { m_list_per_model = flag; }; - bool has_list_per_model() const { return m_list_per_model; } - bool has_list_per_trainer() const { return m_list_per_trainer; } + bool has_list_per_model() const override { return m_list_per_model; } + bool has_list_per_trainer() const override { return m_list_per_trainer; } /// Fetch data of a mini-batch or reuse it from the cache of the leading reader @@ -361,12 +361,12 @@ /// Obtain image data std::vector< std::vector<DataType> > get_image_data(const size_t i, conduit::Node& sample) const; - bool data_store_active() const { + bool data_store_active() const override { bool flag = generic_data_reader::data_store_active(); return (m_data_store != nullptr && flag); } - bool priming_data_store() const { + bool priming_data_store() const override { bool flag = generic_data_reader::priming_data_store(); return (m_data_store != nullptr && flag); } diff --git a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp index 7d7cd00bf93..414e177b6c9 100644 --- a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp +++ b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp @@ -73,7 +73,7 @@ namespace lbann { const std::vector<int> get_data_dims() const override { return m_data_dims; } protected: - void preload_data_store(); + void preload_data_store() override; bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; diff --git a/include/lbann/layers/activations/elu.hpp b/include/lbann/layers/activations/elu.hpp index 52f797488be..8ca94393c77 100644 --- a/include/lbann/layers/activations/elu.hpp +++ b/include/lbann/layers/activations/elu.hpp @@ -57,7 +57,7 @@ class elu_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = Layer::get_description(); desc.add("alpha", m_alpha); return desc; } diff --git a/include/lbann/layers/activations/leaky_relu.hpp b/include/lbann/layers/activations/leaky_relu.hpp index 0e576117d3c..0c1f7d8f852 100644 --- a/include/lbann/layers/activations/leaky_relu.hpp +++ b/include/lbann/layers/activations/leaky_relu.hpp @@ -57,7 +57,7 @@ class leaky_relu_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = Layer::get_description(); desc.add("Negative
slope", m_negative_slope); return desc; } diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index 3dfa79edb79..e3539d99638 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -129,7 +129,7 @@ class generic_input_layer : public io_layer { std::string get_type() const override { return "generic_input"; } description get_description() const override { - auto&& desc = io_layer::get_description(); + auto desc = io_layer::get_description(); desc.add("Buffer", m_io_buffers[0]->get_type()); desc.add("Background I/O", this->m_model->background_io_activity_allowed()); return desc; diff --git a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp index afa3046086b..ecf417c24bb 100644 --- a/include/lbann/layers/learning/base_convolution.hpp +++ b/include/lbann/layers/learning/base_convolution.hpp @@ -187,7 +187,7 @@ class base_convolution_layer : public Layer { } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = Layer::get_description(); std::ostringstream ss; // Convolution dimensions diff --git a/include/lbann/layers/learning/convolution.hpp b/include/lbann/layers/learning/convolution.hpp index 9a7cf276a5d..b1afa3e956d 100644 --- a/include/lbann/layers/learning/convolution.hpp +++ b/include/lbann/layers/learning/convolution.hpp @@ -123,7 +123,7 @@ class convolution_layer : public base_convolution_layer { } - std::vector get_kernel_dims() const { + std::vector get_kernel_dims() const override { std::vector dims; dims.push_back(this->m_output_channels); dims.push_back(this->get_input_dims()[0] / this->m_groups); diff --git a/include/lbann/layers/learning/deconvolution.hpp b/include/lbann/layers/learning/deconvolution.hpp index f3c1f7bdd9e..56e962fb8d6 100644 --- a/include/lbann/layers/learning/deconvolution.hpp +++ b/include/lbann/layers/learning/deconvolution.hpp @@ -142,7 +142,7 @@ class deconvolution_layer : public base_convolution_layer { protected: - std::vector get_kernel_dims() const { + std::vector get_kernel_dims() const override { std::vector dims; dims.push_back(this->get_input_dims()[0]); dims.push_back(this->m_output_channels); diff --git a/include/lbann/layers/learning/embedding.hpp b/include/lbann/layers/learning/embedding.hpp index 67708af9e82..9e5d5c697f0 100644 --- a/include/lbann/layers/learning/embedding.hpp +++ b/include/lbann/layers/learning/embedding.hpp @@ -60,7 +60,7 @@ class embedding_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = Layer::get_description(); desc.add("Dictionary size", m_dictionary_size); desc.add("Embedding size", m_embedding_size); return desc; diff --git a/include/lbann/layers/learning/fully_connected.hpp b/include/lbann/layers/learning/fully_connected.hpp index f62c2318594..b89f4c04733 100644 --- a/include/lbann/layers/learning/fully_connected.hpp +++ b/include/lbann/layers/learning/fully_connected.hpp @@ -100,7 +100,7 @@ class fully_connected_layer : public learning_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = learning_layer::get_description(); + auto desc = learning_layer::get_description(); const auto& bias_str = (m_bias_scaling_factor == DataType(0) ? 
"disabled" : "enabled"); desc.add("Bias", bias_str); diff --git a/include/lbann/layers/loss/top_k_categorical_accuracy.hpp b/include/lbann/layers/loss/top_k_categorical_accuracy.hpp index 6e0389e5f73..4442419d4de 100644 --- a/include/lbann/layers/loss/top_k_categorical_accuracy.hpp +++ b/include/lbann/layers/loss/top_k_categorical_accuracy.hpp @@ -59,7 +59,7 @@ class top_k_categorical_accuracy_layer : public Layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = Layer::get_description(); desc.add("k", m_k); return desc; } diff --git a/include/lbann/layers/math/clamp.hpp b/include/lbann/layers/math/clamp.hpp index 4b79dc06c09..0d7a2264369 100644 --- a/include/lbann/layers/math/clamp.hpp +++ b/include/lbann/layers/math/clamp.hpp @@ -59,7 +59,7 @@ class clamp_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = Layer::get_description(); std::stringstream ss; ss << "[" << m_min << "," << m_max << "]"; desc.add("Range", ss.str()); diff --git a/include/lbann/layers/misc/covariance.hpp b/include/lbann/layers/misc/covariance.hpp index 8f31d12d545..23390b2eead 100644 --- a/include/lbann/layers/misc/covariance.hpp +++ b/include/lbann/layers/misc/covariance.hpp @@ -72,7 +72,7 @@ class covariance_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = Layer::get_description(); desc.add("Biased", m_biased); return desc; } diff --git a/include/lbann/layers/misc/variance.hpp b/include/lbann/layers/misc/variance.hpp index bc36581b73f..685f1f9c340 100644 --- a/include/lbann/layers/misc/variance.hpp +++ b/include/lbann/layers/misc/variance.hpp @@ -69,7 +69,7 @@ class variance_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto&& desc = Layer::get_description(); + auto desc = Layer::get_description(); desc.add("Biased", m_biased); return desc; } diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index b439c0e7ded..470bfa897c0 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -175,7 +175,7 @@ class batch_normalization_layer : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("Decay", m_decay); desc.add("Epsilon", m_epsilon); switch (m_stats_aggregation) { diff --git a/include/lbann/layers/regularizers/dropout.hpp b/include/lbann/layers/regularizers/dropout.hpp index 0de30c38fd1..2dfde7b21a0 100644 --- a/include/lbann/layers/regularizers/dropout.hpp +++ b/include/lbann/layers/regularizers/dropout.hpp @@ -118,7 +118,7 @@ class dropout : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("Keep probability", m_keep_prob); return desc; } diff --git 
a/include/lbann/layers/regularizers/local_response_normalization.hpp b/include/lbann/layers/regularizers/local_response_normalization.hpp index 23ff7051fab..b6cbbb014c7 100644 --- a/include/lbann/layers/regularizers/local_response_normalization.hpp +++ b/include/lbann/layers/regularizers/local_response_normalization.hpp @@ -128,7 +128,7 @@ class local_response_normalization_layer : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("alpha", m_alpha); desc.add("beta", m_beta); desc.add("k", m_k); diff --git a/include/lbann/layers/transform/bernoulli.hpp b/include/lbann/layers/transform/bernoulli.hpp index d3e827e6ee7..2f6fc0d4077 100644 --- a/include/lbann/layers/transform/bernoulli.hpp +++ b/include/lbann/layers/transform/bernoulli.hpp @@ -56,7 +56,7 @@ class bernoulli_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Probability", m_prob); return desc; } diff --git a/include/lbann/layers/transform/concatenation.hpp b/include/lbann/layers/transform/concatenation.hpp index 5355787269f..2630ea6a3d8 100644 --- a/include/lbann/layers/transform/concatenation.hpp +++ b/include/lbann/layers/transform/concatenation.hpp @@ -64,7 +64,7 @@ class concatenation_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Concatenation dimension", m_concat_dim); return desc; } diff --git a/include/lbann/layers/transform/constant.hpp b/include/lbann/layers/transform/constant.hpp index f4390884a56..3324e621ffd 100644 --- a/include/lbann/layers/transform/constant.hpp +++ b/include/lbann/layers/transform/constant.hpp @@ -50,7 +50,7 @@ class constant_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Value", m_value); return desc; } diff --git a/include/lbann/layers/transform/gaussian.hpp b/include/lbann/layers/transform/gaussian.hpp index 7ab43afc3a7..2de8e7e9af3 100644 --- a/include/lbann/layers/transform/gaussian.hpp +++ b/include/lbann/layers/transform/gaussian.hpp @@ -60,7 +60,7 @@ class gaussian_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Mean", m_mean); desc.add("Standard deviation", m_stdev); return desc; diff --git a/include/lbann/layers/transform/in_top_k.hpp b/include/lbann/layers/transform/in_top_k.hpp index 85abe8caba4..959fb37881e 100644 --- a/include/lbann/layers/transform/in_top_k.hpp +++ b/include/lbann/layers/transform/in_top_k.hpp @@ -57,7 +57,7 @@ class in_top_k_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = 
transform_layer::get_description(); desc.add("k", m_k); return desc; } diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp index abf6689aa82..e70bf9dd303 100644 --- a/include/lbann/layers/transform/pooling.hpp +++ b/include/lbann/layers/transform/pooling.hpp @@ -163,7 +163,7 @@ class pooling_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::stringstream ss; // Pool mode diff --git a/include/lbann/layers/transform/reduction.hpp b/include/lbann/layers/transform/reduction.hpp index 15df56534e1..0328ccf4c3e 100644 --- a/include/lbann/layers/transform/reduction.hpp +++ b/include/lbann/layers/transform/reduction.hpp @@ -67,7 +67,7 @@ class reduction_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::string mode_str; switch (m_mode) { case reduction_mode::SUM: mode_str = "sum"; break; diff --git a/include/lbann/layers/transform/slice.hpp b/include/lbann/layers/transform/slice.hpp index 62143bc32b8..98113accec9 100644 --- a/include/lbann/layers/transform/slice.hpp +++ b/include/lbann/layers/transform/slice.hpp @@ -84,7 +84,7 @@ class slice_layer : public transform_layer { std::vector get_slice_points() const { return m_slice_points; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Slice dimension", m_slice_dim); std::stringstream ss; for (size_t i = 0; i < m_slice_points.size(); ++i) { diff --git a/include/lbann/layers/transform/sort.hpp b/include/lbann/layers/transform/sort.hpp index 8d04e25a795..131297383bc 100644 --- a/include/lbann/layers/transform/sort.hpp +++ b/include/lbann/layers/transform/sort.hpp @@ -87,7 +87,7 @@ class sort_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Descending", m_descending); return desc; } diff --git a/include/lbann/layers/transform/uniform.hpp b/include/lbann/layers/transform/uniform.hpp index b10bbb03375..acaf26952a5 100644 --- a/include/lbann/layers/transform/uniform.hpp +++ b/include/lbann/layers/transform/uniform.hpp @@ -61,7 +61,7 @@ class uniform_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::stringstream ss; ss << "[" << m_min << "," << m_max << ")"; desc.add("Range", ss.str()); diff --git a/include/lbann/layers/transform/weighted_sum.hpp b/include/lbann/layers/transform/weighted_sum.hpp index 5f77caeaa9a..fe1e367be98 100644 --- a/include/lbann/layers/transform/weighted_sum.hpp +++ b/include/lbann/layers/transform/weighted_sum.hpp @@ -55,7 +55,7 @@ class weighted_sum_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto&& desc = transform_layer::get_description(); + auto desc = 
transform_layer::get_description(); std::stringstream ss; for (size_t i = 0; i < m_scaling_factors.size(); ++i) { ss << (i > 0 ? ", " : "") << m_scaling_factors[i]; diff --git a/include/lbann/transforms/normalize.hpp b/include/lbann/transforms/normalize.hpp index 259d3d3ec12..3c21a86afbd 100644 --- a/include/lbann/transforms/normalize.hpp +++ b/include/lbann/transforms/normalize.hpp @@ -54,7 +54,7 @@ class normalize : public transform { std::string get_type() const override { return "normalize"; } - bool supports_non_inplace() const { return true; } + bool supports_non_inplace() const override { return true; } void apply(utils::type_erased_matrix& data, std::vector& dims) override; void apply(utils::type_erased_matrix& data, CPUMat& out, diff --git a/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp b/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp index ea74b6c29fa..59a02fc78fe 100644 --- a/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp +++ b/include/lbann/transforms/repack_HWC_to_CHW_layout.hpp @@ -42,7 +42,7 @@ class repack_HWC_to_CHW_layout : public transform { std::string get_type() const override { return "to_lbann_layout"; } - bool supports_non_inplace() const { return true; } + bool supports_non_inplace() const override { return true; } void apply(utils::type_erased_matrix& data, std::vector& dims) override; diff --git a/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp b/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp index 2f01d08fbe3..385dd1dc446 100644 --- a/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp +++ b/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp @@ -49,12 +49,12 @@ class normalize_to_lbann_layout : public transform { LBANN_ERROR("Normalize mean and std have different numbers of channels."); } } - + transform* copy() const override { return new normalize_to_lbann_layout(*this); } std::string get_type() const override { return "normalize_to_lbann_layout"; } - bool supports_non_inplace() const { return true; } + bool supports_non_inplace() const override { return true; } void apply(utils::type_erased_matrix& data, std::vector& dims) override; diff --git a/include/lbann/transforms/vision/to_lbann_layout.hpp b/include/lbann/transforms/vision/to_lbann_layout.hpp index d342e39be0b..27610f4e094 100644 --- a/include/lbann/transforms/vision/to_lbann_layout.hpp +++ b/include/lbann/transforms/vision/to_lbann_layout.hpp @@ -43,7 +43,7 @@ class to_lbann_layout : public transform { std::string get_type() const override { return "to_lbann_layout"; } - bool supports_non_inplace() const { return true; } + bool supports_non_inplace() const override { return true; } void apply(utils::type_erased_matrix& data, std::vector& dims) override; diff --git a/include/lbann/weights/initializer.hpp b/include/lbann/weights/initializer.hpp index 84f696d9554..41fd8b9bf59 100644 --- a/include/lbann/weights/initializer.hpp +++ b/include/lbann/weights/initializer.hpp @@ -60,8 +60,8 @@ class constant_initializer : public weights_initializer { constant_initializer* copy() const override { return new constant_initializer(*this); } - std::string get_type() const { return "constant"; } - description get_description() const; + std::string get_type() const override { return "constant"; } + description get_description() const override; void fill(AbsDistMat& matrix) override; private: @@ -83,7 +83,7 @@ class value_initializer : public weights_initializer { value_initializer* copy() const override { return new value_initializer(*this); } - 
std::string get_type() const { return "value"; } + std::string get_type() const override { return "value"; } void fill(AbsDistMat& matrix) override; private: @@ -102,8 +102,8 @@ class uniform_initializer : public weights_initializer { uniform_initializer* copy() const override { return new uniform_initializer(*this); } - std::string get_type() const { return "uniform"; } - description get_description() const; + std::string get_type() const override{ return "uniform"; } + description get_description() const override; void fill(AbsDistMat& matrix) override; private: @@ -126,8 +126,8 @@ class normal_initializer : public weights_initializer { normal_initializer* copy() const override { return new normal_initializer(*this); } - std::string get_type() const { return "normal"; } - description get_description() const; + std::string get_type() const override { return "normal"; } + description get_description() const override; void fill(AbsDistMat& matrix) override; private: diff --git a/include/lbann/weights/variance_scaling_initializers.hpp b/include/lbann/weights/variance_scaling_initializers.hpp index c6256cfe956..16a5359fbb6 100644 --- a/include/lbann/weights/variance_scaling_initializers.hpp +++ b/include/lbann/weights/variance_scaling_initializers.hpp @@ -45,7 +45,7 @@ namespace lbann { class variance_scaling_initializer : public weights_initializer { public: variance_scaling_initializer(probability_distribution dist); - description get_description() const; + description get_description() const override; void fill(AbsDistMat& matrix) override; /** Set fan-in parameter. */ @@ -78,7 +78,7 @@ class glorot_initializer : public variance_scaling_initializer { glorot_initializer* copy() const override { return new glorot_initializer(*this); } - std::string get_type() const { return "Glorot"; } + std::string get_type() const override { return "Glorot"; } protected: DataType get_variance(El::Int fan_in, El::Int fan_out) override; }; @@ -91,7 +91,7 @@ class he_initializer : public variance_scaling_initializer { he_initializer* copy() const override { return new he_initializer(*this); } - std::string get_type() const { return "He"; } + std::string get_type() const override { return "He"; } protected: DataType get_variance(El::Int fan_in, El::Int fan_out) override; }; @@ -104,7 +104,7 @@ class lecun_initializer : public variance_scaling_initializer { lecun_initializer* copy() const override { return new lecun_initializer(*this); } - std::string get_type() const { return "LeCun"; } + std::string get_type() const override { return "LeCun"; } protected: DataType get_variance(El::Int fan_in, El::Int fan_out) override; }; diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index 2764162b014..fda776f30f5 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -1319,7 +1319,7 @@ data_reader_jag_conduit::create_datum_views(CPUMat& X, const std::vector El::View(X_v[i], X, El::IR(h, h_end), El::IR(mb_idx, mb_idx + 1)); h = h_end; } - return std::move(X_v); + return X_v; } bool data_reader_jag_conduit::fetch(CPUMat& X, int data_id, conduit::Node& sample, int mb_idx, int tid, diff --git a/src/optimizers/adagrad.cpp b/src/optimizers/adagrad.cpp index 49198956f82..ab17e618b67 100644 --- a/src/optimizers/adagrad.cpp +++ b/src/optimizers/adagrad.cpp @@ -45,7 +45,7 @@ adagrad& adagrad::operator=(const adagrad& other) { } description adagrad::get_description() const { - auto&& desc = optimizer::get_description(); + 
auto desc = optimizer::get_description(); desc.add("eps", m_eps); return desc; } diff --git a/src/optimizers/adam.cpp b/src/optimizers/adam.cpp index 37a6c912f5c..2b01c384e92 100644 --- a/src/optimizers/adam.cpp +++ b/src/optimizers/adam.cpp @@ -62,7 +62,7 @@ adam& adam::operator=(const adam& other) { } description adam::get_description() const { - auto&& desc = optimizer::get_description(); + auto desc = optimizer::get_description(); desc.add("beta1", m_beta1); desc.add("beta2", m_beta2); desc.add("eps", m_eps); diff --git a/src/optimizers/hypergradient_adam.cpp b/src/optimizers/hypergradient_adam.cpp index 0da3b9852bb..b8afe018dce 100644 --- a/src/optimizers/hypergradient_adam.cpp +++ b/src/optimizers/hypergradient_adam.cpp @@ -72,7 +72,7 @@ hypergradient_adam& hypergradient_adam::operator=(const hypergradient_adam& othe } description hypergradient_adam::get_description() const { - auto&& desc = optimizer::get_description(); + auto desc = optimizer::get_description(); desc.add("Hypergradient learning rate", m_hyper_learning_rate); desc.add("beta1", m_beta1); desc.add("beta2", m_beta2); diff --git a/src/optimizers/rmsprop.cpp b/src/optimizers/rmsprop.cpp index 870af9f0470..23aa9193e6a 100644 --- a/src/optimizers/rmsprop.cpp +++ b/src/optimizers/rmsprop.cpp @@ -52,7 +52,7 @@ rmsprop& rmsprop::operator=(const rmsprop& other) { } description rmsprop::get_description() const { - auto&& desc = optimizer::get_description(); + auto desc = optimizer::get_description(); desc.add("Decay rate", m_decay_rate); desc.add("eps", m_eps); return desc; diff --git a/src/optimizers/sgd.cpp b/src/optimizers/sgd.cpp index 147a7ee2937..89c47e5da65 100644 --- a/src/optimizers/sgd.cpp +++ b/src/optimizers/sgd.cpp @@ -53,7 +53,7 @@ sgd& sgd::operator=(const sgd& other) { } description sgd::get_description() const { - auto&& desc = optimizer::get_description(); + auto desc = optimizer::get_description(); desc.add("Momentum", m_momentum); desc.add("Nesterov acceleration", m_nesterov); return desc; diff --git a/src/proto/init_image_data_readers.cpp b/src/proto/init_image_data_readers.cpp index 58be634aeb0..6f68213c8dd 100644 --- a/src/proto/init_image_data_readers.cpp +++ b/src/proto/init_image_data_readers.cpp @@ -250,7 +250,7 @@ void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_dat } reader->set_transform_pipeline( - std::move(proto::construct_transform_pipeline(pb_readme))); + proto::construct_transform_pipeline(pb_readme)); if (channels == 0) { channels = 3; @@ -291,7 +291,7 @@ void init_org_image_data_reader(const lbann_data::Reader& pb_readme, const bool } reader->set_transform_pipeline( - std::move(proto::construct_transform_pipeline(pb_readme))); + proto::construct_transform_pipeline(pb_readme)); } } diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 165732638a2..089fd1c39ad 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -345,7 +345,7 @@ void init_data_readers( if (set_transform_pipeline) { reader->set_transform_pipeline( - std::move(proto::construct_transform_pipeline(readme))); + proto::construct_transform_pipeline(readme)); } if (readme.data_filename() != "") { diff --git a/src/weights/initializer.cpp b/src/weights/initializer.cpp index 01c49c72db3..539fb157d9b 100644 --- a/src/weights/initializer.cpp +++ b/src/weights/initializer.cpp @@ -35,7 +35,7 @@ description weights_initializer::get_description() const { } description constant_initializer::get_description() const { - auto&& desc = weights_initializer::get_description(); + 
auto desc = weights_initializer::get_description(); desc.add("Value", m_value); return desc; } @@ -91,7 +91,7 @@ void value_initializer::fill(AbsDistMat& matrix) { } description uniform_initializer::get_description() const { - auto&& desc = weights_initializer::get_description(); + auto desc = weights_initializer::get_description(); std::stringstream ss; ss << "[" << m_min << "," << m_max << ")"; desc.add("Range", ss.str()); @@ -104,7 +104,7 @@ void uniform_initializer::fill(AbsDistMat& matrix) { } description normal_initializer::get_description() const { - auto&& desc = weights_initializer::get_description(); + auto desc = weights_initializer::get_description(); desc.add("Mean", m_mean); desc.add("Standard deviation", m_standard_deviation); return desc; diff --git a/src/weights/variance_scaling_initializers.cpp b/src/weights/variance_scaling_initializers.cpp index 278a81f5a11..5baad441146 100644 --- a/src/weights/variance_scaling_initializers.cpp +++ b/src/weights/variance_scaling_initializers.cpp @@ -45,7 +45,7 @@ variance_scaling_initializer::variance_scaling_initializer(probability_distribut } description variance_scaling_initializer::get_description() const { - auto&& desc = weights_initializer::get_description(); + auto desc = weights_initializer::get_description(); std::string dist_str; switch (m_prob_dist) { case probability_distribution::gaussian: From f13626779b002c4cb7c399e8d63e7b11f3d673fb Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Thu, 18 Jul 2019 18:02:31 -0700 Subject: [PATCH 130/634] remove log_msg macro; replace with LBANN_WARNING --- include/lbann/base.hpp | 18 ------------------ src/data_readers/data_reader_jag_conduit.cpp | 10 ++++++---- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/include/lbann/base.hpp b/include/lbann/base.hpp index a4baa63c443..76e38fb0f3f 100644 --- a/include/lbann/base.hpp +++ b/include/lbann/base.hpp @@ -189,24 +189,6 @@ static void __attribute__((used)) _print_local_matrix_dims(AbsMat *m, const char } #define PRINT_LOCAL_MATRIX_DIMS(x) _print_local_matrix_dims(x, #x); -// FIXME -#if 1 -// __FILE__ -#define log_msg(...) {\ - char str[256];\ - sprintf(str, __VA_ARGS__);\ - std::cout << "[" << m_comm->get_trainer_rank() << "." << m_comm->get_rank_in_trainer() << "][" << __FUNCTION__ << "][Line " << __LINE__ << "]" << str << std::endl; \ - } -#define log_simple_msg(...) {\ - char str[256];\ - sprintf(str, __VA_ARGS__);\ - std::cout << "[" << __FUNCTION__ << "][Line " << __LINE__ << "]" << str << std::endl; \ - } -#else -#define log_msg(...) -#define log_simple_msg(...) 
-#endif - #define LBANN_MAKE_STR(x) _LBANN_MAKE_STR(x) #define _LBANN_MAKE_STR(x) #x diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index fda776f30f5..56c1cd2ee52 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -831,8 +831,10 @@ void data_reader_jag_conduit::load() { /// Merge all of the sample lists m_sample_list.all_gather_packed_lists(*m_comm); if (opts->has_string("write_sample_list") && m_comm->am_trainer_master()) { - const std::string msg = " writing sample list " + sample_list_file; - log_msg(msg.c_str()); + { + const std::string msg = " writing sample list " + sample_list_file; + LBANN_WARNING(msg); + } std::stringstream s; std::string basename = get_basename_without_ext(sample_list_file); std::string ext = get_ext_name(sample_list_file); @@ -866,7 +868,7 @@ void data_reader_jag_conduit::preload_data_store() { (opts->get_bool("ltfb_verbose") && get_comm()->am_trainer_master())) { std::stringstream msg; msg << " for role: " << get_role() << " starting preload"; - log_msg(msg.str().c_str()); + LBANN_WARNING(msg.str()); } for (size_t idx=0; idx < m_shuffled_indices.size(); idx++) { @@ -897,7 +899,7 @@ (opts->get_bool("ltfb_verbose") && get_comm()->am_trainer_master())) { std::stringstream msg; msg << " loading data for role: " << get_role() << " took " << get_time() - tm1 << "s"; - log_msg(msg.str().c_str()); + LBANN_WARNING(msg.str()); } } From 088deb9eed6e389fbaed424ab2165b860b7ec8be Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Thu, 18 Jul 2019 18:14:24 -0700 Subject: [PATCH 131/634] address comments on PR --- include/lbann/callbacks/callback_checksmall.hpp | 4 ++-- include/lbann/callbacks/callback_variable_minibatch.hpp | 4 ++-- src/callbacks/callback_checksmall.cpp | 3 +++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/lbann/callbacks/callback_checksmall.hpp b/include/lbann/callbacks/callback_checksmall.hpp index 892ef3282f3..10907a07051 100644 --- a/include/lbann/callbacks/callback_checksmall.hpp +++ b/include/lbann/callbacks/callback_checksmall.hpp @@ -49,7 +49,7 @@ class lbann_callback_checksmall : public lbann_callback { lbann_callback_checksmall() : lbann_callback() {} lbann_callback_checksmall(const lbann_callback_checksmall&) = default; lbann_callback_checksmall& operator=( - const lbann_callback_checksmall&) = delete; + const lbann_callback_checksmall&) = default; lbann_callback_checksmall* copy() const override { return new lbann_callback_checksmall(*this); } @@ -62,7 +62,7 @@ class lbann_callback_checksmall : public lbann_callback { std::string name() const override { return "checksmall"; } private: /** Smallest allowable value. */ - const DataType m_threshold = std::sqrt(std::numeric_limits<DataType>::min()); + static const DataType m_threshold; /** Return true if there are no problems with m. */ bool is_good(const AbsDistMat& m); }; diff --git a/include/lbann/callbacks/callback_variable_minibatch.hpp b/include/lbann/callbacks/callback_variable_minibatch.hpp index 48184956432..8828d2b2936 100644 --- a/include/lbann/callbacks/callback_variable_minibatch.hpp +++ b/include/lbann/callbacks/callback_variable_minibatch.hpp @@ -44,7 +44,7 @@ class lbann_callback_variable_minibatch : public lbann_callback { lbann_callback_variable_minibatch( const lbann_callback_variable_minibatch&) = default; lbann_callback_variable_minibatch& operator=( - const lbann_callback_variable_minibatch&) = delete; + const lbann_callback_variable_minibatch&) = default; /// Set the initial mini-batch size. void on_train_begin(model *m) override; /// Potentially change the mini-batch size. @@ -71,7 +71,7 @@ float get_current_learning_rate(model *m) const; /// Initial mini-batch size. - const int m_starting_mbsize; + int m_starting_mbsize; /** * The current mini-batch size for this epoch. * This is kept separately from the model's get_current_mini_batch_size() diff --git a/src/callbacks/callback_checksmall.cpp b/src/callbacks/callback_checksmall.cpp index e310c64b6da..a4fc8e93fcc 100644 --- a/src/callbacks/callback_checksmall.cpp +++ b/src/callbacks/callback_checksmall.cpp @@ -85,4 +85,7 @@ bool lbann_callback_checksmall::is_good(const AbsDistMat& m) { return true; } +const DataType lbann_callback_checksmall::m_threshold + = std::sqrt(std::numeric_limits<DataType>::min()); + } // namespace lbann From 0c5258966f7375e2ba29d93c8067a6f3f49511c4 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 19 Jul 2019 23:12:02 -0700 Subject: [PATCH 132/634] Incomplete first sweep at callback re-factory-ing. This will compile and run for all but a few callbacks. The old factory function used a "layer_list" and a "weights_list" argument to select objects that would then be passed into the model. This detail has been glossed over for now as it is logically inconsistent (these objects should be grabbed in the context of a model, i.e., during setup() or just-in-time in the callback functions themselves). The fix for this will be coming soon.
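For orientation, a minimal, self-contained sketch of the builder-function dispatch this patch is moving toward. Only the builder signature (a protobuf message plus an optional lbann_summary*) and the ADD_DEFAULT_CALLBACK_BUILDER shortcut mirror the diff below; the registry map, the stand-in types, and every name in the sketch (proto_message, builder_registry, checknan_callback, build_checknan) are hypothetical illustrations, not code from this patch, and it uses std::make_unique rather than LBANN's make_unique shim:

    #include <functional>
    #include <iostream>
    #include <memory>
    #include <string>
    #include <unordered_map>

    // Stand-ins for the real lbann_callback, lbann_summary, and
    // google::protobuf::Message types, so the sketch compiles on its own.
    struct lbann_callback { virtual ~lbann_callback() = default; };
    struct lbann_summary {};
    struct proto_message {};

    // Every builder has the same shape: take the callback's protobuf
    // message and an optional summarizer, return the constructed callback.
    using callback_builder = std::function<
      std::unique_ptr<lbann_callback>(const proto_message&, lbann_summary*)>;

    // Hypothetical registry keyed by callback name; a factory can then
    // dispatch by lookup instead of one long if/else chain over messages.
    std::unordered_map<std::string, callback_builder>& builder_registry() {
      static std::unordered_map<std::string, callback_builder> registry;
      return registry;
    }

    // A default-constructible callback needs only the one-line builder
    // that ADD_DEFAULT_CALLBACK_BUILDER stamps out in the real headers.
    struct checknan_callback : lbann_callback {};
    std::unique_ptr<lbann_callback>
    build_checknan(const proto_message&, lbann_summary*) {
      return std::make_unique<checknan_callback>();
    }

    int main() {
      builder_registry().emplace("checknan", build_checknan);
      proto_message msg;
      // Construct a callback by name, forwarding the message and summarizer.
      auto cb = builder_registry().at("checknan")(msg, nullptr);
      std::cout << "constructed: " << (cb ? "ok" : "null") << "\n";
    }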
--- include/lbann/callbacks/callback.hpp | 8 + .../callbacks/callback_check_dataset.hpp | 4 + .../callbacks/callback_check_gradients.hpp | 5 + .../lbann/callbacks/callback_check_metric.hpp | 9 +- include/lbann/callbacks/callback_checknan.hpp | 4 + .../lbann/callbacks/callback_checkpoint.hpp | 5 + .../lbann/callbacks/callback_checksmall.hpp | 4 + .../callbacks/callback_confusion_matrix.hpp | 5 + include/lbann/callbacks/callback_debug.hpp | 5 + include/lbann/callbacks/callback_debug_io.hpp | 5 + .../callbacks/callback_dump_error_signals.hpp | 5 + .../callbacks/callback_dump_gradients.hpp | 5 + ...callback_dump_minibatch_sample_indices.hpp | 5 + .../lbann/callbacks/callback_dump_outputs.hpp | 9 +- .../lbann/callbacks/callback_dump_weights.hpp | 5 + .../callbacks/callback_gpu_memory_usage.hpp | 4 + include/lbann/callbacks/callback_hang.hpp | 10 +- include/lbann/callbacks/callback_imcomm.hpp | 5 + include/lbann/callbacks/callback_io.hpp | 5 + .../callbacks/callback_learning_rate.hpp | 30 + include/lbann/callbacks/callback_ltfb.hpp | 7 +- include/lbann/callbacks/callback_mixup.hpp | 17 +- .../lbann/callbacks/callback_perturb_adam.hpp | 5 + .../callbacks/callback_perturb_dropout.hpp | 5 + include/lbann/callbacks/callback_print.hpp | 5 + .../callbacks/callback_replace_weights.hpp | 5 + .../lbann/callbacks/callback_save_images.hpp | 5 + .../lbann/callbacks/callback_save_model.hpp | 5 + .../callbacks/callback_save_topk_models.hpp | 13 +- include/lbann/callbacks/callback_summary.hpp | 5 + .../lbann/callbacks/callback_sync_layers.hpp | 5 + .../callbacks/callback_sync_selected.hpp | 5 + include/lbann/callbacks/callback_timer.hpp | 5 + .../callbacks/callback_variable_minibatch.hpp | 13 +- include/lbann/callbacks/profiler.hpp | 5 + include/lbann/proto/factories.hpp | 9 +- include/lbann/proto/proto_common.hpp | 26 + include/lbann/proto/proto_helpers.hpp | 62 ++ src/callbacks/CMakeLists.txt | 1 + src/callbacks/callback_check_gradients.cpp | 13 + src/callbacks/callback_check_metric.cpp | 16 + src/callbacks/callback_checkpoint.cpp | 14 + src/callbacks/callback_confusion_matrix.cpp | 10 + src/callbacks/callback_debug.cpp | 15 + src/callbacks/callback_debug_io.cpp | 36 +- src/callbacks/callback_dump_error_signals.cpp | 10 + src/callbacks/callback_dump_gradients.cpp | 14 +- ...callback_dump_minibatch_sample_indices.cpp | 12 + src/callbacks/callback_dump_outputs.cpp | 21 + src/callbacks/callback_dump_weights.cpp | 8 + src/callbacks/callback_hang.cpp | 54 ++ src/callbacks/callback_imcomm.cpp | 20 + src/callbacks/callback_io.cpp | 12 + src/callbacks/callback_learning_rate.cpp | 95 ++- src/callbacks/callback_ltfb.cpp | 17 + src/callbacks/callback_mixup.cpp | 15 + src/callbacks/callback_perturb_adam.cpp | 16 + src/callbacks/callback_perturb_dropout.cpp | 14 +- src/callbacks/callback_print.cpp | 9 + src/callbacks/callback_replace_weights.cpp | 17 + src/callbacks/callback_save_images.cpp | 15 + src/callbacks/callback_save_model.cpp | 19 + src/callbacks/callback_save_topk_models.cpp | 28 +- src/callbacks/callback_summary.cpp | 11 + src/callbacks/callback_sync_layers.cpp | 10 + src/callbacks/callback_sync_selected.cpp | 42 ++ src/callbacks/callback_timer.cpp | 6 + src/callbacks/callback_variable_minibatch.cpp | 27 + src/callbacks/profiler.cpp | 9 + src/proto/CMakeLists.txt | 1 + src/proto/factories/callback_factory.cpp | 575 +++++------------- src/proto/factories/model_factory.cpp | 10 +- src/proto/lbann.proto | 86 +-- src/proto/proto_common.cpp | 26 + src/proto/proto_helpers.cpp | 69 +++ 75 files changed, 1213 
insertions(+), 499 deletions(-) create mode 100644 include/lbann/proto/proto_helpers.hpp create mode 100644 src/callbacks/callback_hang.cpp create mode 100644 src/proto/proto_helpers.cpp diff --git a/include/lbann/callbacks/callback.hpp b/include/lbann/callbacks/callback.hpp index fae45448bb8..3392d352915 100644 --- a/include/lbann/callbacks/callback.hpp +++ b/include/lbann/callbacks/callback.hpp @@ -34,6 +34,14 @@ #include "lbann/models/model.hpp" #include "lbann/layers/layer.hpp" +// A utility macro for easily adding default-constructed sub-class +// builders. +#define ADD_DEFAULT_CALLBACK_BUILDER(Class, FunctionName) \ + inline std::unique_ptr<lbann_callback> FunctionName( \ + const google::protobuf::Message&, lbann_summary*) { \ + return make_unique<Class>(); \ + } + namespace lbann { /** @class lbann_callback diff --git a/include/lbann/callbacks/callback_check_dataset.hpp b/include/lbann/callbacks/callback_check_dataset.hpp index 09ce25d723f..d8b513eb9bf 100644 --- a/include/lbann/callbacks/callback_check_dataset.hpp +++ b/include/lbann/callbacks/callback_check_dataset.hpp @@ -68,6 +68,10 @@ class lbann_callback_check_dataset : public lbann_callback { std::set<long> testing_set; }; +// Builder function +ADD_DEFAULT_CALLBACK_BUILDER( + lbann_callback_check_dataset, build_callback_check_dataset_from_pbuf); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_gradients.hpp b/include/lbann/callbacks/callback_check_gradients.hpp index fb5d531ff83..1a46d7ea986 100644 --- a/include/lbann/callbacks/callback_check_gradients.hpp +++ b/include/lbann/callbacks/callback_check_gradients.hpp @@ -73,6 +73,11 @@ class lbann_callback_check_gradients : public lbann_callback { }; +// Builder function +std::unique_ptr<lbann_callback> +build_callback_check_gradients_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_metric.hpp b/include/lbann/callbacks/callback_check_metric.hpp index 8b094c8c395..0652629c89c 100644 --- a/include/lbann/callbacks/callback_check_metric.hpp +++ b/include/lbann/callbacks/callback_check_metric.hpp @@ -43,7 +43,9 @@ class lbann_callback_check_metric : public lbann_callback { EvalType lower_bound, EvalType upper_bound, bool error_on_failure); - lbann_callback_check_metric* copy() const override { return new lbann_callback_check_metric(*this); } + lbann_callback_check_metric* copy() const override { + return new lbann_callback_check_metric(*this); + } std::string name() const override { return "check metric"; } void on_epoch_end(model* m) override { check_metric(*m); } @@ -73,6 +75,11 @@ }; +// Builder function +std::unique_ptr<lbann_callback> +build_callback_check_metric_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_checknan.hpp b/include/lbann/callbacks/callback_checknan.hpp index c45a7eee95c..30e90b92806 100644 --- a/include/lbann/callbacks/callback_checknan.hpp +++ b/include/lbann/callbacks/callback_checknan.hpp @@ -61,6 +61,10 @@ class lbann_callback_checknan : public lbann_callback { }; +// Builder function +ADD_DEFAULT_CALLBACK_BUILDER( + lbann_callback_checknan, build_callback_check_nan_from_pbuf); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECKNAN_HPP_INCLUDED diff --git
a/include/lbann/callbacks/callback_checkpoint.hpp b/include/lbann/callbacks/callback_checkpoint.hpp index ebeacdeaa7e..0638accc64e 100644 --- a/include/lbann/callbacks/callback_checkpoint.hpp +++ b/include/lbann/callbacks/callback_checkpoint.hpp @@ -202,6 +202,11 @@ static inline bool read_latest(std::string filename, int *epochLast, int *trainL return true; } +// Builder function +std::unique_ptr +build_callback_checkpoint_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_checksmall.hpp b/include/lbann/callbacks/callback_checksmall.hpp index 10907a07051..58ddf67faea 100644 --- a/include/lbann/callbacks/callback_checksmall.hpp +++ b/include/lbann/callbacks/callback_checksmall.hpp @@ -67,6 +67,10 @@ class lbann_callback_checksmall : public lbann_callback { bool is_good(const AbsDistMat& m); }; +// Builder function +ADD_DEFAULT_CALLBACK_BUILDER( + lbann_callback_checksmall, build_callback_check_small_from_pbuf); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECKSMALL_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_confusion_matrix.hpp b/include/lbann/callbacks/callback_confusion_matrix.hpp index b87dc8b24a0..c86b1b155b6 100644 --- a/include/lbann/callbacks/callback_confusion_matrix.hpp +++ b/include/lbann/callbacks/callback_confusion_matrix.hpp @@ -110,6 +110,11 @@ class lbann_callback_confusion_matrix : public lbann_callback { }; +// Builder function +std::unique_ptr +build_callback_confusion_matrix_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_debug.hpp b/include/lbann/callbacks/callback_debug.hpp index c342c7ad778..3eaf65a1485 100644 --- a/include/lbann/callbacks/callback_debug.hpp +++ b/include/lbann/callbacks/callback_debug.hpp @@ -103,6 +103,11 @@ class lbann_callback_debug : public lbann_callback { }; +// Builder function +std::unique_ptr +build_callback_debug_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_debug_io.hpp b/include/lbann/callbacks/callback_debug_io.hpp index ffaff0af567..aaefc9662b3 100644 --- a/include/lbann/callbacks/callback_debug_io.hpp +++ b/include/lbann/callbacks/callback_debug_io.hpp @@ -84,6 +84,11 @@ class lbann_callback_debug_io : public lbann_callback { int m_debug_lvl; /** Debugging level: 0 - epoch begin, 1 - fwd prop */ }; +// Builder function +std::unique_ptr +build_callback_debug_io_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_error_signals.hpp b/include/lbann/callbacks/callback_dump_error_signals.hpp index 0c5571d9597..456cb90f884 100644 --- a/include/lbann/callbacks/callback_dump_error_signals.hpp +++ b/include/lbann/callbacks/callback_dump_error_signals.hpp @@ -58,6 +58,11 @@ class lbann_callback_dump_error_signals : public lbann_callback { }; +// Builder function +std::unique_ptr +build_callback_dump_error_signals_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_gradients.hpp 
b/include/lbann/callbacks/callback_dump_gradients.hpp index b0a6d587446..4c2947feedd 100644 --- a/include/lbann/callbacks/callback_dump_gradients.hpp +++ b/include/lbann/callbacks/callback_dump_gradients.hpp @@ -68,6 +68,11 @@ class lbann_callback_dump_gradients : public lbann_callback { std::string m_basename; }; +// Builder function +std::unique_ptr +build_callback_dump_gradients_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp b/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp index 8840b1a83c5..0ad5e12b58c 100644 --- a/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp +++ b/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp @@ -73,6 +73,11 @@ class lbann_callback_dump_minibatch_sample_indices : public lbann_callback { std::string m_basename; }; +// Builder function +std::unique_ptr +build_callback_dump_mb_indices_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_outputs.hpp b/include/lbann/callbacks/callback_dump_outputs.hpp index cb0d2cdddab..44ff4f8b1a3 100644 --- a/include/lbann/callbacks/callback_dump_outputs.hpp +++ b/include/lbann/callbacks/callback_dump_outputs.hpp @@ -80,8 +80,8 @@ class lbann_callback_dump_outputs : public lbann_callback { void on_forward_prop_end(model* m, Layer* l) override { dump_outputs(*m, *l); } void on_evaluate_forward_prop_end(model* m, Layer* l) override { - if(m->get_step() % m_batch_interval == 0) { - dump_outputs(*m, *l); + if(m->get_step() % m_batch_interval == 0) { + dump_outputs(*m, *l); } } @@ -112,6 +112,11 @@ class lbann_callback_dump_outputs : public lbann_callback { }; +// Builder function +std::unique_ptr +build_callback_dump_outputs_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_weights.hpp b/include/lbann/callbacks/callback_dump_weights.hpp index 7edb2aacc20..462bd1f6ba9 100644 --- a/include/lbann/callbacks/callback_dump_weights.hpp +++ b/include/lbann/callbacks/callback_dump_weights.hpp @@ -65,6 +65,11 @@ class lbann_callback_dump_weights : public lbann_callback { void dump_weights(model *m, std::string s = ""); }; +// Builder function +std::unique_ptr +build_callback_dump_weights_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_gpu_memory_usage.hpp b/include/lbann/callbacks/callback_gpu_memory_usage.hpp index aa890efcc87..610d7d0bc66 100644 --- a/include/lbann/callbacks/callback_gpu_memory_usage.hpp +++ b/include/lbann/callbacks/callback_gpu_memory_usage.hpp @@ -46,6 +46,10 @@ class lbann_callback_gpu_memory_usage : public lbann_callback { std::string name() const override { return "GPU memory usage"; } }; +// Builder function +ADD_DEFAULT_CALLBACK_BUILDER( + lbann_callback_gpu_memory_usage, build_callback_gpu_memory_usage_from_pbuf); + } // namespace lbann #endif // __LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_hang.hpp b/include/lbann/callbacks/callback_hang.hpp index 
2ec4c68b835..afa6fccbd77 100644 --- a/include/lbann/callbacks/callback_hang.hpp +++ b/include/lbann/callbacks/callback_hang.hpp @@ -49,6 +49,9 @@ class lbann_callback_hang : public lbann_callback { lbann_callback_hang(const lbann_callback_hang&) = default; lbann_callback_hang& operator=(const lbann_callback_hang&) = default; lbann_callback_hang* copy() const override { return new lbann_callback_hang(*this); } + + void setup(model* m) override; + /// Hang on train begin. void on_train_begin(model* m) override { if (m_rank_to_hang == -1 || @@ -59,11 +62,16 @@ class lbann_callback_hang : public lbann_callback { } } std::string name() const override { return "hang"; } - protected: + private: /// The rank that will hang; -1 for every rank. int m_rank_to_hang; }; +// Builder function +std::unique_ptr +build_callback_hang_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_imcomm.hpp b/include/lbann/callbacks/callback_imcomm.hpp index fb52daa2bee..d75d0bb5e44 100644 --- a/include/lbann/callbacks/callback_imcomm.hpp +++ b/include/lbann/callbacks/callback_imcomm.hpp @@ -97,6 +97,11 @@ class lbann_callback_imcomm : public lbann_callback { /** returns a string representation of the weight_initialization */ std::string get_comm_type_name(lbann_callback_imcomm::comm_type m); +// Builder function +std::unique_ptr +build_callback_imcomm_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_io.hpp b/include/lbann/callbacks/callback_io.hpp index 2ed29430a05..fb788e2abe0 100644 --- a/include/lbann/callbacks/callback_io.hpp +++ b/include/lbann/callbacks/callback_io.hpp @@ -55,6 +55,11 @@ class lbann_callback_io : public lbann_callback { std::unordered_set m_layer_indices; }; +// Builder function +std::unique_ptr +build_callback_disp_io_stats_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_learning_rate.hpp b/include/lbann/callbacks/callback_learning_rate.hpp index 55dd090a7ea..badd3b3a380 100644 --- a/include/lbann/callbacks/callback_learning_rate.hpp +++ b/include/lbann/callbacks/callback_learning_rate.hpp @@ -113,6 +113,11 @@ class lbann_callback_step_learning_rate : public lbann_callback_learning_rate { float m_amt; }; +// Builder function +std::unique_ptr +build_callback_step_learning_rate_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + /** * Decrease the learning rate by a fixed proportion when validation error stops * improving. @@ -151,6 +156,11 @@ class lbann_callback_adaptive_learning_rate : public lbann_callback_learning_rat bool m_adjust_learning_rate = false; }; +// Builder function +std::unique_ptr +build_callback_adaptive_learning_rate_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + /** * Decrease learning rate by a fixed amount at fixed times. */ @@ -186,6 +196,11 @@ class lbann_callback_drop_fixed_learning_rate : std::vector m_drop_epochs; }; +// Builder function +std::unique_ptr +build_callback_drop_fixed_learning_rate_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + /** * Linearly increase the learning rate to reach a target value over a * fixed number of epochs. 
@@ -229,6 +244,11 @@ class lbann_callback_linear_growth_learning_rate : int64_t m_delay; }; +// Builder function +std::unique_ptr +build_callback_linear_growth_learning_rate_from_pbuf( + const google::protobuf::Message&,lbann_summary*); + /** * Decrease the learning rate by polynomial policy * base_lr*(1 - i_cur/i_max)^p, where @@ -267,6 +287,11 @@ class lbann_callback_poly_learning_rate : public lbann_callback_learning_rate { float m_last_epoch_lr; }; +// Builder function +std::unique_ptr +build_callback_poly_learning_rate_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*); + /** * This implements an adaptive scheme for adjust each optimizer's * learning rate based on the ratio of the norms of its weights and @@ -292,6 +317,11 @@ class lbann_callback_optimizerwise_adaptive_learning_rate : public lbann_callbac float m_scale; }; +// Builder function +std::unique_ptr +build_callback_optimizerwise_adaptive_learning_rate_from_pbuf( + const google::protobuf::Message&,lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_ltfb.hpp b/include/lbann/callbacks/callback_ltfb.hpp index c0188bb4f4c..4c4d915db09 100644 --- a/include/lbann/callbacks/callback_ltfb.hpp +++ b/include/lbann/callbacks/callback_ltfb.hpp @@ -155,7 +155,7 @@ class lbann_callback_ltfb : public lbann_callback { /** Inter-trainer communication scheme. */ communication_algorithm m_comm_algo; - + /** Whether to exchange training hyperparameters between trainers */ bool m_exchange_hyperparameters; @@ -168,6 +168,11 @@ class lbann_callback_ltfb : public lbann_callback { }; +// Builder function +std::unique_ptr +build_callback_ltfb_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_mixup.hpp b/include/lbann/callbacks/callback_mixup.hpp index 3159afb9787..5aa917f686e 100644 --- a/include/lbann/callbacks/callback_mixup.hpp +++ b/include/lbann/callbacks/callback_mixup.hpp @@ -36,20 +36,20 @@ namespace lbann { /** * Apply mixup to named input layers. - * + * * See: - * + * * Zhang, H. et al. "mixup: Beyond Empirical Risk Minimization." ICLR, 2018. * * This implementation does mixup within a single batch, per the recommendation * within the paper. - * + * * This approach may create duplicate images, and so uses - * + * * lambda = max(lambda, 1 - lambda) - * + * * for the mixing value. - * + * * This recommendation comes from https://docs.fast.ai/callbacks.mixup.html * * The recommended default alpha (from the paper) is 0.4. 
@@ -76,6 +76,11 @@ class callback_mixup : public lbann_callback { float m_alpha; }; +// Builder function +std::unique_ptr +build_callback_mixup_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_MIXUP_HPP diff --git a/include/lbann/callbacks/callback_perturb_adam.hpp b/include/lbann/callbacks/callback_perturb_adam.hpp index 4a580da6ffc..dbb49b1a645 100644 --- a/include/lbann/callbacks/callback_perturb_adam.hpp +++ b/include/lbann/callbacks/callback_perturb_adam.hpp @@ -122,6 +122,11 @@ class lbann_callback_perturb_adam : public lbann_callback { }; +// Builder function +std::unique_ptr +build_callback_perturb_adam_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_perturb_dropout.hpp b/include/lbann/callbacks/callback_perturb_dropout.hpp index 7564db07b3b..b629b0688e6 100644 --- a/include/lbann/callbacks/callback_perturb_dropout.hpp +++ b/include/lbann/callbacks/callback_perturb_dropout.hpp @@ -76,6 +76,11 @@ class lbann_callback_perturb_dropout : public lbann_callback { }; +// Builder function +std::unique_ptr +build_callback_perturb_dropout_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_PERTURB_DROPOUT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_print.hpp b/include/lbann/callbacks/callback_print.hpp index 53c77d2a7a1..95b1e142e6d 100644 --- a/include/lbann/callbacks/callback_print.hpp +++ b/include/lbann/callbacks/callback_print.hpp @@ -58,6 +58,11 @@ class lbann_callback_print : public lbann_callback { }; +// Builder function +std::unique_ptr +build_callback_print_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_replace_weights.hpp b/include/lbann/callbacks/callback_replace_weights.hpp index 62bf033792c..27de4d96e97 100644 --- a/include/lbann/callbacks/callback_replace_weights.hpp +++ b/include/lbann/callbacks/callback_replace_weights.hpp @@ -66,6 +66,11 @@ class lbann_callback_replace_weights : public lbann_callback { }; +// Builder function +std::unique_ptr +build_callback_replace_weights_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_save_images.hpp b/include/lbann/callbacks/callback_save_images.hpp index 72d870f3fc1..7f13971d1d4 100644 --- a/include/lbann/callbacks/callback_save_images.hpp +++ b/include/lbann/callbacks/callback_save_images.hpp @@ -71,6 +71,11 @@ class lbann_callback_save_images : public lbann_callback { }; +// Builder function +std::unique_ptr +build_callback_save_images_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_save_model.hpp b/include/lbann/callbacks/callback_save_model.hpp index e75d8aa32ed..2993ccc09fd 100644 --- a/include/lbann/callbacks/callback_save_model.hpp +++ b/include/lbann/callbacks/callback_save_model.hpp @@ -79,6 +79,11 @@ class lbann_callback_save_model : public lbann_callback { void write_proto_text(const lbann_data::Model& proto, const std::string filename); }; +// Builder function +std::unique_ptr 
+build_callback_save_model_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_save_topk_models.hpp b/include/lbann/callbacks/callback_save_topk_models.hpp index 4d6bad2f240..e5ed4a4f17d 100644 --- a/include/lbann/callbacks/callback_save_topk_models.hpp +++ b/include/lbann/callbacks/callback_save_topk_models.hpp @@ -42,7 +42,7 @@ namespace lbann { */ class lbann_callback_save_topk_models : public lbann_callback_save_model { public: - lbann_callback_save_topk_models(std::string dir, int k, std::string metric_name, bool ascending_ordering=false) : + lbann_callback_save_topk_models(std::string dir, int k, std::string metric_name, bool ascending_ordering=false) : lbann_callback_save_model(dir,true), m_k(k),m_metric_name(metric_name),m_ascending_ordering(ascending_ordering) {} lbann_callback_save_topk_models(const lbann_callback_save_topk_models&) = default; lbann_callback_save_topk_models& operator=(const lbann_callback_save_topk_models&) = default; @@ -53,12 +53,17 @@ class lbann_callback_save_topk_models : public lbann_callback_save_model { private: /*determine if a trainer's model is in top k, computation done by trainer master processes*/ bool am_in_topk(model *m); - int m_k ; - std::string m_metric_name; - bool m_ascending_ordering; + int m_k ; + std::string m_metric_name; + bool m_ascending_ordering; }; +// Builder function +std::unique_ptr +build_callback_save_topk_models_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_summary.hpp b/include/lbann/callbacks/callback_summary.hpp index 15294ac240d..010d0df9c02 100644 --- a/include/lbann/callbacks/callback_summary.hpp +++ b/include/lbann/callbacks/callback_summary.hpp @@ -66,6 +66,11 @@ class lbann_callback_summary : public lbann_callback { int m_mat_interval; }; +// Builder function +std::unique_ptr +build_callback_summary_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_sync_layers.hpp b/include/lbann/callbacks/callback_sync_layers.hpp index 2c9d4984fa8..74c8b943d83 100644 --- a/include/lbann/callbacks/callback_sync_layers.hpp +++ b/include/lbann/callbacks/callback_sync_layers.hpp @@ -75,6 +75,11 @@ class lbann_callback_sync_layers : public lbann_callback { virtual void do_sync(Layer *l); }; +// Builder function +std::unique_ptr +build_callback_sync_layers_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_sync_selected.hpp b/include/lbann/callbacks/callback_sync_selected.hpp index 53cda7e8b3f..33b140cfd27 100644 --- a/include/lbann/callbacks/callback_sync_selected.hpp +++ b/include/lbann/callbacks/callback_sync_selected.hpp @@ -133,6 +133,11 @@ class lbann_callback_sync_selected : public lbann_callback_sync_layers { static bool m_cuda_profiler_initialized; }; +// Builder function +std::unique_ptr +build_callback_sync_selected_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_timer.hpp b/include/lbann/callbacks/callback_timer.hpp index 
a53243e7a3f..90533933629 100644 --- a/include/lbann/callbacks/callback_timer.hpp +++ b/include/lbann/callbacks/callback_timer.hpp @@ -98,6 +98,11 @@ class lbann_callback_timer : public lbann_callback { }; +// Builder function +std::unique_ptr +build_callback_timer_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_variable_minibatch.hpp b/include/lbann/callbacks/callback_variable_minibatch.hpp index 8828d2b2936..c05e5e1e4b1 100644 --- a/include/lbann/callbacks/callback_variable_minibatch.hpp +++ b/include/lbann/callbacks/callback_variable_minibatch.hpp @@ -103,12 +103,18 @@ class lbann_callback_step_minibatch : public lbann_callback_variable_minibatch { protected: bool schedule(model *m, int& new_mbsize, float& new_lr, int& ramp_time) override; + private: /// Number of epochs between mini-batch size increases. int m_step; /// Number of steps to ramp the learning rate over. int m_ramp_time; }; +// Builder function +std::unique_ptr +build_callback_step_minibatch_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + class lbann_callback_minibatch_schedule : public lbann_callback_variable_minibatch { public: /// Represents a step in a schedule of mini-batch sizes. @@ -137,11 +143,16 @@ class lbann_callback_minibatch_schedule : public lbann_callback_variable_minibat std::string name() const override { return "minibatch schedule"; } protected: bool schedule(model *m, int& new_mbsize, float& new_lr, int& ramp_time) override; - + private: /// Steps in the mini-batch schedule, stored in reverse sorted order. std::vector m_steps; }; +// Builder function +std::unique_ptr +build_callback_minibatch_schedule_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED diff --git a/include/lbann/callbacks/profiler.hpp b/include/lbann/callbacks/profiler.hpp index abedbaaa428..dc62c430335 100644 --- a/include/lbann/callbacks/profiler.hpp +++ b/include/lbann/callbacks/profiler.hpp @@ -79,6 +79,11 @@ class lbann_callback_profiler : public lbann_callback { bool m_skip_init; }; +// Builder function +std::unique_ptr +build_callback_profiler_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_PROFILER_HPP_INCLUDED diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index f9d93f07419..15285c22930 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -61,12 +61,9 @@ weights* construct_weights(lbann_comm* comm, const lbann_data::Weights& proto_weights); /** Construct a callback specified with prototext. */ -lbann_callback* construct_callback(lbann_comm* comm, - const lbann_data::Callback& proto_cb, - const std::map& data_readers, - std::vector layer_list, - std::vector weights_list, - lbann_summary* summarizer); +std::unique_ptr +construct_callback(const google::protobuf::Message& proto_cb, + lbann_summary* summarizer); /** Construct a summarizer specified with prototext. 
* The summarizer is only constructed if the summarizer callback is
diff --git a/include/lbann/proto/proto_common.hpp b/include/lbann/proto/proto_common.hpp
index b9986dfcc99..9a6825b53ed 100644
--- a/include/lbann/proto/proto_common.hpp
+++ b/include/lbann/proto/proto_common.hpp
@@ -1,3 +1,29 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
 #ifndef LBANN_PROTO_PROTO_COMMON_HPP_INCLUDED
 #define LBANN_PROTO_PROTO_COMMON_HPP_INCLUDED
 
diff --git a/include/lbann/proto/proto_helpers.hpp b/include/lbann/proto/proto_helpers.hpp
new file mode 100644
index 00000000000..0c57cb5f278
--- /dev/null
+++ b/include/lbann/proto/proto_helpers.hpp
@@ -0,0 +1,62 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_PROTO_PROTO_HELPERS_HPP_INCLUDED
+#define LBANN_PROTO_PROTO_HELPERS_HPP_INCLUDED
+
+#include <google/protobuf/message.h>
+
+#include <functional>
+#include <memory>
+#include <string>
+
+namespace lbann
+{
+namespace proto
+{
+
+template <typename T, typename... Args>
+struct GenerateBuilderType_struct
+{
+  using type = std::function<std::unique_ptr<T>(Args...)>;
+};
+
+template <typename T, typename... Args>
+using generate_builder_type =
+  typename GenerateBuilderType_struct<T, Args...>::type;
+
+namespace proto_helpers
+{
+
+/** @brief Get a "derived type" message from the given message.
+
*/ +google::protobuf::Message const& +get_oneof_message( + google::protobuf::Message const& msg_in, std::string const& oneof_name); + +}// namespace proto_helpers +}// namespace proto +}// namespace lbann +#endif /* LBANN_PROTO_PROTO_HELPERS_HPP_INCLUDED */ diff --git a/src/callbacks/CMakeLists.txt b/src/callbacks/CMakeLists.txt index 4c520b32a19..a10b751ed8c 100644 --- a/src/callbacks/CMakeLists.txt +++ b/src/callbacks/CMakeLists.txt @@ -16,6 +16,7 @@ set_full_path(THIS_DIR_SOURCES callback_dump_minibatch_sample_indices.cpp callback_dump_weights.cpp callback_early_stopping.cpp + callback_hang.cpp callback_imcomm.cpp callback_io.cpp callback_learning_rate.cpp diff --git a/src/callbacks/callback_check_gradients.cpp b/src/callbacks/callback_check_gradients.cpp index 266a32b3ddb..2e1192b7ad7 100644 --- a/src/callbacks/callback_check_gradients.cpp +++ b/src/callbacks/callback_check_gradients.cpp @@ -28,6 +28,8 @@ #include "lbann/layers/io/input/generic_input_layer.hpp" #include "lbann/data_readers/data_reader.hpp" +#include "lbann.pb.h" + namespace lbann { namespace { @@ -226,4 +228,15 @@ void lbann_callback_check_gradients::on_test_end(model *m) { } +// Builder function +std::unique_ptr +build_callback_check_gradients_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.step_size(), + params.verbose(), + params.error_on_failure()); +} + } // namespace lbann diff --git a/src/callbacks/callback_check_metric.cpp b/src/callbacks/callback_check_metric.cpp index 2e3719a2c82..7b9efa0d395 100644 --- a/src/callbacks/callback_check_metric.cpp +++ b/src/callbacks/callback_check_metric.cpp @@ -26,6 +26,8 @@ #include "lbann/callbacks/callback_check_metric.hpp" +#include "lbann/proto/factories.hpp" + namespace lbann { lbann_callback_check_metric::lbann_callback_check_metric(std::string metric_name, @@ -86,4 +88,18 @@ void lbann_callback_check_metric::check_metric(const model& m) const { } +std::unique_ptr +build_callback_check_metric_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + const auto& modes = + proto::parse_set(params.execution_modes()); + return make_unique(params.metric(), + modes, + params.lower_bound(), + params.upper_bound(), + params.error_on_failure()); +} + } // namespace lbann diff --git a/src/callbacks/callback_checkpoint.cpp b/src/callbacks/callback_checkpoint.cpp index 2fafefaf836..0aac71d4633 100644 --- a/src/callbacks/callback_checkpoint.cpp +++ b/src/callbacks/callback_checkpoint.cpp @@ -322,4 +322,18 @@ bool lbann_callback_checkpoint::restart(model *m) { return true; } +std::unique_ptr +build_callback_checkpoint_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.checkpoint_dir(), + params.checkpoint_epochs(), + params.checkpoint_steps(), + params.checkpoint_secs(), + params.per_rank_dir(), + params.ckpt_dist_epochs(), + params.ckpt_dist_steps()); +} + } diff --git a/src/callbacks/callback_confusion_matrix.cpp b/src/callbacks/callback_confusion_matrix.cpp index 03eef71e449..42d37825b36 100644 --- a/src/callbacks/callback_confusion_matrix.cpp +++ b/src/callbacks/callback_confusion_matrix.cpp @@ -232,4 +232,14 @@ void lbann_callback_confusion_matrix::save_confusion_matrix(const model& m) { } +std::unique_ptr +build_callback_confusion_matrix_from_pbuf( + const google::protobuf::Message& proto_msg, 
lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.prediction(), + params.label(), + params.prefix()); +} + } // namespace lbann diff --git a/src/callbacks/callback_debug.cpp b/src/callbacks/callback_debug.cpp index ffb391272d6..947088c3151 100644 --- a/src/callbacks/callback_debug.cpp +++ b/src/callbacks/callback_debug.cpp @@ -26,6 +26,10 @@ #include "lbann/callbacks/callback_debug.hpp" #include "lbann/comm.hpp" +#include "lbann/proto/factories.hpp" +#include "lbann/utils/memory.hpp" + +#include "lbann.pb.h" namespace lbann { @@ -153,4 +157,15 @@ void lbann_callback_debug::on_optimize_end(model *m, weights *w) { std::cerr << msg.str(); } +std::unique_ptr +build_callback_debug_from_pbuf(const google::protobuf::Message& proto_msg, + lbann_summary* summarizer) { + const auto& params = + dynamic_cast(proto_msg); + // FIXME TRB + const auto& modes = + proto::parse_set(params.phase()); + return make_unique(modes, summarizer); +} + } // namespace lbann diff --git a/src/callbacks/callback_debug_io.cpp b/src/callbacks/callback_debug_io.cpp index 78bedc27c11..feaedd4655f 100644 --- a/src/callbacks/callback_debug_io.cpp +++ b/src/callbacks/callback_debug_io.cpp @@ -28,14 +28,16 @@ #include "lbann/callbacks/callback_debug_io.hpp" +namespace lbann { + /// BVE FIXME @todo The use of execution_mode invalid needs to be reconsidered -void lbann::lbann_callback_debug_io::on_epoch_begin(model *m) { +void lbann_callback_debug_io::on_epoch_begin(model *m) { if(m_debug_phase == execution_mode::invalid || m_debug_phase == execution_mode::training) { print_phase_start(m, execution_mode::training); } } -void lbann::lbann_callback_debug_io::on_forward_prop_begin(model *m, Layer *l) { +void lbann_callback_debug_io::on_forward_prop_begin(model *m, Layer *l) { auto *input = dynamic_cast(l); if (input == nullptr || m_debug_lvl < 1) { return; @@ -51,7 +53,7 @@ void lbann::lbann_callback_debug_io::on_forward_prop_begin(model *m, Layer *l) { /// I think that the reset mini batch index may be off } -void lbann::lbann_callback_debug_io::print_fp_start(model *m, generic_input_layer *input) { +void lbann_callback_debug_io::print_fp_start(model *m, generic_input_layer *input) { const auto& step = m->get_step(); std::cout << "[" << m->get_comm()->get_trainer_rank() << "." 
<< m->get_comm()->get_rank_in_trainer() @@ -71,7 +73,7 @@ void lbann::lbann_callback_debug_io::print_fp_start(model *m, generic_input_laye } // 179i @ 300s (=5m*60s) + 1i @ 100s (=5m*45s):offset <- num models -void lbann::lbann_callback_debug_io::print_phase_start(model *m, execution_mode mode) { +void lbann_callback_debug_io::print_phase_start(model *m, execution_mode mode) { // Get data reader from first input layer in model generic_data_reader* data_reader = nullptr; @@ -122,13 +124,13 @@ void lbann::lbann_callback_debug_io::print_phase_start(model *m, execution_mode //////////////////////////////////////////////////////////////////////////////// // Evaluation phase debugging //////////////////////////////////////////////////////////////////////////////// -void lbann::lbann_callback_debug_io::on_validation_begin(model *m) { +void lbann_callback_debug_io::on_validation_begin(model *m) { if(m_debug_phase == execution_mode::invalid || m_debug_phase == execution_mode::validation) { print_phase_start(m, execution_mode::validation); } } -void lbann::lbann_callback_debug_io::on_evaluate_forward_prop_begin(model *m, Layer *l) { +void lbann_callback_debug_io::on_evaluate_forward_prop_begin(model *m, Layer *l) { auto *input = dynamic_cast(l); if (input == nullptr || m_debug_lvl < 1) { return; @@ -144,8 +146,28 @@ void lbann::lbann_callback_debug_io::on_evaluate_forward_prop_begin(model *m, La //////////////////////////////////////////////////////////////////////////////// // Testing phase debugging //////////////////////////////////////////////////////////////////////////////// -void lbann::lbann_callback_debug_io::on_test_begin(model *m) { +void lbann_callback_debug_io::on_test_begin(model *m) { if(m_debug_phase == execution_mode::invalid || m_debug_phase == execution_mode::testing) { print_phase_start(m, execution_mode::testing); } } + +// FIXME TRB +std::unique_ptr +build_callback_debug_io_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = dynamic_cast(proto_msg); + const auto& phase = params.phase(); + const auto& lvl = params.lvl(); + if (phase == "train" || phase == "training") { + return make_unique(execution_mode::training, lvl); + } else if (phase == "validate" || phase == "validation") { + return make_unique(execution_mode::validation, lvl); + } else if (phase == "test" || phase == "testing") { + return make_unique(execution_mode::testing, lvl); + } else { + return make_unique(); + } +} + +}// namespace lbann diff --git a/src/callbacks/callback_dump_error_signals.cpp b/src/callbacks/callback_dump_error_signals.cpp index f204a1caef4..6cb224ae1cf 100644 --- a/src/callbacks/callback_dump_error_signals.cpp +++ b/src/callbacks/callback_dump_error_signals.cpp @@ -26,6 +26,8 @@ #include "lbann/callbacks/callback_dump_error_signals.hpp" +#include + namespace lbann { void lbann_callback_dump_error_signals::on_backward_prop_end(model *m, Layer *l) { @@ -50,4 +52,12 @@ void lbann_callback_dump_error_signals::on_backward_prop_end(model *m, Layer *l) } +std::unique_ptr +build_callback_dump_error_signals_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.basename()); +} + } // namespace lbann diff --git a/src/callbacks/callback_dump_gradients.cpp b/src/callbacks/callback_dump_gradients.cpp index 7f2a55c25af..8fec2e745fa 100644 --- a/src/callbacks/callback_dump_gradients.cpp +++ b/src/callbacks/callback_dump_gradients.cpp @@ -26,9 +26,12 @@ // 
lbann_callback_dump_gradients .hpp .cpp - Callbacks to dump gradients //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/callbacks/callback_dump_gradients.hpp" +#include + +#include + namespace lbann { void lbann_callback_dump_gradients::on_backward_prop_end(model *m) { @@ -47,4 +50,13 @@ void lbann_callback_dump_gradients::on_backward_prop_end(model *m) { } } +std::unique_ptr +build_callback_dump_gradients_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.basename(), + params.interval()); +} + } // namespace lbann diff --git a/src/callbacks/callback_dump_minibatch_sample_indices.cpp b/src/callbacks/callback_dump_minibatch_sample_indices.cpp index cd0aec7dba7..1ef7d39c3bf 100644 --- a/src/callbacks/callback_dump_minibatch_sample_indices.cpp +++ b/src/callbacks/callback_dump_minibatch_sample_indices.cpp @@ -30,6 +30,9 @@ #include #include "lbann/callbacks/callback_dump_minibatch_sample_indices.hpp" #include "lbann/layers/io/input/input_layer.hpp" + +#include + #include #include @@ -74,4 +77,13 @@ void lbann_callback_dump_minibatch_sample_indices::on_evaluate_forward_prop_end( dump_to_file(m, l, m->get_step()); } +std::unique_ptr +build_callback_dump_mb_indices_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique( + params.basename(), + params.interval()); +} } // namespace lbann diff --git a/src/callbacks/callback_dump_outputs.cpp b/src/callbacks/callback_dump_outputs.cpp index 2c5fc57cb34..612a2b198f8 100644 --- a/src/callbacks/callback_dump_outputs.cpp +++ b/src/callbacks/callback_dump_outputs.cpp @@ -27,6 +27,11 @@ #include "lbann/callbacks/callback_dump_outputs.hpp" #include "lbann/utils/file_utils.hpp" +// FIXME TRB +#include "lbann/proto/factories.hpp" + +#include + #ifdef LBANN_HAS_CNPY #include #endif // LBANN_HAS_CNPY @@ -175,4 +180,20 @@ void lbann_callback_dump_outputs::dump_outputs(const model& m, const Layer& l) { } +// FIXME TRB +std::unique_ptr +build_callback_dump_outputs_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + const auto& layer_names = proto::parse_set<>(params.layers()); + const auto& modes = + proto::parse_set(params.execution_modes()); + return make_unique(layer_names, + modes, + params.batch_interval(), + params.directory(), + params.format()); +} + } // namespace lbann diff --git a/src/callbacks/callback_dump_weights.cpp b/src/callbacks/callback_dump_weights.cpp index 4129a2a2acd..db0dae0db95 100644 --- a/src/callbacks/callback_dump_weights.cpp +++ b/src/callbacks/callback_dump_weights.cpp @@ -55,4 +55,12 @@ void lbann_callback_dump_weights::dump_weights(model *m, std::string s) { } } +std::unique_ptr +build_callback_dump_weights_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.basename()); +} + } // namespace lbann diff --git a/src/callbacks/callback_hang.cpp b/src/callbacks/callback_hang.cpp new file mode 100644 index 00000000000..a6553772a51 --- /dev/null +++ b/src/callbacks/callback_hang.cpp @@ -0,0 +1,54 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. 
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/callbacks/callback_hang.hpp"
+
+#include <lbann.pb.h>
+
+namespace lbann {
+
+void lbann_callback_hang::setup(model* m)
+{
+  if (m->get_comm()->am_world_master()) {
+    if (m_rank_to_hang == -1) {
+      std::cout << "*** HANGING EVERY RANK IN HANG CALLBACK ***"
+                << std::endl;
+    } else {
+      std::cout << "*** HANGING RANK " << m_rank_to_hang
+                << " IN HANG CALLBACK ***" << std::endl;
+    }
+  }
+}
+
+std::unique_ptr<lbann_callback>
+build_callback_hang_from_pbuf(
+  const google::protobuf::Message& proto_msg, lbann_summary*) {
+  const auto& params =
+    dynamic_cast<const lbann_data::CallbackHang&>(proto_msg);
+  return make_unique<lbann_callback_hang>(params.rank());
+}
+
+}// namespace lbann
diff --git a/src/callbacks/callback_imcomm.cpp b/src/callbacks/callback_imcomm.cpp
index 014d5724af8..d07385ab0f7 100644
--- a/src/callbacks/callback_imcomm.cpp
+++ b/src/callbacks/callback_imcomm.cpp
@@ -158,4 +158,24 @@ std::string get_comm_type_name(lbann_callback_imcomm::comm_type m) {
   return comm_type_names[(int)m];
 }
 
+std::unique_ptr<lbann_callback>
+build_callback_imcomm_from_pbuf(
+  const google::protobuf::Message& proto_msg,
+  lbann_summary* summarizer) {
+  const auto& params = dynamic_cast<const lbann_data::CallbackImComm&>(proto_msg);
+  const auto& type_str = params.intertrainer_comm_method();
+  lbann_callback_imcomm::comm_type type = lbann_callback_imcomm::comm_type::NONE;
+  if (type_str == "none") {
+    type = lbann_callback_imcomm::comm_type::NONE;
+  } else if (type_str == "normal") {
+    type = lbann_callback_imcomm::comm_type::NORMAL;
+  } else {
+    std::ostringstream err;
+    err << "invalid inter-model communication type (" << type_str << ")";
+    LBANN_ERROR(err.str());
+  }
+  std::unordered_set<weights*> selected_weights; /// @todo Initialize weights
+  return make_unique<lbann_callback_imcomm>(type, selected_weights, summarizer);
+}
+
 } // namespace lbann
diff --git a/src/callbacks/callback_io.cpp b/src/callbacks/callback_io.cpp
index 640172f7f07..16365c6f570 100644
--- a/src/callbacks/callback_io.cpp
+++ b/src/callbacks/callback_io.cpp
@@ -70,4 +70,16 @@ void lbann_callback_io::on_test_end(model *m) {
   }
 }
 
+// FIXME TRB
+std::unique_ptr<lbann_callback>
+build_callback_disp_io_stats_from_pbuf(
+  const google::protobuf::Message& proto_msg, lbann_summary*) {
+  //const auto& params =
+  //  dynamic_cast<const lbann_data::CallbackDispIOStats&>(proto_msg);
+  //auto&& l = select_from_list<Layer>(params.layers(),
+  //                                   layer_list);
+  std::unordered_set<Layer*> selected_layers;//(l.begin(), l.end());
+  return make_unique<lbann_callback_io>(selected_layers);
+}
+
 } // namespace lbann
diff --git a/src/callbacks/callback_learning_rate.cpp b/src/callbacks/callback_learning_rate.cpp
index 07c33580ffd..8849c3ab042 100644
--- a/src/callbacks/callback_learning_rate.cpp
+++ 
b/src/callbacks/callback_learning_rate.cpp @@ -151,7 +151,7 @@ float lbann_callback_adaptive_learning_rate::global_schedule(model *m) { lbann_callback_drop_fixed_learning_rate::lbann_callback_drop_fixed_learning_rate( std::vector drop_epochs, float amt) : - lbann_callback_drop_fixed_learning_rate(drop_epochs, amt, + lbann_callback_drop_fixed_learning_rate(std::move(drop_epochs), amt, std::unordered_set()) {} lbann_callback_drop_fixed_learning_rate::lbann_callback_drop_fixed_learning_rate( @@ -286,4 +286,97 @@ float lbann_callback_optimizerwise_adaptive_learning_rate::optimizer_schedule( } } +// FIXME TRB +std::unique_ptr +build_callback_step_learning_rate_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + //auto&& w = select_from_list(params.weights(), + // weights_list); + std::unordered_set selected_weights;//(w.begin(), w.end()); + return make_unique(params.step(), + params.amt(), + selected_weights); +} + +// FIXME TRB +std::unique_ptr +build_callback_adaptive_learning_rate_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + //auto&& w = select_from_list(params.weights(), + // weights_list); + std::unordered_set selected_weights;//(w.begin(), w.end()); + return make_unique(params.patience(), + params.amt(), + selected_weights); +} + +// FIXME TRB +std::unique_ptr +build_callback_drop_fixed_learning_rate_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + std::vector drop_epochs; + for (int i = 0; i < params.drop_epoch_size(); ++i) { + drop_epochs.push_back(params.drop_epoch(i)); + } + //auto&& w = select_from_list(params.weights(), + // weights_list); + std::unordered_set selected_weights;//(w.begin(), w.end()); + return make_unique( + std::move(drop_epochs), + params.amt(), + selected_weights); +} + +// FIXME TRB +std::unique_ptr +build_callback_linear_growth_learning_rate_from_pbuf( + const google::protobuf::Message& proto_msg,lbann_summary*) { + using MsgType = lbann_data::CallbackLinearGrowthLearningRate; + using CallbackType = lbann_callback_linear_growth_learning_rate; + const auto& params = + dynamic_cast(proto_msg); + //auto&& w = select_from_list(params.weights(), + // weights_list); + std::unordered_set selected_weights;//(w.begin(), w.end()); + return make_unique(params.target(), + params.num_epochs(), + params.delay(), + selected_weights); +} + +// FIXME TRB +std::unique_ptr +build_callback_optimizerwise_adaptive_learning_rate_from_pbuf( + const google::protobuf::Message& proto_msg,lbann_summary*) { + using MsgType = lbann_data::CallbackOptimizerwiseAdaptiveLearningRate; + using CallbackType = lbann_callback_optimizerwise_adaptive_learning_rate; + const auto& params = dynamic_cast(proto_msg); + //auto&& w = select_from_list(params.weights(), + // weights_list); + std::unordered_set selected_weights;//(w.begin(), w.end()); + return make_unique(params.scale(), selected_weights); +} + +// FIXME TRB +std::unique_ptr +build_callback_poly_learning_rate_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + //auto&& w = select_from_list(params.weights(), + // weights_list); + std::unordered_set selected_weights;//(w.begin(), w.end()); + return make_unique(params.power(), + params.num_epochs(), + params.max_iter(), + params.end_lr(), + selected_weights); +} + } // namespace lbann diff 
--git a/src/callbacks/callback_ltfb.cpp b/src/callbacks/callback_ltfb.cpp index 2a681d0a497..70772d61b38 100644 --- a/src/callbacks/callback_ltfb.cpp +++ b/src/callbacks/callback_ltfb.cpp @@ -30,6 +30,7 @@ #include "lbann/utils/random.hpp" #include "lbann/optimizers/sgd.hpp" #include "lbann/optimizers/adam.hpp" +#include "lbann/proto/factories.hpp" namespace lbann { @@ -523,4 +524,20 @@ lbann_callback_ltfb::string_to_comm_algo(const std::string& str) { } +std::unique_ptr +build_callback_ltfb_from_pbuf( + const google::protobuf::Message& proto_msg, + lbann_summary* summarizer) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique( + params.batch_interval(), + params.metric(), + proto::parse_set(params.weights()), + params.low_score_wins(), + lbann_callback_ltfb::string_to_comm_algo(params.communication_algorithm()), + params.exchange_hyperparameters(), + summarizer); +} + } // namespace lbann diff --git a/src/callbacks/callback_mixup.cpp b/src/callbacks/callback_mixup.cpp index 9f58a475d04..c982c156835 100644 --- a/src/callbacks/callback_mixup.cpp +++ b/src/callbacks/callback_mixup.cpp @@ -26,10 +26,15 @@ #include #include "lbann/callbacks/callback_mixup.hpp" +#include "lbann/proto/factories.hpp" #include "lbann/utils/beta.hpp" #include "lbann/utils/exception.hpp" #include "lbann/utils/image.hpp" +#include + +#include + namespace lbann { void callback_mixup::on_forward_prop_end(model *m, Layer *l) { @@ -92,4 +97,14 @@ void callback_mixup::on_forward_prop_end(model *m, Layer *l) { } } +std::unique_ptr +build_callback_mixup_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + const auto& layers_list = proto::parse_list(params.layers()); + std::unordered_set layers(layers_list.begin(), + layers_list.end()); + return make_unique(layers, params.alpha()); +} } // namespace lbann diff --git a/src/callbacks/callback_perturb_adam.cpp b/src/callbacks/callback_perturb_adam.cpp index 7a170be15cf..b73cd378c22 100644 --- a/src/callbacks/callback_perturb_adam.cpp +++ b/src/callbacks/callback_perturb_adam.cpp @@ -25,6 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_perturb_adam.hpp" +#include "lbann/proto/factories.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -160,4 +161,19 @@ void lbann_callback_perturb_adam::perturb(lbann_comm& comm, adam& opt) const { } +std::unique_ptr +build_callback_perturb_adam_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique( + params.learning_rate_factor(), + params.beta1_factor(), + params.beta2_factor(), + params.eps_factor(), + params.perturb_during_training(), + params.batch_interval(), + proto::parse_set(params.weights())); +} + } // namespace lbann diff --git a/src/callbacks/callback_perturb_dropout.cpp b/src/callbacks/callback_perturb_dropout.cpp index 36390729e6d..74c6879c107 100644 --- a/src/callbacks/callback_perturb_dropout.cpp +++ b/src/callbacks/callback_perturb_dropout.cpp @@ -25,6 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_perturb_dropout.hpp" +#include "lbann/proto/factories.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -56,7 +57,7 @@ void lbann_callback_perturb_dropout::perturb(model& m) { } if (m_layer_names.empty() || m_layer_names.count(l->get_name()) > 0) { - + auto d_dp_cpu = 
get_dropout_layer(l); auto d_mp_cpu = get_dropout_layer(l); #ifdef LBANN_HAS_GPU @@ -64,7 +65,7 @@ void lbann_callback_perturb_dropout::perturb(model& m) { auto d_mp_gpu = get_dropout_layer(l); #endif // Perturb dropout layer - if(d_dp_cpu != nullptr || d_mp_cpu != nullptr + if(d_dp_cpu != nullptr || d_mp_cpu != nullptr #ifdef LBANN_HAS_GPU || d_dp_gpu != nullptr || d_mp_gpu != nullptr #endif @@ -116,5 +117,14 @@ void lbann_callback_perturb_dropout::perturb(model& m) { } } +std::unique_ptr +build_callback_perturb_dropout_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique( + params.keep_dropout_factor(), + proto::parse_set(params.layers())); +} } // namespace lbann diff --git a/src/callbacks/callback_print.cpp b/src/callbacks/callback_print.cpp index 85b31d0e56d..672cb6ec223 100644 --- a/src/callbacks/callback_print.cpp +++ b/src/callbacks/callback_print.cpp @@ -246,4 +246,13 @@ void lbann_callback_print::report_results(model *m) { } +std::unique_ptr +build_callback_print_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.interval(), + params.print_global_stat_only()); +} + } // namespace lbann diff --git a/src/callbacks/callback_replace_weights.cpp b/src/callbacks/callback_replace_weights.cpp index d177a36f1a9..0dd9ba1f22e 100644 --- a/src/callbacks/callback_replace_weights.cpp +++ b/src/callbacks/callback_replace_weights.cpp @@ -37,5 +37,22 @@ void lbann_callback_replace_weights::on_batch_end(model *m) { } } +std::unique_ptr +build_callback_replace_weights_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + /* + auto&& src_layers = select_from_list(params.source_layers(), + layer_list); + auto&& dst_layers = select_from_list(params.destination_layers(), + layer_list); + */ + std::vector src_layers, dst_layers;// FIXME TRB + return make_unique( + src_layers, + dst_layers, + params.batch_interval()); +} } // namespace lbann diff --git a/src/callbacks/callback_save_images.cpp b/src/callbacks/callback_save_images.cpp index 83dac7384c8..140d067f3bc 100644 --- a/src/callbacks/callback_save_images.cpp +++ b/src/callbacks/callback_save_images.cpp @@ -25,6 +25,10 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_save_images.hpp" +#include "lbann/proto/factories.hpp" + +#include + #ifdef LBANN_HAS_OPENCV #include #endif // LBANN_HAS_OPENCV @@ -152,4 +156,15 @@ void lbann_callback_save_images::on_test_end(model *m) { m_layer_names); } +std::unique_ptr +build_callback_save_images_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique( + proto::parse_list<>(params.layers()), + params.image_format(), + params.image_prefix()); +} + } // namespace lbann diff --git a/src/callbacks/callback_save_model.cpp b/src/callbacks/callback_save_model.cpp index b2c82e69718..5f2d917e883 100644 --- a/src/callbacks/callback_save_model.cpp +++ b/src/callbacks/callback_save_model.cpp @@ -174,4 +174,23 @@ bool lbann_callback_save_model::load_model_weights(std::string ckpt_dir, model * return true; } +std::unique_ptr +build_callback_save_model_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + if(params.extension().size() != 
0) { + return make_unique( + params.dir(), + params.disable_save_after_training(), + params.extension()); + } + else { + return make_unique( + params.dir(), + params.disable_save_after_training()); + } +} + + } // namespace lbann diff --git a/src/callbacks/callback_save_topk_models.cpp b/src/callbacks/callback_save_topk_models.cpp index 6cbff7df8c4..9f3f0ed21a0 100644 --- a/src/callbacks/callback_save_topk_models.cpp +++ b/src/callbacks/callback_save_topk_models.cpp @@ -35,7 +35,7 @@ void lbann_callback_save_topk_models::on_test_end(model *m) { if(m->get_comm()->am_trainer_master()) { in_topk = am_in_topk(m); } - m->get_comm()->trainer_broadcast(0, in_topk); + m->get_comm()->trainer_broadcast(0, in_topk); if(in_topk) save_model(m); } @@ -62,8 +62,8 @@ bool lbann_callback_save_topk_models::am_in_topk(model *m) { if (m_k > num_trainers) { std::stringstream err; - err << "k ( " << m_k << ") " - << " can not be greater than number of trainers (" + err << "k ( " << m_k << ") " + << " can not be greater than number of trainers (" << num_trainers << ") " ; LBANN_ERROR(err.str()); } @@ -76,16 +76,28 @@ bool lbann_callback_save_topk_models::am_in_topk(model *m) { //top-k in an descending order else std::sort(top_scores.begin(), top_scores.end(),std::greater()); top_scores.resize(m_k); - + if (comm->am_world_master()) { std::cout << "Top " << m_k << " " << m_metric_name << " average " << std::accumulate(top_scores.begin(), top_scores.end(), EvalType(0))/m_k << std::endl; - } - if(std::find(top_scores.begin(), top_scores.end(), - score_list[comm->get_trainer_rank()]) != top_scores.end()) { + } + if(std::find(top_scores.begin(), top_scores.end(), + score_list[comm->get_trainer_rank()]) != top_scores.end()) { return true; - } + } return false; } +std::unique_ptr +build_callback_save_topk_models_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique( + params.dir(), + params.k(), + params.metric(), + params.ascending_ordering()); +} + } // namespace lbann diff --git a/src/callbacks/callback_summary.cpp b/src/callbacks/callback_summary.cpp index a5d66de440a..6e75c93b471 100644 --- a/src/callbacks/callback_summary.cpp +++ b/src/callbacks/callback_summary.cpp @@ -132,4 +132,15 @@ void lbann_callback_summary::save_histograms(model *m) { } } +std::unique_ptr +build_callback_summary_from_pbuf( + const google::protobuf::Message& proto_msg, + lbann_summary* summarizer) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(summarizer, + params.batch_interval(), + params.mat_interval()); +} + } // namespace lbann diff --git a/src/callbacks/callback_sync_layers.cpp b/src/callbacks/callback_sync_layers.cpp index de8fb939c55..6a7c674602d 100644 --- a/src/callbacks/callback_sync_layers.cpp +++ b/src/callbacks/callback_sync_layers.cpp @@ -61,4 +61,14 @@ void lbann_callback_sync_layers::do_sync(Layer *l) { } } +std::unique_ptr +build_callback_sync_layers_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.sync_gpus(), + params.sync_mpi(), + params.only_input()); +} + } // namespace lbann diff --git a/src/callbacks/callback_sync_selected.cpp b/src/callbacks/callback_sync_selected.cpp index 8844cd176b2..d16530d62ef 100644 --- a/src/callbacks/callback_sync_selected.cpp +++ b/src/callbacks/callback_sync_selected.cpp @@ -277,4 +277,46 @@ void lbann_callback_sync_selected::do_sync(Layer *l) { #endif } +std::unique_ptr 
+build_callback_sync_selected_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + const int num_layers = params.layer_to_sync_size(); + if (num_layers == 0) { + throw lbann_exception("sync_selected requires at least a layer " + "to synchronize."); + } + + using layers_t = lbann_callback_sync_selected::layers_t; + using prop_t = lbann_callback_sync_selected::prop_t; + + layers_t selected_layers; + selected_layers.reserve(num_layers); + + for (int i = 0; i < num_layers; ++i) { + const auto& layer_to_sync = params.layer_to_sync(i); + selected_layers.emplace(layer_to_sync.name(), + static_cast(layer_to_sync.prop())); + } + + auto cb_ptr + = make_unique(selected_layers, + params.async_gpus(), + params.async_mpi()); + +#ifdef LBANN_NVPROF + const auto& cp_setup = params.cuda_profiler_setup(); + if (cp_setup.no_init()) { + lbann_callback_sync_selected::turn_off_init_cuda_profiler(); + } else { + cb_ptr->init_cuda_profiler(cp_setup.config_file(), + cp_setup.output_dir(), + cp_setup.output_mode(), + comm); + } +#endif // LBANN_NVPROF + return cb_ptr; +} + } // namespace lbann diff --git a/src/callbacks/callback_timer.cpp b/src/callbacks/callback_timer.cpp index 2300951a335..c12ed0b39cd 100644 --- a/src/callbacks/callback_timer.cpp +++ b/src/callbacks/callback_timer.cpp @@ -167,4 +167,10 @@ void lbann_callback_timer::timing_end(model& m) { } +std::unique_ptr +build_callback_timer_from_pbuf( + const google::protobuf::Message&, lbann_summary* summarizer) { + return make_unique(summarizer); +} + } // namespace lbann diff --git a/src/callbacks/callback_variable_minibatch.cpp b/src/callbacks/callback_variable_minibatch.cpp index 5300b881b84..875aa4a071f 100644 --- a/src/callbacks/callback_variable_minibatch.cpp +++ b/src/callbacks/callback_variable_minibatch.cpp @@ -183,4 +183,31 @@ bool lbann_callback_minibatch_schedule::schedule( return false; } +std::unique_ptr +build_callback_step_minibatch_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.starting_mbsize(), + params.step(), + params.ramp_time()); +} + +std::unique_ptr +build_callback_minibatch_schedule_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + std::vector steps; + for (int i = 0; i < params.step_size(); ++i) { + const auto& proto_step = params.step(i); + steps.emplace_back(proto_step.epoch(), + proto_step.mbsize(), + proto_step.lr(), + proto_step.ramp_time()); + } + return make_unique(params.starting_mbsize(), + steps); +} + } // namespace lbann diff --git a/src/callbacks/profiler.cpp b/src/callbacks/profiler.cpp index 05c0fddb215..76e2b93d1c9 100644 --- a/src/callbacks/profiler.cpp +++ b/src/callbacks/profiler.cpp @@ -193,4 +193,13 @@ void lbann_callback_profiler::on_optimize_end(model *m, weights *w) { prof_region_end(("opt " + w->get_name()).c_str(), m_sync); } +std::unique_ptr +build_callback_profiler_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.sync(), + params.skip_init()); +} + } // namespace lbann diff --git a/src/proto/CMakeLists.txt b/src/proto/CMakeLists.txt index 98abbc423e4..51e8a7f2fc6 100644 --- a/src/proto/CMakeLists.txt +++ b/src/proto/CMakeLists.txt @@ -54,6 +54,7 @@ endif (LBANN_HAS_PROTOBUF) set_full_path(THIS_DIR_SOURCES init_image_data_readers.cpp proto_common.cpp 
+ proto_helpers.cpp ) # Add the subdirectories diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 3667e42e943..1527e95d057 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -24,430 +24,187 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +// Get the declarations of all the builders for registration +#include "lbann/callbacks/callback.hpp" +#include "lbann/callbacks/callback_check_dataset.hpp" +#include "lbann/callbacks/callback_check_gradients.hpp" +#include "lbann/callbacks/callback_check_init.hpp" +#include "lbann/callbacks/callback_check_metric.hpp" +#include "lbann/callbacks/callback_checknan.hpp" +#include "lbann/callbacks/callback_checkpoint.hpp" +#include "lbann/callbacks/callback_checksmall.hpp" +#include "lbann/callbacks/callback_confusion_matrix.hpp" +#include "lbann/callbacks/callback_debug.hpp" +#include "lbann/callbacks/callback_debug_io.hpp" +#include "lbann/callbacks/callback_dump_error_signals.hpp" +#include "lbann/callbacks/callback_dump_gradients.hpp" +#include "lbann/callbacks/callback_dump_minibatch_sample_indices.hpp" +#include "lbann/callbacks/callback_dump_outputs.hpp" +#include "lbann/callbacks/callback_dump_weights.hpp" +#include "lbann/callbacks/callback_early_stopping.hpp" +#include "lbann/callbacks/callback_gpu_memory_usage.hpp" +#include "lbann/callbacks/callback_hang.hpp" +#include "lbann/callbacks/callback_imcomm.hpp" +#include "lbann/callbacks/callback_io.hpp" +#include "lbann/callbacks/callback_learning_rate.hpp" +#include "lbann/callbacks/callback_ltfb.hpp" +#include "lbann/callbacks/callback_mixup.hpp" +#include "lbann/callbacks/callback_perturb_adam.hpp" +#include "lbann/callbacks/callback_perturb_dropout.hpp" +#include "lbann/callbacks/callback_print.hpp" +#include "lbann/callbacks/callback_replace_weights.hpp" +#include "lbann/callbacks/callback_save_images.hpp" +#include "lbann/callbacks/callback_save_model.hpp" +#include "lbann/callbacks/callback_save_topk_models.hpp" +#include "lbann/callbacks/callback_summary.hpp" +#include "lbann/callbacks/callback_sync_layers.hpp" +#include "lbann/callbacks/callback_sync_selected.hpp" +#include "lbann/callbacks/callback_timeline.hpp" +#include "lbann/callbacks/callback_timer.hpp" +#include "lbann/callbacks/callback_variable_minibatch.hpp" + #include "lbann/proto/factories.hpp" -#include "lbann/utils/peek_map.hpp" +#include "lbann/proto/proto_helpers.hpp" +#include "lbann/utils/factory.hpp" +#include "lbann/utils/memory.hpp" + +#include + +#include +#include +#include namespace lbann { namespace proto { - namespace { -/** Select entries from a list based on names. - * Any entry in 'list' with a name found in 'names' (interpreted as a - * space-separated list) is added to the output list. 
- */ -template -std::vector select_from_list(std::string names, - std::vector list) { - std::vector selected; - for (const auto& name : parse_list(names)) { - for (auto&& t : list) { - if (name == t->get_name()) { - selected.push_back(t); - } - } - } - return selected; -} - - -} // namespace - -lbann_callback* construct_callback(lbann_comm* comm, - const lbann_data::Callback& proto_cb, - const std::map& data_readers, - std::vector layer_list, - std::vector weights_list, - lbann_summary* summarizer) { - std::stringstream err; - - ////////////////////////////////////////////////////////////// - // Display information - ////////////////////////////////////////////////////////////// - - if (proto_cb.has_print()) { - const auto& params = proto_cb.print(); - return new lbann_callback_print(params.interval(), - params.print_global_stat_only()); - } - if (proto_cb.has_timer()) { - return new lbann_callback_timer(summarizer); - } - if (proto_cb.has_disp_io_stats()) { - const auto& params = proto_cb.disp_io_stats(); - auto&& l = select_from_list(params.layers(), - layer_list); - std::unordered_set selected_layers(l.begin(), l.end()); - return new lbann_callback_io(selected_layers); - } - if (proto_cb.has_save_images()) { - const auto& params = proto_cb.save_images(); - return new lbann_callback_save_images(parse_list<>(params.layers()), - params.image_format(), - params.image_prefix()); - } - if (proto_cb.has_confusion_matrix()) { - const auto& params = proto_cb.confusion_matrix(); - return new lbann_callback_confusion_matrix(params.prediction(), - params.label(), - params.prefix()); - } - - ////////////////////////////////////////////////////////////// - // Inter-model communication - ////////////////////////////////////////////////////////////// - - if (proto_cb.has_ltfb()) { - const auto& params = proto_cb.ltfb(); - return new lbann_callback_ltfb(params.batch_interval(), - params.metric(), - parse_set(params.weights()), - params.low_score_wins(), - lbann_callback_ltfb::string_to_comm_algo(params.communication_algorithm()), - params.exchange_hyperparameters(), - summarizer); - } - /// @todo - if (proto_cb.has_imcomm()) { - const auto& params = proto_cb.imcomm(); - const auto& type_str = params.intertrainer_comm_method(); - lbann_callback_imcomm::comm_type type = lbann_callback_imcomm::comm_type::NONE; - if (type_str == "none") { - type = lbann_callback_imcomm::comm_type::NONE; - } else if (type_str == "normal") { - type = lbann_callback_imcomm::comm_type::NORMAL; - } else { - err << "invalid inter-model communication type (" << type_str << ")"; - LBANN_ERROR(err.str()); - } - std::unordered_set selected_weights; /// @todo Initialize weights - return new lbann_callback_imcomm(type, selected_weights, summarizer); - } - - ////////////////////////////////////////////////////////////// - // Learning rate schedules - ////////////////////////////////////////////////////////////// - - if (proto_cb.has_step_learning_rate()) { - const auto& params = proto_cb.step_learning_rate(); - auto&& w = select_from_list(params.weights(), - weights_list); - std::unordered_set selected_weights(w.begin(), w.end()); - return new lbann_callback_step_learning_rate(params.step(), - params.amt(), - selected_weights); - } - if (proto_cb.has_adaptive_learning_rate()) { - const auto& params = proto_cb.adaptive_learning_rate(); - auto&& w = select_from_list(params.weights(), - weights_list); - std::unordered_set selected_weights(w.begin(), w.end()); - return new lbann_callback_adaptive_learning_rate(params.patience(), - 
params.amt(), - selected_weights); - } - if (proto_cb.has_drop_fixed_learning_rate()) { - const auto& params = proto_cb.drop_fixed_learning_rate(); - std::vector drop_epochs; - for (int i = 0; i < params.drop_epoch_size(); ++i) { - drop_epochs.push_back(params.drop_epoch(i)); - } - auto&& w = select_from_list(params.weights(), - weights_list); - std::unordered_set selected_weights(w.begin(), w.end()); - return new lbann_callback_drop_fixed_learning_rate(drop_epochs, - params.amt(), - selected_weights); - } - if (proto_cb.has_linear_growth_learning_rate()) { - const auto& params = proto_cb.linear_growth_learning_rate(); - auto&& w = select_from_list(params.weights(), - weights_list); - std::unordered_set selected_weights(w.begin(), w.end()); - return new lbann_callback_linear_growth_learning_rate(params.target(), - params.num_epochs(), - params.delay(), - selected_weights); - } - if (proto_cb.has_optimizerwise_adaptive_learning_rate()) { - const auto& params = proto_cb.optimizerwise_adaptive_learning_rate(); - auto&& w = select_from_list(params.weights(), - weights_list); - std::unordered_set selected_weights(w.begin(), w.end()); - return new lbann_callback_optimizerwise_adaptive_learning_rate(params.scale(), - selected_weights); - } - if (proto_cb.has_poly_learning_rate()) { - const auto& params = proto_cb.poly_learning_rate(); - auto&& w = select_from_list(params.weights(), - weights_list); - std::unordered_set selected_weights(w.begin(), w.end()); - return new lbann_callback_poly_learning_rate(params.power(), - params.num_epochs(), - params.max_iter(), - params.end_lr(), - selected_weights); - } - - ////////////////////////////////////////////////////////////// - // Mini-batch schedules - ////////////////////////////////////////////////////////////// - - if (proto_cb.has_step_minibatch()) { - const auto& params = proto_cb.step_minibatch(); - return new lbann_callback_step_minibatch(params.starting_mbsize(), - params.step(), - params.ramp_time()); - } - if (proto_cb.has_minibatch_schedule()) { - const auto& params = proto_cb.minibatch_schedule(); - std::vector steps; - for (int i = 0; i < params.step_size(); ++i) { - const auto& proto_step = params.step(i); - steps.emplace_back(proto_step.epoch(), - proto_step.mbsize(), - proto_step.lr(), - proto_step.ramp_time()); - } - return new lbann_callback_minibatch_schedule(params.starting_mbsize(), - steps); - } - - ////////////////////////////////////////////////////////////// - // Checkpointing and exporting - ////////////////////////////////////////////////////////////// - - if (proto_cb.has_checkpoint()) { - const auto& params = proto_cb.checkpoint(); - return new lbann_callback_checkpoint(params.checkpoint_dir(), - params.checkpoint_epochs(), - params.checkpoint_steps(), - params.checkpoint_secs(), - params.per_rank_dir(), - params.ckpt_dist_epochs(), - params.ckpt_dist_steps()); - } - if (proto_cb.has_save_model()) { - const auto& params = proto_cb.save_model(); - if(params.extension().size() != 0) { - return new lbann_callback_save_model(params.dir(), - params.disable_save_after_training(), - params.extension()); - }else { - return new lbann_callback_save_model(params.dir(), - params.disable_save_after_training()); - } - } - - if (proto_cb.has_save_topk_models()) { - const auto& params = proto_cb.save_topk_models(); - return new lbann_callback_save_topk_models(params.dir(), - params.k(), - params.metric(), - params.ascending_ordering()); - } - - ////////////////////////////////////////////////////////////// - // Weight exchange/replace - 
////////////////////////////////////////////////////////////// - - if (proto_cb.has_replace_weights()) { - const auto& params = proto_cb.replace_weights(); - auto&& src_layers = select_from_list(params.source_layers(), - layer_list); - auto&& dst_layers = select_from_list(params.destination_layers(), - layer_list); - return new lbann_callback_replace_weights(src_layers,dst_layers,params.batch_interval()); - } - ////////////////////////////////////////////////////////////// - // Profiling - ////////////////////////////////////////////////////////////// - - if (proto_cb.has_summary()) { - const auto& params = proto_cb.summary(); - return new lbann_callback_summary(summarizer, - params.batch_interval(), - params.mat_interval()); - } - if (proto_cb.has_profiler()) { - return new lbann_callback_profiler(proto_cb.profiler().sync(), - proto_cb.profiler().skip_init()); - } - if (proto_cb.has_sync_layers()) { - const auto& params = proto_cb.sync_layers(); - return new lbann_callback_sync_layers(params.sync_gpus(), - params.sync_mpi(), - params.only_input()); - } - if (proto_cb.has_sync_selected()) { - const auto& params = proto_cb.sync_selected(); - const int num_layers = params.layer_to_sync_size(); - if (num_layers == 0) { - throw lbann_exception("sync_selected requires at least a layer to synchronize."); - } - - using layers_t = lbann_callback_sync_selected::layers_t; - using prop_t = lbann_callback_sync_selected::prop_t; - - layers_t selected_layers; - selected_layers.reserve(num_layers); - - for (int i = 0; i < num_layers; ++i) { - const auto& layer_to_sync = params.layer_to_sync(i); - selected_layers.emplace(layer_to_sync.name(), - static_cast(layer_to_sync.prop())); - } - - lbann_callback_sync_selected* cb_ptr - = new lbann_callback_sync_selected(selected_layers, - params.async_gpus(), - params.async_mpi()); +// Define the factory type. 
+using factory_type = lbann::generic_factory<
+  lbann_callback,
+  std::string,
+  generate_builder_type<lbann_callback,
+                        google::protobuf::Message const&,
+                        lbann_summary*>,
+  default_key_error_policy>;
+
+void register_default_builders(factory_type& factory)
+{
+  factory.register_builder("CallbackAdaptiveLearningRate",
+                           build_callback_adaptive_learning_rate_from_pbuf);
+  factory.register_builder("CallbackCheckDataset",
+                           build_callback_check_dataset_from_pbuf);
+  factory.register_builder("CallbackCheckGradients",
+                           build_callback_check_gradients_from_pbuf);
+  factory.register_builder("CallbackCheckMetric",
+                           build_callback_check_metric_from_pbuf);
+  factory.register_builder("CallbackCheckNaN",
+                           build_callback_check_nan_from_pbuf);
+  factory.register_builder("CallbackCheckpoint",
+                           build_callback_checkpoint_from_pbuf);
+  factory.register_builder("CallbackCheckSmall",
+                           build_callback_check_small_from_pbuf);
+  factory.register_builder("CallbackConfusionMatrix",
+                           build_callback_confusion_matrix_from_pbuf);
+  factory.register_builder("CallbackDebug",
+                           build_callback_debug_from_pbuf);
+  factory.register_builder("CallbackDebugIO",
+                           build_callback_debug_io_from_pbuf);
+  factory.register_builder("CallbackDispIOStats",
+                           build_callback_disp_io_stats_from_pbuf);
+  factory.register_builder("CallbackDropFixedLearningRate",
+                           build_callback_drop_fixed_learning_rate_from_pbuf);
+  factory.register_builder("CallbackDumpErrorSignals",
+                           build_callback_dump_error_signals_from_pbuf);
+  factory.register_builder("CallbackDumpGradients",
+                           build_callback_dump_gradients_from_pbuf);
+  factory.register_builder("CallbackDumpMBIndices",
+                           build_callback_dump_mb_indices_from_pbuf);
+  factory.register_builder("CallbackDumpOutputs",
+                           build_callback_dump_outputs_from_pbuf);
+  factory.register_builder("CallbackDumpWeights",
+                           build_callback_dump_weights_from_pbuf);
+  factory.register_builder("CallbackGPUMemoryUsage",
+                           build_callback_gpu_memory_usage_from_pbuf);
+  factory.register_builder("CallbackHang",
+                           build_callback_hang_from_pbuf);
+  factory.register_builder("CallbackImComm",
+                           build_callback_imcomm_from_pbuf);
+  factory.register_builder(
+    "CallbackLinearGrowthLearningRate",
+    build_callback_linear_growth_learning_rate_from_pbuf);
+  factory.register_builder("CallbackLTFB",
+                           build_callback_ltfb_from_pbuf);
+  factory.register_builder("CallbackMinibatchSchedule",
+                           build_callback_minibatch_schedule_from_pbuf);
+  factory.register_builder("CallbackMixup",
+                           build_callback_mixup_from_pbuf);
+  factory.register_builder(
+    "CallbackOptimizerwiseAdaptiveLearningRate",
+    build_callback_optimizerwise_adaptive_learning_rate_from_pbuf);
+  factory.register_builder("CallbackPerturbAdam",
+                           build_callback_perturb_adam_from_pbuf);
+  factory.register_builder("CallbackPerturbDropout",
+                           build_callback_perturb_dropout_from_pbuf);
+  factory.register_builder("CallbackPolyLearningRate",
+                           build_callback_poly_learning_rate_from_pbuf);
+  factory.register_builder("CallbackPrint",
+                           build_callback_print_from_pbuf);
+  factory.register_builder("CallbackProfiler",
+                           build_callback_profiler_from_pbuf);
+  factory.register_builder("CallbackReplaceWeights",
+                           build_callback_replace_weights_from_pbuf);
+  factory.register_builder("CallbackSaveImages",
+                           build_callback_save_images_from_pbuf);
+  factory.register_builder("CallbackSaveModel",
+                           build_callback_save_model_from_pbuf);
+  factory.register_builder("CallbackSaveTopKModels",
+                           build_callback_save_topk_models_from_pbuf);
+  factory.register_builder("CallbackStepLearningRate",
+                           build_callback_step_learning_rate_from_pbuf);
+  factory.register_builder("CallbackStepMinibatch",
+                           build_callback_step_minibatch_from_pbuf);
+  factory.register_builder("CallbackSummary",
+                           build_callback_summary_from_pbuf);
+  factory.register_builder("CallbackSyncLayers",
+                           build_callback_sync_layers_from_pbuf);
+  factory.register_builder("CallbackSyncSelected",
+                           build_callback_sync_selected_from_pbuf);
+  factory.register_builder("CallbackTimer",
+                           build_callback_timer_from_pbuf);
+}
 
-  #ifdef LBANN_NVPROF
-    const auto& cp_setup = params.cuda_profiler_setup();
-    if (cp_setup.no_init()) {
-      lbann_callback_sync_selected::turn_off_init_cuda_profiler();
-    } else {
-      cb_ptr->init_cuda_profiler(cp_setup.config_file(),
-                                 cp_setup.output_dir(),
-                                 cp_setup.output_mode(),
-                                 comm);
-    }
-  #endif // LBANN_NVPROF
-    return cb_ptr;
-  }
+bool is_initialized(factory_type const& factory)
+{
+  return (factory.get_num_registered_builders() > 0);
+}
 
-  //////////////////////////////////////////////////////////////
-  // Debugging
-  //////////////////////////////////////////////////////////////
+// Manage a global factory
+struct factory_manager
+{
+  factory_type factory_;
 
-  if (proto_cb.has_debug()) {
-    const auto& params = proto_cb.debug();
-    const auto& modes = parse_set<execution_mode>(params.phase());
-    return new lbann_callback_debug(modes, summarizer);
-  }
-  if (proto_cb.has_debug_io()) {
-    const auto& params = proto_cb.debug_io();
-    const auto& phase = params.phase();
-    const auto& lvl = params.lvl();
-    if (phase == "train" || phase == "training") {
-      return new lbann_callback_debug_io(execution_mode::training, lvl);
-    } else if (phase == "validate" || phase == "validation") {
-      return new lbann_callback_debug_io(execution_mode::validation, lvl);
-    } else if (phase == "test" || phase == "testing") {
-      return new lbann_callback_debug_io(execution_mode::testing, lvl);
-    } else {
-      return new lbann_callback_debug_io();
+  factory_manager() {
+    register_default_builders(factory_);
     }
-  }
-  if (proto_cb.has_dump_weights()) {
-    const auto& params = proto_cb.dump_weights();
-    return new lbann_callback_dump_weights(params.basename());
-  }
-  if (proto_cb.has_dump_outputs()) {
-    const auto& params = proto_cb.dump_outputs();
-    const auto& layer_names = parse_set<>(params.layers());
-    const auto& modes = parse_set<execution_mode>(params.execution_modes());
-    return new lbann_callback_dump_outputs(layer_names,
-                                           modes,
-                                           params.batch_interval(),
-                                           params.directory(),
-                                           params.format());
-  }
-  if (proto_cb.has_dump_error_signals()) {
-    const auto& params = proto_cb.dump_error_signals();
-    return new lbann_callback_dump_error_signals(params.basename());
-  }
-  if (proto_cb.has_dump_gradients()) {
-    const auto& params = proto_cb.dump_gradients();
-    return new lbann_callback_dump_gradients(params.basename(),
-                                             params.interval());
-  }
-  if (proto_cb.has_dump_mb_indices()) {
-    const auto& params = proto_cb.dump_mb_indices();
-    return new lbann_callback_dump_minibatch_sample_indices(params.basename(),
-                                                            params.interval());
-  }
-  if (proto_cb.has_check_dataset()) {
-    return new lbann_callback_check_dataset();
-  }
-  if (proto_cb.has_check_small()) {
-    return new lbann_callback_checksmall();
-  }
-  if (proto_cb.has_check_nan()) {
-    return new lbann_callback_checknan();
-  }
-  if (proto_cb.has_hang()) {
-    const auto& rank_to_hang = proto_cb.hang().rank();
-    if (comm->am_world_master()) {
-      if (rank_to_hang == -1) {
-        std::cout << "*** HANGING EVERY RANK IN HANG CALLBACK ***"
-                  << std::endl;
-      } else {
-        std::cout << "*** HANGING RANK " << rank_to_hang
-                  << " IN HANG CALLBACK ***" << std::endl;
-      }
-    }
-    return new lbann_callback_hang(rank_to_hang);
-  }
-  if (proto_cb.has_check_gradients()) {
-    const auto& params = proto_cb.check_gradients();
-    return new lbann_callback_check_gradients(params.step_size(),
-                                              params.verbose(),
-                                              params.error_on_failure());
-  }
-  if (proto_cb.has_check_metric()) {
-    const auto& params = proto_cb.check_metric();
-    const auto& modes = parse_set<execution_mode>(params.execution_modes());
-    return new lbann_callback_check_metric(params.metric(),
-                                           modes,
-                                           params.lower_bound(),
-                                           params.upper_bound(),
-                                           params.error_on_failure());
-  }
-
-  //////////////////////////////////////////////////////////////
-  // GPU memory profiling
-  //////////////////////////////////////////////////////////////
-  if (proto_cb.has_gpu_memory_usage()) {
-    return new lbann_callback_gpu_memory_usage();
-  }
+};
 
-  //////////////////////////////////////////////////////////////
-  // Hyperparameter exploration
-  //////////////////////////////////////////////////////////////
-  if (proto_cb.has_perturb_adam()) {
-    const auto& params = proto_cb.perturb_adam();
-    return new lbann_callback_perturb_adam(
-      params.learning_rate_factor(),
-      params.beta1_factor(),
-      params.beta2_factor(),
-      params.eps_factor(),
-      params.perturb_during_training(),
-      params.batch_interval(),
-      parse_set<std::string>(params.weights()));
-  }
+factory_manager factory_mgr_;
+factory_type const& get_callback_factory() noexcept
+{
+  return factory_mgr_.factory_;
+}
 
-  if (proto_cb.has_perturb_dropout()) {
-    const auto& params = proto_cb.perturb_dropout();
-    return new lbann_callback_perturb_dropout(
-      params.keep_dropout_factor(),
-      parse_set<std::string>(params.layers()));
-  }
+} // namespace
 
-  //////////////////////////////////////////////////////////////
-  // Data augmentation
-  //////////////////////////////////////////////////////////////
-  if (proto_cb.has_mixup()) {
-    const auto& params = proto_cb.mixup();
-    const auto& layers_list = parse_list<std::string>(params.layers());
-    std::unordered_set<std::string> layers(layers_list.begin(),
-                                           layers_list.end());
-    return new callback_mixup(layers, params.alpha());
-  }
+std::unique_ptr<lbann_callback>
+construct_callback(
+  const google::protobuf::Message& proto_msg, lbann_summary* summarizer) {
-  return nullptr;
+  auto const& factory = get_callback_factory();
+  auto const& msg =
+    proto_helpers::get_oneof_message(proto_msg, "callback_type");
+  return factory.create_object(msg.GetDescriptor()->name(), msg, summarizer);
 }
 
 lbann_summary* construct_summarizer(lbann_comm* comm,
diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp
index c3ceb22e725..ca3d55da361 100644
--- a/src/proto/factories/model_factory.cpp
+++ b/src/proto/factories/model_factory.cpp
@@ -269,14 +269,10 @@ model* construct_model(lbann_comm* comm,
   assign_layers_to_metrics(layer_pointers, metric_list, proto_model);
 
   // Construct callbacks
-  std::vector<lbann_callback*> callback_list;
+  std::vector<std::unique_ptr<lbann_callback>> callback_list;
   auto&& summarizer = construct_summarizer(comm, proto_model);
   for (int i=0; i<proto_model.callback_size(); ++i) {
-    callback_list.push_back(construct_callback(comm,
-                                               proto_model.callback(i),
-                                               data_readers,
-                                               layer_pointers,
-                                               weights_list,
-                                               summarizer));
+    callback_list.push_back(construct_callback(proto_model.callback(i),
+                                               summarizer));
   }
@@ -300,7 +296,7 @@ model* construct_model(lbann_comm* comm,
   for (auto&& l : layer_list ) { m->add_layer(std::move(l)); }
   for (auto&& w : weights_list ) { m->add_weights(w); }
   for (auto&& met : metric_list ) { m->add_metric(met); }
-  for (auto&& cb : callback_list) { m->add_callback(cb); }
+  for (auto&& cb : callback_list) { m->add_callback(cb.release()); }
   const auto& name = proto_model.name();
   if (!name.empty()) {
     m->set_name(name);
diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto
index 25478afb88c..557f5693c46 100644
--- a/src/proto/lbann.proto
+++ b/src/proto/lbann.proto
@@ -404,48 +404,50 @@ message SGD {
 // Callbacks
//======================================================================== message Callback { - // a Callback should contain exactly one of the following - CallbackPrint print = 1; - CallbackTimer timer = 2; - CallbackSummary summary = 3; - CallbackDumpWeights dump_weights = 4; - CallbackDumpOutputs dump_outputs = 5; - CallbackDumpErrorSignals dump_error_signals = 35; - CallbackDumpGradients dump_gradients = 6; - CallbackDumpMBIndices dump_mb_indices = 7; - CallbackDispIOStats disp_io_stats = 8; - CallbackImComm imcomm = 9; - CallbackSaveImages save_images = 10; - CallbackDebug debug = 11; - CallbackAdaptiveLearningRate adaptive_learning_rate = 12; - CallbackStepLearningRate step_learning_rate = 13; - CallbackCustomLearningRate custom_learning_rate = 14; - CallbackCheckSmall check_small = 15; - CallbackCheckNaN check_nan = 16; - CallbackCheckDataset check_dataset = 17; - CallbackHang hang = 18; - CallbackDropFixedLearningRate drop_fixed_learning_rate = 19; - CallbackLinearGrowthLearningRate linear_growth_learning_rate = 20; - CallbackProfiler profiler = 21; - CallbackStepMinibatch step_minibatch = 22; - CallbackCheckGradients check_gradients = 23; - CallbackLTFB ltfb = 24; - CallbackDebugIO debug_io = 25; - CallbackMinibatchSchedule minibatch_schedule = 26; - CallbackOptimizerwiseAdaptiveLearningRate optimizerwise_adaptive_learning_rate = 27; - CallbackCheckpoint checkpoint = 28; - CallbackSaveModel save_model = 29; - CallbackPolyLearningRate poly_learning_rate = 30; - CallbackReplaceWeights replace_weights = 31; - CallbackGPUMemoryUsage gpu_memory_usage = 32; - CallbackSyncLayers sync_layers = 33; - CallbackSyncSelected sync_selected = 34; - CallbackConfusionMatrix confusion_matrix = 36; - CallbackCheckMetric check_metric = 37; - CallbackPerturbAdam perturb_adam = 38; - CallbackPerturbDropout perturb_dropout = 39; - CallbackSaveTopKModels save_topk_models = 40; - CallbackMixup mixup = 41; + // a Callback should contain exactly one of the following + oneof callback_type { + CallbackPrint print = 1; + CallbackTimer timer = 2; + CallbackSummary summary = 3; + CallbackDumpWeights dump_weights = 4; + CallbackDumpOutputs dump_outputs = 5; + CallbackDumpErrorSignals dump_error_signals = 35; + CallbackDumpGradients dump_gradients = 6; + CallbackDumpMBIndices dump_mb_indices = 7; + CallbackDispIOStats disp_io_stats = 8; + CallbackImComm imcomm = 9; + CallbackSaveImages save_images = 10; + CallbackDebug debug = 11; + CallbackAdaptiveLearningRate adaptive_learning_rate = 12; + CallbackStepLearningRate step_learning_rate = 13; + CallbackCustomLearningRate custom_learning_rate = 14; + CallbackCheckSmall check_small = 15; + CallbackCheckNaN check_nan = 16; + CallbackCheckDataset check_dataset = 17; + CallbackHang hang = 18; + CallbackDropFixedLearningRate drop_fixed_learning_rate = 19; + CallbackLinearGrowthLearningRate linear_growth_learning_rate = 20; + CallbackProfiler profiler = 21; + CallbackStepMinibatch step_minibatch = 22; + CallbackCheckGradients check_gradients = 23; + CallbackLTFB ltfb = 24; + CallbackDebugIO debug_io = 25; + CallbackMinibatchSchedule minibatch_schedule = 26; + CallbackOptimizerwiseAdaptiveLearningRate optimizerwise_adaptive_learning_rate = 27; + CallbackCheckpoint checkpoint = 28; + CallbackSaveModel save_model = 29; + CallbackPolyLearningRate poly_learning_rate = 30; + CallbackReplaceWeights replace_weights = 31; + CallbackGPUMemoryUsage gpu_memory_usage = 32; + CallbackSyncLayers sync_layers = 33; + CallbackSyncSelected sync_selected = 34; + CallbackConfusionMatrix 
confusion_matrix = 36; + CallbackCheckMetric check_metric = 37; + CallbackPerturbAdam perturb_adam = 38; + CallbackPerturbDropout perturb_dropout = 39; + CallbackSaveTopKModels save_topk_models = 40; + CallbackMixup mixup = 41; + } } message CallbackLTFB { diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 089fd1c39ad..b86c9f0fc15 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -1,3 +1,29 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + #include "lbann/proto/proto_common.hpp" #include "lbann/lbann.hpp" diff --git a/src/proto/proto_helpers.cpp b/src/proto/proto_helpers.cpp new file mode 100644 index 00000000000..a9ea573c136 --- /dev/null +++ b/src/proto/proto_helpers.cpp @@ -0,0 +1,69 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
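
Editor's note: with the Callback message wrapped in a oneof, at most one callback submessage can be set per Callback, and construct_callback no longer needs a hand-maintained if/else chain: it extracts whichever oneof field is populated (via the get_oneof_message helper implemented just below) and asks the factory for the builder registered under that submessage's type name. A minimal usage sketch in C++, assuming the default builders have been registered (the factory_manager global above guarantees this); the message contents and variable names here are illustrative, not code from this patch:

    // Sketch: one populated Callback flowing through the new factory path.
    lbann_data::Callback cb;
    cb.mutable_print()->set_interval(1);   // selects the "print" oneof field
    std::unique_ptr<lbann_callback> cb_obj =
      construct_callback(cb, /*summarizer=*/nullptr);
    // Internally, get_oneof_message(cb, "callback_type") returns the
    // CallbackPrint submessage, and its Descriptor name "CallbackPrint"
    // is the factory key -- the same string used in register_builder above.
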
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/proto/proto_helpers.hpp" +#include "lbann/utils/exception.hpp" + +#include +#include + +#include + +namespace lbann { +namespace proto { +namespace proto_helpers { + +google::protobuf::Message const& +get_oneof_message( + google::protobuf::Message const& msg_in, std::string const& oneof_name) +{ + auto&& desc = msg_in.GetDescriptor(); + auto&& reflex = msg_in.GetReflection(); + auto&& oneof_handle = desc->FindOneofByName(oneof_name); + if (!oneof_handle) + { + std::string msg_string; + google::protobuf::TextFormat::PrintToString(msg_in, &msg_string); + LBANN_ERROR(std::string("Message has no oneof field named \"") + + oneof_name + "\"\n\nMessage(" + + desc->DebugString() +"):\n\n" + + msg_string); + } + + auto&& oneof_field = reflex->GetOneofFieldDescriptor(msg_in, oneof_handle); + + if (!oneof_field) + LBANN_ERROR("Oneof field in message has not been set."); + + if (oneof_field->type() != google::protobuf::FieldDescriptor::TYPE_MESSAGE) + LBANN_ERROR("Oneof field is not of message type."); + + return reflex->GetMessage(msg_in, oneof_field); +} + +}// namespace proto_helpers +}// namespace proto +}// namespace lbann From 0fe519590eb7aad3d9de5f55bb636f2110773f9b Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Sat, 20 Jul 2019 14:33:09 -0700 Subject: [PATCH 133/634] fix base.hpp atrocities --- include/lbann/base.hpp | 23 ++++++++++--------- src/callbacks/callback_debug.cpp | 2 +- src/callbacks/callback_debug_io.cpp | 6 ++--- ...callback_dump_minibatch_sample_indices.cpp | 2 +- src/callbacks/callback_dump_outputs.cpp | 2 +- 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/include/lbann/base.hpp b/include/lbann/base.hpp index 76e38fb0f3f..10c652a8f00 100644 --- a/include/lbann/base.hpp +++ b/include/lbann/base.hpp @@ -39,6 +39,7 @@ #include "lbann/utils/omp_pragma.hpp" #include +#include namespace lbann { @@ -118,7 +119,7 @@ enum class matrix_format {MC_MR, CIRC_CIRC, STAR_STAR, STAR_VC, MC_STAR, invalid /// Data layout that is optimized for different modes of parallelism enum class data_layout {MODEL_PARALLEL, DATA_PARALLEL, invalid}; -static matrix_format __attribute__((used)) data_layout_to_matrix_format(data_layout layout) { +inline matrix_format data_layout_to_matrix_format(data_layout layout) { matrix_format format; switch(layout) { case data_layout::MODEL_PARALLEL: @@ -129,14 +130,14 @@ static matrix_format __attribute__((used)) data_layout_to_matrix_format(data_lay format = matrix_format::STAR_STAR; break; default: - throw(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " Invalid data layout selected"); + throw std::runtime_error("Invalid data layout selected"); } return format; } /// Neural network execution mode enum class execution_mode {training, validation, testing, prediction, invalid}; -static const char *__attribute__((used)) _to_string(execution_mode m) { +inline std::string to_string(execution_mode m) { switch(m) { case execution_mode::training: return "training"; @@ -149,7 +150,7 @@ static const char *__attribute__((used)) _to_string(execution_mode m) { case execution_mode::invalid: return "invalid"; default: - throw("Invalid execution mode specified"); /// @todo this should be an lbann_exception but then the class has to move to resolve dependencies + throw std::runtime_error("Invalid execution mode specified"); } } @@ -168,7 +169,7 @@ enum class data_reader_target_mode {CLASSIFICATION, REGRESSION, RECONSTRUCTION, * It checks if the string 
'mainStr' ends with given string * 'toMatch' */ -static bool __attribute__((used)) endsWith(const std::string mainStr, const std::string &toMatch) +inline bool endsWith(const std::string mainStr, const std::string &toMatch) { if(mainStr.size() >= toMatch.size() && mainStr.compare(mainStr.size() - toMatch.size(), toMatch.size(), toMatch) == 0) @@ -178,19 +179,19 @@ static bool __attribute__((used)) endsWith(const std::string mainStr, const std: } /// Print the dimensions and name of a Elemental matrix -static void __attribute__((used)) _print_matrix_dims(AbsDistMat *m, const char *name) { +inline void print_matrix_dims(AbsDistMat *m, const char *name) { std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; } -#define PRINT_MATRIX_DIMS(x) _print_matrix_dims(x, #x); +#define PRINT_MATRIX_DIMS(x) print_matrix_dims(x, #x); /// Print the dimensions and name of a Elemental matrix -static void __attribute__((used)) _print_local_matrix_dims(AbsMat *m, const char *name) { +inline void print_local_matrix_dims(AbsMat *m, const char *name) { std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; } -#define PRINT_LOCAL_MATRIX_DIMS(x) _print_local_matrix_dims(x, #x); +#define PRINT_LOCAL_MATRIX_DIMS(x) print_local_matrix_dims(x, #x); -#define LBANN_MAKE_STR(x) _LBANN_MAKE_STR(x) -#define _LBANN_MAKE_STR(x) #x +#define LBANN_MAKE_STR_(x) #x +#define LBANN_MAKE_STR(x) LBANN_MAKE_STR_(x) } // namespace lbann diff --git a/src/callbacks/callback_debug.cpp b/src/callbacks/callback_debug.cpp index 947088c3151..6f190ad922b 100644 --- a/src/callbacks/callback_debug.cpp +++ b/src/callbacks/callback_debug.cpp @@ -66,7 +66,7 @@ std::string weights_string(const weights& w) { std::string batch_step_string(const model& m) { std::stringstream msg; const auto& mode = m.get_execution_mode(); - msg << _to_string(mode) << " batch " << m.get_step(); + msg << to_string(mode) << " batch " << m.get_step(); msg << " (epoch " << m.get_epoch() << ")"; return msg.str(); } diff --git a/src/callbacks/callback_debug_io.cpp b/src/callbacks/callback_debug_io.cpp index feaedd4655f..108574da28c 100644 --- a/src/callbacks/callback_debug_io.cpp +++ b/src/callbacks/callback_debug_io.cpp @@ -58,7 +58,7 @@ void lbann_callback_debug_io::print_fp_start(model *m, generic_input_layer *inpu std::cout << "[" << m->get_comm()->get_trainer_rank() << "." << m->get_comm()->get_rank_in_trainer() << "] @" << m->get_epoch() << "." << step - << " Phase: " << _to_string(m->get_execution_mode()) + << " Phase: " << to_string(m->get_execution_mode()) << " starting forward propagation for layer " << input->get_name() << " type: " << input->get_type() << " iteration: " << input->get_data_reader()->get_current_mini_batch_index() @@ -92,7 +92,7 @@ void lbann_callback_debug_io::print_phase_start(model *m, execution_mode mode) { std::cout << "[" << m->get_comm()->get_trainer_rank() << "." << m->get_comm()->get_rank_in_trainer() << "] @" << 0 << "." << step - << " Starting Phase: " << _to_string(mode) + << " Starting Phase: " << to_string(mode) << " " << (data_reader->get_num_iterations_per_epoch() - 1) << "i @ " << data_reader->get_global_mini_batch_size() << "s (=" << m->get_comm()->get_num_trainers() @@ -112,7 +112,7 @@ void lbann_callback_debug_io::print_phase_start(model *m, execution_mode mode) { std::cout << "[" << m->get_comm()->get_trainer_rank() << "." << m->get_comm()->get_rank_in_trainer() << "] @" << 0 << "." 
<< step - << " Starting Phase: " << _to_string(mode) + << " Starting Phase: " << to_string(mode) << " " << (data_reader->get_num_iterations_per_epoch()) << "i " << " par. readers = " << data_reader->get_num_parallel_readers() diff --git a/src/callbacks/callback_dump_minibatch_sample_indices.cpp b/src/callbacks/callback_dump_minibatch_sample_indices.cpp index 1ef7d39c3bf..b42ae8a4c6f 100644 --- a/src/callbacks/callback_dump_minibatch_sample_indices.cpp +++ b/src/callbacks/callback_dump_minibatch_sample_indices.cpp @@ -58,7 +58,7 @@ void lbann_callback_dump_minibatch_sample_indices::dump_to_file(model *m, Layer const std::string file = (m_basename - + _to_string(m->get_execution_mode()) + + to_string(m->get_execution_mode()) + "-model" + std::to_string(m->get_comm()->get_trainer_rank()) + "-rank" + std::to_string(m->get_comm()->get_rank_in_trainer()) + "-epoch" + std::to_string(m->get_epoch()) diff --git a/src/callbacks/callback_dump_outputs.cpp b/src/callbacks/callback_dump_outputs.cpp index 612a2b198f8..7b5a5416623 100644 --- a/src/callbacks/callback_dump_outputs.cpp +++ b/src/callbacks/callback_dump_outputs.cpp @@ -157,7 +157,7 @@ void lbann_callback_dump_outputs::dump_outputs(const model& m, const Layer& l) { const auto& data = static_cast(circ_data.LockedMatrix()); const std::string file_name = (m_directory + m.get_name() - + "-" + _to_string(mode) + + "-" + to_string(mode) + "-epoch" + std::to_string(epoch) + "-step" + std::to_string(step) + "-" + l.get_name() From c42df5f7522f17a429353f783a6230bf558d45cb Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Sat, 20 Jul 2019 15:08:36 -0700 Subject: [PATCH 134/634] simplify working with execution_mode in the parse_list and parse_set functions --- include/lbann/base.hpp | 4 ++ include/lbann/proto/factories.hpp | 23 ---------- include/lbann/proto/proto_common.hpp | 21 +++++++++ src/base.cpp | 22 ++++++++++ src/callbacks/callback_check_metric.cpp | 2 +- src/callbacks/callback_debug.cpp | 2 +- src/callbacks/callback_dump_outputs.cpp | 4 +- src/callbacks/callback_ltfb.cpp | 2 +- src/callbacks/callback_mixup.cpp | 2 +- src/callbacks/callback_perturb_adam.cpp | 2 +- src/callbacks/callback_perturb_dropout.cpp | 2 +- src/callbacks/callback_save_images.cpp | 2 +- src/proto/factories/CMakeLists.txt | 1 - src/proto/factories/factories.cpp | 51 ---------------------- src/proto/proto_common.cpp | 6 +-- 15 files changed, 59 insertions(+), 87 deletions(-) delete mode 100644 src/proto/factories/factories.cpp diff --git a/include/lbann/base.hpp b/include/lbann/base.hpp index 10c652a8f00..3f5d5668648 100644 --- a/include/lbann/base.hpp +++ b/include/lbann/base.hpp @@ -153,6 +153,10 @@ inline std::string to_string(execution_mode m) { throw std::runtime_error("Invalid execution mode specified"); } } +/** @brief Convert a string to an execution_mode. */ +execution_mode from_string(std::string const& str); +/** @brief Extract an execution_mode from a stream. */ +std::istream& operator>>(std::istream& os, execution_mode& e); /** Pooling layer mode */ enum class pool_mode {invalid, max, average, average_no_pad}; diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index 15285c22930..59b05bd24e2 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -86,29 +86,6 @@ std::unique_ptr construct_transform( transform::transform_pipeline construct_transform_pipeline( const lbann_data::Reader& data_reader); -/** Parse a space-separated list. 
*/ -template -std::vector parse_list(std::string str) { - std::vector list; - std::stringstream ss(str); - for (T entry; ss >> entry;) { - list.push_back(entry); - } - return list; -} -template <> -std::vector parse_list(std::string str); - -/** Parse a space-separated set. */ -template -std::set parse_set(std::string str) { - std::set set; - for (const auto& entry : parse_list(str)) { - set.insert(entry); - } - return set; -} - } // namespace proto } // namespace lbann diff --git a/include/lbann/proto/proto_common.hpp b/include/lbann/proto/proto_common.hpp index 9a6825b53ed..1f9acb6e252 100644 --- a/include/lbann/proto/proto_common.hpp +++ b/include/lbann/proto/proto_common.hpp @@ -95,6 +95,27 @@ bool write_prototext_file( const std::string& fn, lbann_data::LbannPB& pb); +/** @brief Parse a space-separated list. */ +template +std::vector parse_list(std::string str) { + std::vector list; + std::istringstream ss(str); + for (T entry; ss >> entry;) { + list.push_back(entry); + } + return list; +} + +/** @brief Parse a space-separated set. */ +template +std::set parse_set(std::string str) { + std::set set; + std::istringstream iss(str); + for (T entry; iss >> entry;) { + set.insert(entry); + } + return set; +} } // namespace lbann #endif // LBANN_PROTO_PROTO_COMMON_HPP_INCLUDED diff --git a/src/base.cpp b/src/base.cpp index f67fd982d84..d8cfd74c8ce 100644 --- a/src/base.cpp +++ b/src/base.cpp @@ -108,4 +108,26 @@ std::string get_pool_mode_name(pool_mode m) { return pool_mode_names[(int)m]; } +execution_mode from_string(std::string const& str) { + if (str == "training") + return execution_mode::training; + else if (str == "validation") + return execution_mode::validation; + else if (str == "testing") + return execution_mode::testing; + else if (str == "prediction") + return execution_mode::prediction; + else if (str == "invalid") + return execution_mode::invalid; + else + LBANN_ERROR("\"" + str + "\" is not a valid execution mode."); +} + +std::istream& operator>>(std::istream& is, execution_mode& m) { + std::string tmp; + is >> tmp; + m = from_string(tmp); + return is; +} + } // namespace lbann diff --git a/src/callbacks/callback_check_metric.cpp b/src/callbacks/callback_check_metric.cpp index 7b9efa0d395..a0919854e68 100644 --- a/src/callbacks/callback_check_metric.cpp +++ b/src/callbacks/callback_check_metric.cpp @@ -94,7 +94,7 @@ build_callback_check_metric_from_pbuf( const auto& params = dynamic_cast(proto_msg); const auto& modes = - proto::parse_set(params.execution_modes()); + parse_set(params.execution_modes()); return make_unique(params.metric(), modes, params.lower_bound(), diff --git a/src/callbacks/callback_debug.cpp b/src/callbacks/callback_debug.cpp index 6f190ad922b..5a9af62ac73 100644 --- a/src/callbacks/callback_debug.cpp +++ b/src/callbacks/callback_debug.cpp @@ -164,7 +164,7 @@ build_callback_debug_from_pbuf(const google::protobuf::Message& proto_msg, dynamic_cast(proto_msg); // FIXME TRB const auto& modes = - proto::parse_set(params.phase()); + parse_set(params.phase()); return make_unique(modes, summarizer); } diff --git a/src/callbacks/callback_dump_outputs.cpp b/src/callbacks/callback_dump_outputs.cpp index 7b5a5416623..31e95f03066 100644 --- a/src/callbacks/callback_dump_outputs.cpp +++ b/src/callbacks/callback_dump_outputs.cpp @@ -186,9 +186,9 @@ build_callback_dump_outputs_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - const auto& layer_names = proto::parse_set<>(params.layers()); + const auto& 
layer_names = parse_set<>(params.layers()); const auto& modes = - proto::parse_set(params.execution_modes()); + parse_set(params.execution_modes()); return make_unique(layer_names, modes, params.batch_interval(), diff --git a/src/callbacks/callback_ltfb.cpp b/src/callbacks/callback_ltfb.cpp index 70772d61b38..2fe90186d1d 100644 --- a/src/callbacks/callback_ltfb.cpp +++ b/src/callbacks/callback_ltfb.cpp @@ -533,7 +533,7 @@ build_callback_ltfb_from_pbuf( return make_unique( params.batch_interval(), params.metric(), - proto::parse_set(params.weights()), + parse_set(params.weights()), params.low_score_wins(), lbann_callback_ltfb::string_to_comm_algo(params.communication_algorithm()), params.exchange_hyperparameters(), diff --git a/src/callbacks/callback_mixup.cpp b/src/callbacks/callback_mixup.cpp index c982c156835..a24941698d7 100644 --- a/src/callbacks/callback_mixup.cpp +++ b/src/callbacks/callback_mixup.cpp @@ -102,7 +102,7 @@ build_callback_mixup_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - const auto& layers_list = proto::parse_list(params.layers()); + const auto& layers_list = parse_list(params.layers()); std::unordered_set layers(layers_list.begin(), layers_list.end()); return make_unique(layers, params.alpha()); diff --git a/src/callbacks/callback_perturb_adam.cpp b/src/callbacks/callback_perturb_adam.cpp index b73cd378c22..65a32f8f0bf 100644 --- a/src/callbacks/callback_perturb_adam.cpp +++ b/src/callbacks/callback_perturb_adam.cpp @@ -173,7 +173,7 @@ build_callback_perturb_adam_from_pbuf( params.eps_factor(), params.perturb_during_training(), params.batch_interval(), - proto::parse_set(params.weights())); + parse_set(params.weights())); } } // namespace lbann diff --git a/src/callbacks/callback_perturb_dropout.cpp b/src/callbacks/callback_perturb_dropout.cpp index 74c6879c107..9df485628bf 100644 --- a/src/callbacks/callback_perturb_dropout.cpp +++ b/src/callbacks/callback_perturb_dropout.cpp @@ -124,7 +124,7 @@ build_callback_perturb_dropout_from_pbuf( dynamic_cast(proto_msg); return make_unique( params.keep_dropout_factor(), - proto::parse_set(params.layers())); + parse_set(params.layers())); } } // namespace lbann diff --git a/src/callbacks/callback_save_images.cpp b/src/callbacks/callback_save_images.cpp index 140d067f3bc..1bb29a22df1 100644 --- a/src/callbacks/callback_save_images.cpp +++ b/src/callbacks/callback_save_images.cpp @@ -162,7 +162,7 @@ build_callback_save_images_from_pbuf( const auto& params = dynamic_cast(proto_msg); return make_unique( - proto::parse_list<>(params.layers()), + parse_list<>(params.layers()), params.image_format(), params.image_prefix()); } diff --git a/src/proto/factories/CMakeLists.txt b/src/proto/factories/CMakeLists.txt index 45d987ab318..05c1259463a 100644 --- a/src/proto/factories/CMakeLists.txt +++ b/src/proto/factories/CMakeLists.txt @@ -1,7 +1,6 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES callback_factory.cpp - factories.cpp layer_factory.cpp layer_graph_factory.cpp model_factory.cpp diff --git a/src/proto/factories/factories.cpp b/src/proto/factories/factories.cpp deleted file mode 100644 index 617e639d8c1..00000000000 --- a/src/proto/factories/factories.cpp +++ /dev/null @@ -1,51 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. 
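
Editor's note on the design here: deleting the parse_list<execution_mode> specialization works because parse_list and parse_set read whitespace-separated tokens through operator>>, so defining a stream extractor for execution_mode (as base.cpp now does) lets the unspecialized templates in proto_common.hpp handle it, along with any other stream-extractable type. A small sketch with illustrative values:

    // The generic parsers now cover execution_mode via its operator>>.
    auto modes = parse_set<execution_mode>("training validation");
    // modes == {execution_mode::training, execution_mode::validation}
    auto sizes = parse_list<int>("64 128 256");

One behavioral difference worth noting: the extractor delegates to from_string(), which accepts only the full mode names, so short forms like "train" or "validate" that the deleted specialization tolerated now raise an LBANN_ERROR.
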
-// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/proto/factories.hpp" - -namespace lbann { -namespace proto { - -/** Parse a space-separated list of execution modes. */ -template <> -std::vector parse_list(std::string str) { - std::vector list; - for (const auto& mode : parse_list(str)) { - if (mode == "train" || mode == "training") { - list.push_back(execution_mode::training); - } else if (mode == "validate" || mode == "validation") { - list.push_back(execution_mode::validation); - } else if (mode == "test" || mode == "testing") { - list.push_back(execution_mode::testing); - } else { - LBANN_ERROR("invalid execution mode (\"" + mode + "\")"); - } - } - return list; -} - -} // namespace proto -} // namespace lbann diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index b86c9f0fc15..be0179ded90 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -336,14 +336,14 @@ void init_data_readers( if (readme.num_labels() != 0) { reader = new data_reader_synthetic( readme.num_samples(), - proto::parse_list(readme.synth_dimensions()), + parse_list(readme.synth_dimensions()), readme.num_labels(), shuffle); } else { reader = new data_reader_synthetic( readme.num_samples(), - proto::parse_list(readme.synth_dimensions()), - proto::parse_list(readme.synth_response_dimensions()), + parse_list(readme.synth_dimensions()), + parse_list(readme.synth_response_dimensions()), shuffle); } } else if (name == "mesh") { From a58cd31f81384422ca69de017e4596f1ce2d822c Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Sat, 20 Jul 2019 15:55:24 -0700 Subject: [PATCH 135/634] move callbacks to a separate proto file --- python/lbann/__init__.py | 2 +- python/lbann/callback.py | 6 +- src/callbacks/callback_check_gradients.cpp | 2 +- src/callbacks/callback_debug.cpp | 2 +- src/callbacks/callback_dump_error_signals.cpp | 2 +- src/callbacks/callback_dump_gradients.cpp | 2 +- ...callback_dump_minibatch_sample_indices.cpp | 2 +- src/callbacks/callback_dump_outputs.cpp | 2 +- src/callbacks/callback_hang.cpp | 2 +- src/callbacks/callback_mixup.cpp | 2 +- src/callbacks/callback_save_images.cpp | 2 +- src/proto/CMakeLists.txt | 14 +- src/proto/callbacks.proto | 333 +++++++++++++++++ src/proto/lbann.proto | 336 ++---------------- 14 files changed, 384 insertions(+), 325 deletions(-) create mode 100644 src/proto/callbacks.proto diff --git a/python/lbann/__init__.py b/python/lbann/__init__.py index 7b48f126daf..5036777b488 100644 --- a/python/lbann/__init__.py +++ b/python/lbann/__init__.py @@ -19,7 +19,7 @@ _lbann_exe = _config['Paths']['lbann_exe'] except: pass -import lbann_pb2 +import lbann_pb2, callbacks_pb2 def lbann_exe(): """LBANN executable.""" return _lbann_exe if _lbann_exe else 'lbann' diff --git a/python/lbann/callback.py b/python/lbann/callback.py index 151986e3db8..8319a0fe4b4 100644 --- a/python/lbann/callback.py +++ b/python/lbann/callback.py @@ -1,6 +1,6 @@ """Callbacks for neural network training.""" import abc -from lbann import lbann_pb2 +from lbann import callbacks_pb2 import lbann.util.class_generator class Callback(abc.ABC): @@ -11,13 +11,13 @@ def __init__(self): def export_proto(self): """Construct and return a protobuf message.""" - return lbann_pb2.Callback() + return callbacks_pb2.Callback() # Generate Callback sub-classes from lbann.proto # Note: The list of skip fields must be updated if any new fields are # added to the Callback message in lbann.proto classes = lbann.util.class_generator.generate_classes_from_protobuf_message( - lbann_pb2.Callback, + callbacks_pb2.Callback, base_class = Callback, base_has_export_proto = True) for c in classes: diff --git a/src/callbacks/callback_check_gradients.cpp b/src/callbacks/callback_check_gradients.cpp index 2e1192b7ad7..1e9cd248696 100644 --- a/src/callbacks/callback_check_gradients.cpp +++ b/src/callbacks/callback_check_gradients.cpp @@ -28,7 +28,7 @@ #include "lbann/layers/io/input/generic_input_layer.hpp" #include "lbann/data_readers/data_reader.hpp" -#include "lbann.pb.h" +#include "callbacks.pb.h" namespace lbann { diff --git a/src/callbacks/callback_debug.cpp b/src/callbacks/callback_debug.cpp index 5a9af62ac73..ef4f02e117e 100644 --- a/src/callbacks/callback_debug.cpp +++ b/src/callbacks/callback_debug.cpp @@ -29,7 +29,7 @@ #include "lbann/proto/factories.hpp" #include "lbann/utils/memory.hpp" -#include "lbann.pb.h" +#include "callbacks.pb.h" namespace lbann { diff --git a/src/callbacks/callback_dump_error_signals.cpp b/src/callbacks/callback_dump_error_signals.cpp index 6cb224ae1cf..c4e1b5a4b2c 100644 --- a/src/callbacks/callback_dump_error_signals.cpp +++ b/src/callbacks/callback_dump_error_signals.cpp @@ -26,7 +26,7 @@ #include "lbann/callbacks/callback_dump_error_signals.hpp" -#include +#include namespace lbann { diff --git a/src/callbacks/callback_dump_gradients.cpp b/src/callbacks/callback_dump_gradients.cpp index 8fec2e745fa..c8c0e0d34d1 100644 --- a/src/callbacks/callback_dump_gradients.cpp +++ b/src/callbacks/callback_dump_gradients.cpp @@ -28,7 +28,7 @@ #include "lbann/callbacks/callback_dump_gradients.hpp" 
-#include <lbann.pb.h>
+#include <callbacks.pb.h>
 
 #include
 
diff --git a/src/callbacks/callback_dump_minibatch_sample_indices.cpp b/src/callbacks/callback_dump_minibatch_sample_indices.cpp
index b42ae8a4c6f..13b390e8d90 100644
--- a/src/callbacks/callback_dump_minibatch_sample_indices.cpp
+++ b/src/callbacks/callback_dump_minibatch_sample_indices.cpp
@@ -31,7 +31,7 @@
 #include "lbann/callbacks/callback_dump_minibatch_sample_indices.hpp"
 #include "lbann/layers/io/input/input_layer.hpp"
 
-#include <lbann.pb.h>
+#include <callbacks.pb.h>
 
 #include
 #include
 
diff --git a/src/callbacks/callback_dump_outputs.cpp b/src/callbacks/callback_dump_outputs.cpp
index 31e95f03066..6f1bec3af77 100644
--- a/src/callbacks/callback_dump_outputs.cpp
+++ b/src/callbacks/callback_dump_outputs.cpp
@@ -30,7 +30,7 @@
 // FIXME TRB
 #include "lbann/proto/factories.hpp"
 
-#include <lbann.pb.h>
+#include <callbacks.pb.h>
 
 #ifdef LBANN_HAS_CNPY
 #include <cnpy.h>
 #endif
 
diff --git a/src/callbacks/callback_hang.cpp b/src/callbacks/callback_hang.cpp
index a6553772a51..891310de751 100644
--- a/src/callbacks/callback_hang.cpp
+++ b/src/callbacks/callback_hang.cpp
@@ -26,7 +26,7 @@
 #include "lbann/callbacks/callback_hang.hpp"
 
-#include <lbann.pb.h>
+#include <callbacks.pb.h>
 
 namespace lbann {
 
diff --git a/src/callbacks/callback_mixup.cpp b/src/callbacks/callback_mixup.cpp
index a24941698d7..f38b07a74c4 100644
--- a/src/callbacks/callback_mixup.cpp
+++ b/src/callbacks/callback_mixup.cpp
@@ -31,7 +31,7 @@
 #include "lbann/utils/exception.hpp"
 #include "lbann/utils/image.hpp"
 
-#include <lbann.pb.h>
+#include <callbacks.pb.h>
 
 #include
 
diff --git a/src/callbacks/callback_save_images.cpp b/src/callbacks/callback_save_images.cpp
index 1bb29a22df1..4d320f5c5b7 100644
--- a/src/callbacks/callback_save_images.cpp
+++ b/src/callbacks/callback_save_images.cpp
@@ -27,7 +27,7 @@
 #include "lbann/callbacks/callback_save_images.hpp"
 #include "lbann/proto/factories.hpp"
 
-#include <lbann.pb.h>
+#include <callbacks.pb.h>
 
 #ifdef LBANN_HAS_OPENCV
 #include
 #endif
 
diff --git a/src/proto/CMakeLists.txt b/src/proto/CMakeLists.txt
index 51e8a7f2fc6..0f8cc593494 100644
--- a/src/proto/CMakeLists.txt
+++ b/src/proto/CMakeLists.txt
@@ -6,18 +6,24 @@ if (LBANN_HAS_PROTOBUF)
   # implementation of "protobuf_generate_cpp" but it gives us a custom
   # command on which we can depend. Using this, when lbann.proto is
   # touched, CMake will rebuild the LbannProto library.
-  set(PROTO_SRCS "${CMAKE_CURRENT_BINARY_DIR}/lbann.pb.cc")
-  set(PROTO_HDRS "${CMAKE_CURRENT_BINARY_DIR}/lbann.pb.h")
-  set(PROTO_PY "${CMAKE_CURRENT_BINARY_DIR}/lbann_pb2.py")
+  set_full_path(PROTO_INPUTS lbann.proto callbacks.proto)
+
+  foreach (proto IN LISTS PROTO_INPUTS)
+    get_filename_component(name "${proto}" NAME_WE)
+    list(APPEND PROTO_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${name}.pb.cc")
+    list(APPEND PROTO_HDRS "${CMAKE_CURRENT_BINARY_DIR}/${name}.pb.h")
+    list(APPEND PROTO_PY "${CMAKE_CURRENT_BINARY_DIR}/${name}_pb2.py")
+  endforeach ()
 
   add_custom_command(
     COMMAND protobuf::protoc
     "--cpp_out=${CMAKE_CURRENT_BINARY_DIR}"
    "--python_out=${CMAKE_CURRENT_BINARY_DIR}"
     "-I" "${CMAKE_CURRENT_SOURCE_DIR}"
-    "${CMAKE_CURRENT_SOURCE_DIR}/lbann.proto"
+    "${PROTO_INPUTS}"
     OUTPUT ${PROTO_SRCS} ${PROTO_HDRS} ${PROTO_PY}
     DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/lbann.proto" protobuf::protoc
     COMMENT "Running protoc on lbann.proto"
+    COMMAND_EXPAND_LISTS
     VERBATIM)
 
   add_custom_target(LbannProto_genSrc
diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto
new file mode 100644
index 00000000000..dd59ac1537e
--- /dev/null
+++ b/src/proto/callbacks.proto
@@ -0,0 +1,333 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+//////////////////////////////////////////////////////////////////////////////// + +syntax = "proto3"; + +package lbann_data; + +message Callback { + // a Callback should contain exactly one of the following + oneof callback_type { + CallbackPrint print = 1; + CallbackTimer timer = 2; + CallbackSummary summary = 3; + CallbackDumpWeights dump_weights = 4; + CallbackDumpOutputs dump_outputs = 5; + CallbackDumpErrorSignals dump_error_signals = 35; + CallbackDumpGradients dump_gradients = 6; + CallbackDumpMBIndices dump_mb_indices = 7; + CallbackDispIOStats disp_io_stats = 8; + CallbackImComm imcomm = 9; + CallbackSaveImages save_images = 10; + CallbackDebug debug = 11; + CallbackAdaptiveLearningRate adaptive_learning_rate = 12; + CallbackStepLearningRate step_learning_rate = 13; + CallbackCustomLearningRate custom_learning_rate = 14; + CallbackCheckSmall check_small = 15; + CallbackCheckNaN check_nan = 16; + CallbackCheckDataset check_dataset = 17; + CallbackHang hang = 18; + CallbackDropFixedLearningRate drop_fixed_learning_rate = 19; + CallbackLinearGrowthLearningRate linear_growth_learning_rate = 20; + CallbackProfiler profiler = 21; + CallbackStepMinibatch step_minibatch = 22; + CallbackCheckGradients check_gradients = 23; + CallbackLTFB ltfb = 24; + CallbackDebugIO debug_io = 25; + CallbackMinibatchSchedule minibatch_schedule = 26; + CallbackOptimizerwiseAdaptiveLearningRate optimizerwise_adaptive_learning_rate = 27; + CallbackCheckpoint checkpoint = 28; + CallbackSaveModel save_model = 29; + CallbackPolyLearningRate poly_learning_rate = 30; + CallbackReplaceWeights replace_weights = 31; + CallbackGPUMemoryUsage gpu_memory_usage = 32; + CallbackSyncLayers sync_layers = 33; + CallbackSyncSelected sync_selected = 34; + CallbackConfusionMatrix confusion_matrix = 36; + CallbackCheckMetric check_metric = 37; + CallbackPerturbAdam perturb_adam = 38; + CallbackPerturbDropout perturb_dropout = 39; + CallbackSaveTopKModels save_topk_models = 40; + CallbackMixup mixup = 41; + } +} + +message CallbackLTFB { + int64 batch_interval = 1; + string metric = 2; + string weights = 3; // default: all weights + bool low_score_wins = 4; + string communication_algorithm = 5; // default: "sendrecv_weights" + bool exchange_hyperparameters = 6; +} + +message CallbackStepLearningRate { + string weights = 1; //default: all weights + int64 step = 2; + double amt = 3; +} + +message CallbackCustomLearningRate { + //don't know how to support this, since it takes an std::function as an argument +} + +message CallbackAdaptiveLearningRate { + string weights = 1; //default: all weights + int64 patience = 2; + double amt = 3; +} + +message CallbackSaveImages { + string layers = 1; // Layer outputs to save as images + string image_format = 2; // Image format (e.g. 
jpg, png, pgm) + string image_prefix = 3; // Prefix for saved image files +} + +message CallbackPrint { + int64 interval = 1; //default in lbann_callback_print.hpp is 1 + bool print_global_stat_only = 2; //useful in large scale multi-trainer, default is false +} + +message CallbackProfiler { + bool sync = 1; + bool skip_init = 2; +} + +message CallbackTimer { +} + +message CallbackSummary { + string dir = 1; //directory for the lbann_summary + int64 batch_interval = 2; //default in lbann_callback_summary.hpp is 1 + int64 mat_interval = 3; //default in lbann_callback_summary.hpp is 25 +} + +message CallbackDumpWeights { + string basename = 1; +} + +message CallbackDumpOutputs { + string layers = 1; // Default: all layers + string execution_modes = 2; // Default: all modes + int64 batch_interval = 3; // Frequency for output dumping (default: all steps) + string directory = 4; // Directory for output files + string format = 5; // Options: csv, tsv, npy, npz (default: csv) +} + +message CallbackDumpErrorSignals { + string basename = 1; +} + +message CallbackDumpGradients { + string basename = 1; + int64 interval = 2; +} + +message CallbackDumpMBIndices { + string basename = 1; + int64 interval = 2; +} + +message CallbackDispIOStats { + string layers = 1; //e.g: "2 4 5"; use "10000" to apply to all layers +} + +message CallbackImComm { + string intertrainer_comm_method = 1; + bool all_optimizers = 2; +} + +message CallbackDebug { + string phase = 1; //should be called "modes" +} + +message CallbackDebugIO { + string phase = 1; + int32 lvl = 2; +} + +message CallbackCheckSmall { +} + +message CallbackCheckNaN { +} + +message CallbackCheckDataset { +} + +message CallbackHang { + int64 rank = 1; +} + +message CallbackDropFixedLearningRate { + string weights = 1; + repeated int64 drop_epoch = 2; + double amt = 3; +} + +message CallbackLinearGrowthLearningRate { + string weights = 1; + double target = 2; + int64 num_epochs = 3; + int64 delay = 4; +} + +message CallbackPolyLearningRate { + string weights = 1; + double power = 2; + uint64 num_epochs = 3; + uint64 max_iter = 4; + double end_lr = 5; +} + +message CallbackStepMinibatch { + int64 starting_mbsize = 1; + int64 step = 2; + int64 ramp_time = 3; +} + +message MinibatchScheduleStep { + int64 epoch = 1; + int64 mbsize = 2; + double lr = 3; + int64 ramp_time = 4; +} + +message CallbackOptimizerwiseAdaptiveLearningRate { + string weights = 1; + double scale = 2; +} + +message CallbackMinibatchSchedule { + int64 starting_mbsize = 1; + repeated MinibatchScheduleStep step = 2; +} + +message CallbackCheckGradients { + double step_size = 1; + bool verbose = 2; + bool error_on_failure = 3; // Throw error if gradient check fails +} + +message CallbackCheckMetric { + string metric = 1; + double lower_bound = 2; + double upper_bound = 3; + bool error_on_failure = 4; // Throw error if metric check fails + string execution_modes = 5; // Default: all modes +} + +message CallbackCheckpoint { + string checkpoint_dir = 1; + int64 checkpoint_epochs = 2; + int64 checkpoint_steps = 3; + double checkpoint_secs = 4; + string per_rank_dir = 5; + int64 ckpt_dist_epochs = 6; + int64 ckpt_dist_steps = 7; +} + + +message CallbackSaveModel { + string dir = 1; + string extension = 2; + bool disable_save_after_training = 3; +} + +message CallbackReplaceWeights { + string source_layers = 1; //set of layers to copy weights from + string destination_layers = 2; //set of layers to copy weights to + int64 batch_interval = 3; +} +message CallbackGPUMemoryUsage { +} + +message 
CallbackSyncLayers { + bool sync_gpus = 1; + bool sync_mpi = 2; + bool only_input = 3; +} + +message CallbackSyncSelected { + message LayerToSync { + enum PropDirection { + Both = 0; + Forward = 1; + Backward = 2; + } + string name = 1; // name of the layer to synchronize + PropDirection prop = 2; // propagation setep to synchronize + } + + message CudaProfilerSetup { + enum OutputMode { + KeyValuePair = 0; + CSV = 1; + } + bool no_init = 1; + string config_file = 2; + string output_dir = 3; + OutputMode output_mode = 4; + } + + bool async_gpus = 1; + bool async_mpi = 2; + repeated LayerToSync layer_to_sync = 3; + CudaProfilerSetup cuda_profiler_setup = 4; +} + +message CallbackConfusionMatrix { + string prediction = 1; // Prediction layer + string label = 2; // Label layer + string prefix = 3; // Prefix for output files +} + +message CallbackPerturbAdam { + float learning_rate_factor = 1; // Learning rate perturbation (in log space) + float beta1_factor = 2; // beta1 perturbation (in log space) + float beta2_factor = 3; // beta2 perturbation (in log space) + float eps_factor = 4; // eps perturbation (in log space) + bool perturb_during_training = 5; // Whether to periodically perturb during training + int64 batch_interval = 6; // Frequency of perturbation if perturb_during_training is true + string weights = 7; // Weights with Adam optimizer +} + +message CallbackPerturbDropout { + float keep_dropout_factor = 1; //Keep dropout prob perturbation (in log space) + string layers = 2; // dropout layers to perturb keep prob, all dropout layers by default +} + +message CallbackSaveTopKModels { + string dir = 1; //directory to save model + int32 k = 2; //number of (top) models to save + string metric = 3; //metrics to use in evaluating models + bool ascending_ordering = 4; //whether to sort metrics per model in ascending order, descending order is default +} + +message CallbackMixup { + string layers = 1; + float alpha = 2; +} diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 557f5693c46..845b76a49e9 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -1,7 +1,35 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
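
Editor's note: because callbacks.proto keeps the lbann_data package, relocating these messages changes only which generated header call sites include (as the include swaps earlier in this patch show). The oneof also gives generated C++ code a case accessor for asking which callback, if any, a message carries. A short sketch of the standard protobuf-generated API under these definitions; this is an illustration, not code from this patch:

    #include <callbacks.pb.h>  // generated from the new callbacks.proto

    // Returns true if some field of the "callback_type" oneof is set.
    bool has_a_callback(const lbann_data::Callback& cb) {
      return cb.callback_type_case() !=
             lbann_data::Callback::CALLBACK_TYPE_NOT_SET;
    }
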
+//////////////////////////////////////////////////////////////////////////////// + syntax = "proto3"; package lbann_data; +import "callbacks.proto"; + message LbannPB { DataReader data_reader = 1; Model model = 2; @@ -399,314 +427,6 @@ message SGD { bool nesterov = 4; } - -//======================================================================== -// Callbacks -//======================================================================== -message Callback { - // a Callback should contain exactly one of the following - oneof callback_type { - CallbackPrint print = 1; - CallbackTimer timer = 2; - CallbackSummary summary = 3; - CallbackDumpWeights dump_weights = 4; - CallbackDumpOutputs dump_outputs = 5; - CallbackDumpErrorSignals dump_error_signals = 35; - CallbackDumpGradients dump_gradients = 6; - CallbackDumpMBIndices dump_mb_indices = 7; - CallbackDispIOStats disp_io_stats = 8; - CallbackImComm imcomm = 9; - CallbackSaveImages save_images = 10; - CallbackDebug debug = 11; - CallbackAdaptiveLearningRate adaptive_learning_rate = 12; - CallbackStepLearningRate step_learning_rate = 13; - CallbackCustomLearningRate custom_learning_rate = 14; - CallbackCheckSmall check_small = 15; - CallbackCheckNaN check_nan = 16; - CallbackCheckDataset check_dataset = 17; - CallbackHang hang = 18; - CallbackDropFixedLearningRate drop_fixed_learning_rate = 19; - CallbackLinearGrowthLearningRate linear_growth_learning_rate = 20; - CallbackProfiler profiler = 21; - CallbackStepMinibatch step_minibatch = 22; - CallbackCheckGradients check_gradients = 23; - CallbackLTFB ltfb = 24; - CallbackDebugIO debug_io = 25; - CallbackMinibatchSchedule minibatch_schedule = 26; - CallbackOptimizerwiseAdaptiveLearningRate optimizerwise_adaptive_learning_rate = 27; - CallbackCheckpoint checkpoint = 28; - CallbackSaveModel save_model = 29; - CallbackPolyLearningRate poly_learning_rate = 30; - CallbackReplaceWeights replace_weights = 31; - CallbackGPUMemoryUsage gpu_memory_usage = 32; - CallbackSyncLayers sync_layers = 33; - CallbackSyncSelected sync_selected = 34; - CallbackConfusionMatrix confusion_matrix = 36; - CallbackCheckMetric check_metric = 37; - CallbackPerturbAdam perturb_adam = 38; - CallbackPerturbDropout perturb_dropout = 39; - CallbackSaveTopKModels save_topk_models = 40; - CallbackMixup mixup = 41; - } -} - -message CallbackLTFB { - int64 batch_interval = 1; - string metric = 2; - string weights = 3; // default: all weights - bool low_score_wins = 4; - string communication_algorithm = 5; // default: "sendrecv_weights" - bool exchange_hyperparameters = 6; -} - -message CallbackStepLearningRate { - string weights = 1; //default: all weights - int64 step = 2; - double amt = 3; -} - -message CallbackCustomLearningRate { - //don't know how to support this, since it takes an std::function as an argument -} - -message CallbackAdaptiveLearningRate { - string weights = 1; //default: all weights - int64 patience = 2; - double amt = 3; -} - -message CallbackSaveImages { - string layers = 1; // Layer outputs to save as images - string image_format = 2; // Image format (e.g. 
jpg, png, pgm) - string image_prefix = 3; // Prefix for saved image files -} - -message CallbackPrint { - int64 interval = 1; //default in lbann_callback_print.hpp is 1 - bool print_global_stat_only = 2; //useful in large scale multi-trainer, default is false -} - -message CallbackProfiler { - bool sync = 1; - bool skip_init = 2; -} - -message CallbackTimer { -} - -message CallbackSummary { - string dir = 1; //directory for the lbann_summary - int64 batch_interval = 2; //default in lbann_callback_summary.hpp is 1 - int64 mat_interval = 3; //default in lbann_callback_summary.hpp is 25 -} - -message CallbackDumpWeights { - string basename = 1; -} - -message CallbackDumpOutputs { - string layers = 1; // Default: all layers - string execution_modes = 2; // Default: all modes - int64 batch_interval = 3; // Frequency for output dumping (default: all steps) - string directory = 4; // Directory for output files - string format = 5; // Options: csv, tsv, npy, npz (default: csv) -} - -message CallbackDumpErrorSignals { - string basename = 1; -} - -message CallbackDumpGradients { - string basename = 1; - int64 interval = 2; -} - -message CallbackDumpMBIndices { - string basename = 1; - int64 interval = 2; -} - -message CallbackDispIOStats { - string layers = 1; //e.g: "2 4 5"; use "10000" to apply to all layers -} - -message CallbackImComm { - string intertrainer_comm_method = 1; - bool all_optimizers = 2; -} - -message CallbackDebug { - string phase = 1; //should be called "modes" -} - -message CallbackDebugIO { - string phase = 1; - int32 lvl = 2; -} - -message CallbackCheckSmall { -} - -message CallbackCheckNaN { -} - -message CallbackCheckDataset { -} - -message CallbackHang { - int64 rank = 1; -} - -message CallbackDropFixedLearningRate { - string weights = 1; - repeated int64 drop_epoch = 2; - double amt = 3; -} - -message CallbackLinearGrowthLearningRate { - string weights = 1; - double target = 2; - int64 num_epochs = 3; - int64 delay = 4; -} - -message CallbackPolyLearningRate { - string weights = 1; - double power = 2; - uint64 num_epochs = 3; - uint64 max_iter = 4; - double end_lr = 5; -} - -message CallbackStepMinibatch { - int64 starting_mbsize = 1; - int64 step = 2; - int64 ramp_time = 3; -} - -message MinibatchScheduleStep { - int64 epoch = 1; - int64 mbsize = 2; - double lr = 3; - int64 ramp_time = 4; -} - -message CallbackOptimizerwiseAdaptiveLearningRate { - string weights = 1; - double scale = 2; -} - -message CallbackMinibatchSchedule { - int64 starting_mbsize = 1; - repeated MinibatchScheduleStep step = 2; -} - -message CallbackCheckGradients { - double step_size = 1; - bool verbose = 2; - bool error_on_failure = 3; // Throw error if gradient check fails -} - -message CallbackCheckMetric { - string metric = 1; - double lower_bound = 2; - double upper_bound = 3; - bool error_on_failure = 4; // Throw error if metric check fails - string execution_modes = 5; // Default: all modes -} - -message CallbackCheckpoint { - string checkpoint_dir = 1; - int64 checkpoint_epochs = 2; - int64 checkpoint_steps = 3; - double checkpoint_secs = 4; - string per_rank_dir = 5; - int64 ckpt_dist_epochs = 6; - int64 ckpt_dist_steps = 7; -} - - -message CallbackSaveModel { - string dir = 1; - string extension = 2; - bool disable_save_after_training = 3; -} - -message CallbackReplaceWeights { - string source_layers = 1; //set of layers to copy weights from - string destination_layers = 2; //set of layers to copy weights to - int64 batch_interval = 3; -} -message CallbackGPUMemoryUsage { -} - -message 
CallbackSyncLayers { - bool sync_gpus = 1; - bool sync_mpi = 2; - bool only_input = 3; -} - -message CallbackSyncSelected { - message LayerToSync { - enum PropDirection { - Both = 0; - Forward = 1; - Backward = 2; - } - string name = 1; // name of the layer to synchronize - PropDirection prop = 2; // propagation setep to synchronize - } - - message CudaProfilerSetup { - enum OutputMode { - KeyValuePair = 0; - CSV = 1; - } - bool no_init = 1; - string config_file = 2; - string output_dir = 3; - OutputMode output_mode = 4; - } - - bool async_gpus = 1; - bool async_mpi = 2; - repeated LayerToSync layer_to_sync = 3; - CudaProfilerSetup cuda_profiler_setup = 4; -} - -message CallbackConfusionMatrix { - string prediction = 1; // Prediction layer - string label = 2; // Label layer - string prefix = 3; // Prefix for output files -} - -message CallbackPerturbAdam { - float learning_rate_factor = 1; // Learning rate perturbation (in log space) - float beta1_factor = 2; // beta1 perturbation (in log space) - float beta2_factor = 3; // beta2 perturbation (in log space) - float eps_factor = 4; // eps perturbation (in log space) - bool perturb_during_training = 5; // Whether to periodically perturb during training - int64 batch_interval = 6; // Frequency of perturbation if perturb_during_training is true - string weights = 7; // Weights with Adam optimizer -} - -message CallbackPerturbDropout { - float keep_dropout_factor = 1; //Keep dropout prob perturbation (in log space) - string layers = 2; // dropout layers to perturb keep prob, all dropout layers by default -} - -message CallbackSaveTopKModels { - string dir = 1; //directory to save model - int32 k = 2; //number of (top) models to save - string metric = 3; //metrics to use in evaluating models - bool ascending_ordering = 4; //whether to sort metrics per model in ascending order, descending order is default -} - -message CallbackMixup { - string layers = 1; - float alpha = 2; -} - //======================================================================== // Weights //======================================================================== From afeb88a9fdd33ab9828d0cb288b1034afa3cf465 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Sun, 21 Jul 2019 16:08:56 -0700 Subject: [PATCH 136/634] Update name for function converting strings to execution modes --- include/lbann/base.hpp | 2 +- src/base.cpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/lbann/base.hpp b/include/lbann/base.hpp index 3f5d5668648..bfe52cec140 100644 --- a/include/lbann/base.hpp +++ b/include/lbann/base.hpp @@ -154,7 +154,7 @@ inline std::string to_string(execution_mode m) { } } /** @brief Convert a string to an execution_mode. */ -execution_mode from_string(std::string const& str); +execution_mode exe_mode_from_string(std::string const& str); /** @brief Extract an execution_mode from a stream. 
*/ std::istream& operator>>(std::istream& os, execution_mode& e); diff --git a/src/base.cpp b/src/base.cpp index d8cfd74c8ce..f0d4a4bdfc9 100644 --- a/src/base.cpp +++ b/src/base.cpp @@ -108,14 +108,14 @@ std::string get_pool_mode_name(pool_mode m) { return pool_mode_names[(int)m]; } -execution_mode from_string(std::string const& str) { - if (str == "training") +execution_mode exe_mode_from_string(std::string const& str) { + if (str == "training" || str == "train") return execution_mode::training; - else if (str == "validation") + else if (str == "validation" || str == "validate") return execution_mode::validation; - else if (str == "testing") + else if (str == "testing" || str == "test") return execution_mode::testing; - else if (str == "prediction") + else if (str == "prediction" || str == "predict") return execution_mode::prediction; else if (str == "invalid") return execution_mode::invalid; @@ -126,7 +126,7 @@ execution_mode from_string(std::string const& str) { std::istream& operator>>(std::istream& is, execution_mode& m) { std::string tmp; is >> tmp; - m = from_string(tmp); + m = exe_mode_from_string(tmp); return is; } From 086a003ac10ee5d2bf12d2b74ea37ab97962bc8b Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Sun, 21 Jul 2019 16:10:20 -0700 Subject: [PATCH 137/634] fix outstanding issues with weights_list and layer_list callbacks --- include/lbann/callbacks/callback_io.hpp | 21 ++- .../callbacks/callback_learning_rate.hpp | 41 +++-- .../callbacks/callback_replace_weights.hpp | 19 ++- src/callbacks/callback_debug.cpp | 1 - src/callbacks/callback_debug_io.cpp | 19 ++- src/callbacks/callback_dump_outputs.cpp | 7 +- src/callbacks/callback_helpers.hpp | 53 +++++++ src/callbacks/callback_io.cpp | 25 ++- src/callbacks/callback_learning_rate.cpp | 148 +++++++++--------- src/callbacks/callback_replace_weights.cpp | 24 +-- 10 files changed, 219 insertions(+), 139 deletions(-) create mode 100644 src/callbacks/callback_helpers.hpp diff --git a/include/lbann/callbacks/callback_io.hpp b/include/lbann/callbacks/callback_io.hpp index fb788e2abe0..0149617693e 100644 --- a/include/lbann/callbacks/callback_io.hpp +++ b/include/lbann/callbacks/callback_io.hpp @@ -29,10 +29,13 @@ #ifndef LBANN_CALLBACKS_IO_HPP_INCLUDED #define LBANN_CALLBACKS_IO_HPP_INCLUDED -#include -#include #include "lbann/callbacks/callback.hpp" +#include + +#include +#include + namespace lbann { /** @@ -40,19 +43,23 @@ namespace lbann { */ class lbann_callback_io : public lbann_callback { public: - lbann_callback_io(); + lbann_callback_io() = default; + /** Only apply to specific layers. */ + lbann_callback_io(std::vector const& layers) + : m_layers(layers.begin(), layers.end()) {} + lbann_callback_io(const lbann_callback_io&) = default; lbann_callback_io& operator=(const lbann_callback_io&) = default; - lbann_callback_io* copy() const override { return new lbann_callback_io(*this); } - /** Only apply to specific layers. */ - lbann_callback_io(std::unordered_set layers); + lbann_callback_io* copy() const override { + return new lbann_callback_io(*this); + } /** Report how much I/O has occured per data reader */ void on_epoch_end(model *m) override; void on_test_end(model *m) override; std::string name() const override { return "io"; } private: /** Indicies of layers to monitor. 
*/ - std::unordered_set m_layer_indices; + std::unordered_set m_layers; }; // Builder function diff --git a/include/lbann/callbacks/callback_learning_rate.hpp b/include/lbann/callbacks/callback_learning_rate.hpp index badd3b3a380..43a3c57b073 100644 --- a/include/lbann/callbacks/callback_learning_rate.hpp +++ b/include/lbann/callbacks/callback_learning_rate.hpp @@ -48,7 +48,7 @@ class lbann_callback_learning_rate : public lbann_callback { lbann_callback_learning_rate& operator=( const lbann_callback_learning_rate&) = default; /** Only apply to specific weights. */ - lbann_callback_learning_rate(std::unordered_set weights_list); + lbann_callback_learning_rate(std::vector weights_names); /** Do some initialization. */ void setup(model *m) override; /** Apply global learning rate schedules. */ @@ -65,7 +65,10 @@ class lbann_callback_learning_rate : public lbann_callback { * The returned learning rate will be used to automatically update * the current global learning rate. */ - virtual float global_schedule(model *m) { return m_cur_global_lr; } + virtual float global_schedule(model *m) { + return get_current_global_learning_rate(); + } + /** * This is called at the end of every training mini-batch to update the * learning rate for optimizer opt. The current global learning rate is *not* @@ -75,16 +78,32 @@ class lbann_callback_learning_rate : public lbann_callback { return opt.get_learning_rate(); } - /** Weights to update. */ - std::unordered_set m_weights; + const std::unordered_set& get_weights() const noexcept { + return m_weights; + } - /** + static float get_current_global_learning_rate() noexcept { + return m_cur_global_lr; + } + + static void update_global_learning_rate(float rate) noexcept { + m_cur_global_lr = rate; + } + + private: + /** * This should be maintained by all learning rate schedule * implementations as the current global learning rate. This enables * coordination among different schedules, particularly ones that * work on a per-optimizer basis. */ static float m_cur_global_lr; + + /** Names of the weights being updated. */ + std::vector m_weights_names; + + /** Weights to update. */ + std::unordered_set m_weights; }; /** @@ -95,7 +114,7 @@ class lbann_callback_step_learning_rate : public lbann_callback_learning_rate { /** Decrease the learning rate by amt every step epochs. 
*/ lbann_callback_step_learning_rate(int step, float amt); lbann_callback_step_learning_rate(int step, float amt, - std::unordered_set weights_list); + std::vector weights_names); lbann_callback_step_learning_rate( const lbann_callback_step_learning_rate&) = default; lbann_callback_step_learning_rate& operator=( @@ -130,7 +149,7 @@ class lbann_callback_adaptive_learning_rate : public lbann_callback_learning_rat */ lbann_callback_adaptive_learning_rate(int64_t patience, float amt); lbann_callback_adaptive_learning_rate(int64_t patience, float amt, - std::unordered_set weights_list); + std::vector weights_names); lbann_callback_adaptive_learning_rate( const lbann_callback_adaptive_learning_rate&) = default; lbann_callback_adaptive_learning_rate& operator=( @@ -175,7 +194,7 @@ class lbann_callback_drop_fixed_learning_rate : std::vector drop_epochs, float amt); lbann_callback_drop_fixed_learning_rate( std::vector drop_epochs, float amt, - std::unordered_set weights_list); + std::vector weights_names); lbann_callback_drop_fixed_learning_rate( const lbann_callback_drop_fixed_learning_rate&) = default; lbann_callback_drop_fixed_learning_rate& operator=( @@ -220,7 +239,7 @@ class lbann_callback_linear_growth_learning_rate : float target, int64_t num_epochs, int64_t delay); lbann_callback_linear_growth_learning_rate( float target, int64_t num_epochs, int64_t delay, - std::unordered_set weights_list); + std::vector weights_names); lbann_callback_linear_growth_learning_rate( const lbann_callback_linear_growth_learning_rate&) = default; lbann_callback_linear_growth_learning_rate& operator=( @@ -259,7 +278,7 @@ class lbann_callback_poly_learning_rate : public lbann_callback_learning_rate { public: lbann_callback_poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter); lbann_callback_poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter, double endl_r, - std::unordered_set weights_list); + std::vector weights_names); lbann_callback_poly_learning_rate( const lbann_callback_poly_learning_rate&) = default; lbann_callback_poly_learning_rate& operator=( @@ -303,7 +322,7 @@ class lbann_callback_optimizerwise_adaptive_learning_rate : public lbann_callbac public: lbann_callback_optimizerwise_adaptive_learning_rate(float scale); lbann_callback_optimizerwise_adaptive_learning_rate( - float scale, std::unordered_set weights_list); + float scale, std::vector weights_names); lbann_callback_optimizerwise_adaptive_learning_rate( const lbann_callback_optimizerwise_adaptive_learning_rate&) = default; lbann_callback_optimizerwise_adaptive_learning_rate& operator=( diff --git a/include/lbann/callbacks/callback_replace_weights.hpp b/include/lbann/callbacks/callback_replace_weights.hpp index 27de4d96e97..359079de1a6 100644 --- a/include/lbann/callbacks/callback_replace_weights.hpp +++ b/include/lbann/callbacks/callback_replace_weights.hpp @@ -42,13 +42,15 @@ namespace lbann { */ class lbann_callback_replace_weights : public lbann_callback { public: - lbann_callback_replace_weights(std::vector src, - std::vector dst, int batch_interval=1) : - lbann_callback(batch_interval), - m_src_layers(std::move(src)), - m_dst_layers(std::move(dst)){ - if(m_src_layers.size() != m_dst_layers.size()) - throw lbann_exception("In replace weights callback: number of src and dest layers does not match."); + lbann_callback_replace_weights( + std::vector src, + std::vector dst, + int batch_interval=1) + : lbann_callback(batch_interval), + m_src_layer_names(std::move(src)), + m_dst_layer_names(std::move(dst)) { + 
if(m_src_layer_names.size() != m_dst_layer_names.size()) + LBANN_ERROR("In replace weights callback: number of src and dest layers does not match."); } lbann_callback_replace_weights( @@ -58,12 +60,13 @@ class lbann_callback_replace_weights : public lbann_callback { lbann_callback_replace_weights* copy() const override { return new lbann_callback_replace_weights(*this); } + void setup(model *m) override; void on_batch_end(model *m) override; std::string name() const override { return "replace weights"; } private: + std::vector m_src_layer_names, m_dst_layer_names; std::vector m_src_layers, m_dst_layers; - }; // Builder function diff --git a/src/callbacks/callback_debug.cpp b/src/callbacks/callback_debug.cpp index ef4f02e117e..25382a361d2 100644 --- a/src/callbacks/callback_debug.cpp +++ b/src/callbacks/callback_debug.cpp @@ -162,7 +162,6 @@ build_callback_debug_from_pbuf(const google::protobuf::Message& proto_msg, lbann_summary* summarizer) { const auto& params = dynamic_cast(proto_msg); - // FIXME TRB const auto& modes = parse_set(params.phase()); return make_unique(modes, summarizer); diff --git a/src/callbacks/callback_debug_io.cpp b/src/callbacks/callback_debug_io.cpp index 108574da28c..9f8dd5e2530 100644 --- a/src/callbacks/callback_debug_io.cpp +++ b/src/callbacks/callback_debug_io.cpp @@ -152,20 +152,19 @@ void lbann_callback_debug_io::on_test_begin(model *m) { } } -// FIXME TRB std::unique_ptr build_callback_debug_io_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { - const auto& params = dynamic_cast(proto_msg); - const auto& phase = params.phase(); + const auto& params = + dynamic_cast(proto_msg); + const auto& phase = exe_mode_from_string(params.phase()); const auto& lvl = params.lvl(); - if (phase == "train" || phase == "training") { - return make_unique(execution_mode::training, lvl); - } else if (phase == "validate" || phase == "validation") { - return make_unique(execution_mode::validation, lvl); - } else if (phase == "test" || phase == "testing") { - return make_unique(execution_mode::testing, lvl); - } else { + switch (phase) { + case execution_mode::training: + case execution_mode::validation: + case execution_mode::testing: + return make_unique(phase, lvl); + default: return make_unique(); } } diff --git a/src/callbacks/callback_dump_outputs.cpp b/src/callbacks/callback_dump_outputs.cpp index 6f1bec3af77..f11ac2fbb44 100644 --- a/src/callbacks/callback_dump_outputs.cpp +++ b/src/callbacks/callback_dump_outputs.cpp @@ -25,11 +25,9 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_dump_outputs.hpp" +#include "lbann/proto/proto_common.hpp" #include "lbann/utils/file_utils.hpp" -// FIXME TRB -#include "lbann/proto/factories.hpp" - #include #ifdef LBANN_HAS_CNPY @@ -180,13 +178,12 @@ void lbann_callback_dump_outputs::dump_outputs(const model& m, const Layer& l) { } -// FIXME TRB std::unique_ptr build_callback_dump_outputs_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - const auto& layer_names = parse_set<>(params.layers()); + const auto& layer_names = parse_set(params.layers()); const auto& modes = parse_set(params.execution_modes()); return make_unique(layer_names, diff --git a/src/callbacks/callback_helpers.hpp b/src/callbacks/callback_helpers.hpp new file mode 100644 index 00000000000..f4f3fc5422b --- /dev/null +++ b/src/callbacks/callback_helpers.hpp @@ -0,0 +1,53 @@ 
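The helper introduced in this new file resolves names supplied through protobuf strings into pointers; the learning-rate and replace-weights callbacks in this patch call it at setup time instead of storing raw pointers at construction. A usage sketch (layer names hypothetical, assuming a model *m in scope):

    // Sketch: resolving prototext-supplied names against a model's
    // layer list with select_things_by_name, defined below.
    std::vector<Layer*> const layers = m->get_layers();
    std::vector<std::string> const names = {"conv1", "fc2"}; // hypothetical
    std::vector<Layer*> const selected = select_things_by_name(layers, names);
    // Results follow the order of the names; an unknown name aborts
    // through LBANN_ERROR rather than being silently dropped.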
+//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/utils/exception.hpp" + +#include +#include + +namespace lbann { +namespace { +template +std::vector select_things_by_name( + std::vector const& things, + std::vector const& thing_names) { + + std::vector out_things; + for (auto const& name : thing_names) { + auto it = std::find_if( + things.begin(), things.end(), + [&name](const T* t) { return t->get_name() == name; }); + if (it != things.end()) + out_things.push_back(*it); + else + LBANN_ERROR(std::string("Requested thing \"") + name + + "\" does not exist in the list of things."); + } + return out_things; +} +}// namespace +}// namespace lbann diff --git a/src/callbacks/callback_io.cpp b/src/callbacks/callback_io.cpp index 16365c6f570..57143c64bb7 100644 --- a/src/callbacks/callback_io.cpp +++ b/src/callbacks/callback_io.cpp @@ -30,19 +30,15 @@ #include "lbann/callbacks/callback_io.hpp" #include "lbann/layers/io/input/generic_input_layer.hpp" +#include "lbann/proto/proto_common.hpp" namespace lbann { -lbann_callback_io::lbann_callback_io() : lbann_callback() {} - -lbann_callback_io::lbann_callback_io( - std::unordered_set layers) : lbann_callback(), m_layer_indices(std::move(layers)) {} - void lbann_callback_io::on_epoch_end(model *m) { lbann_comm *comm = m->get_comm(); for (Layer *layer : m->get_layers()) { - if(m_layer_indices.size() == 0 - || m_layer_indices.find(layer) != m_layer_indices.end()) { + if(m_layers.size() == 0 + || m_layers.find(layer->get_name()) != m_layers.end()) { auto *input = (generic_input_layer *) dynamic_cast (layer); if(input != nullptr) { std::cout << "Rank " << comm->get_trainer_rank() << "." << comm->get_rank_in_trainer() << " processed " @@ -57,8 +53,8 @@ void lbann_callback_io::on_epoch_end(model *m) { void lbann_callback_io::on_test_end(model *m) { lbann_comm *comm = m->get_comm(); for (Layer *layer : m->get_layers()) { - if(m_layer_indices.size() == 0 - || m_layer_indices.find(layer) != m_layer_indices.end()) { + if(m_layers.size() == 0 + || m_layers.find(layer->get_name()) != m_layers.end()) { auto *input = (generic_input_layer *) dynamic_cast (layer); if(input != nullptr) { std::cout << "Rank " << comm->get_trainer_rank() << "." 
<< comm->get_rank_in_trainer() << " processed " @@ -70,16 +66,13 @@ void lbann_callback_io::on_test_end(model *m) { } } -// FIXME TRB std::unique_ptr build_callback_disp_io_stats_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { - //const auto& params = - // dynamic_cast(proto_msg); - //auto&& l = select_from_list(params.layers(), - // layer_list); - std::unordered_set selected_layers;//(l.begin(), l.end()); - return make_unique(selected_layers); + const auto& params = + dynamic_cast(proto_msg); + return make_unique( + parse_list(params.layers())); } } // namespace lbann diff --git a/src/callbacks/callback_learning_rate.cpp b/src/callbacks/callback_learning_rate.cpp index 8849c3ab042..ab2294fa1f3 100644 --- a/src/callbacks/callback_learning_rate.cpp +++ b/src/callbacks/callback_learning_rate.cpp @@ -27,9 +27,18 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_learning_rate.hpp" +#include "lbann/proto/proto_common.hpp" + +#include "callback_helpers.hpp" + +#include +#include // std::pow +#include #include +#include +#include #include -#include // std::pow +#include namespace lbann { @@ -38,18 +47,20 @@ float lbann_callback_learning_rate::m_cur_global_lr = 0.0f; lbann_callback_learning_rate::lbann_callback_learning_rate() {} lbann_callback_learning_rate::lbann_callback_learning_rate( - std::unordered_set weights_list) : m_weights(std::move(weights_list)) {} + std::vector weights_names) + : m_weights_names(std::move(weights_names)) {} void lbann_callback_learning_rate::setup(model *m) { // Add all weights if list of weights is not initialized - std::vector weights_list(m_weights.begin(), m_weights.end()); + std::vector weights_list = + select_things_by_name(m->get_weights(), m_weights_names); if (weights_list.empty()) { weights_list = m->get_weights(); } // Remove weights that are not being optimized - m_weights.clear(); + std::unordered_set().swap(m_weights); for (weights *w : weights_list) { optimizer *opt = w->get_optimizer(); if (opt != nullptr) { @@ -73,7 +84,7 @@ void lbann_callback_learning_rate::on_epoch_end(model *m) { << "changing global learning rate to " << new_lr << " at epoch " << m->get_epoch() << std::endl; } - for (weights *w : m_weights) { + for (weights *w : this->get_weights()) { optimizer *opt = w->get_optimizer(); const float old_lr = opt->get_learning_rate(); if (old_lr != new_lr) { @@ -83,7 +94,7 @@ void lbann_callback_learning_rate::on_epoch_end(model *m) { } void lbann_callback_learning_rate::on_backward_prop_end(model *m) { - for (weights *w : m_weights) { + for (weights *w : this->get_weights()) { optimizer& opt = *w->get_optimizer(); const float old_lr = opt.get_learning_rate(); const float new_lr = optimizer_schedule(m, opt); @@ -98,25 +109,27 @@ lbann_callback_step_learning_rate::lbann_callback_step_learning_rate( lbann_callback_learning_rate(), m_step(step), m_amt(amt) {} lbann_callback_step_learning_rate::lbann_callback_step_learning_rate( - int step, float amt, std::unordered_set weights_list) : - lbann_callback_learning_rate(weights_list), m_step(step), m_amt(amt) {} + int step, float amt, std::vector weights_names) : + lbann_callback_learning_rate(std::move(weights_names)), + m_step(step), m_amt(amt) {} float lbann_callback_step_learning_rate::global_schedule(model *m) { if (m->get_epoch() % m_step == 0) { - return m_cur_global_lr * m_amt; + return get_current_global_learning_rate() * m_amt; } else { - return m_cur_global_lr; + return 
get_current_global_learning_rate(); } } lbann_callback_adaptive_learning_rate::lbann_callback_adaptive_learning_rate( int64_t patience, float amt) : lbann_callback_adaptive_learning_rate(patience, amt, - std::unordered_set()) {} + std::vector()) {} lbann_callback_adaptive_learning_rate::lbann_callback_adaptive_learning_rate( - int64_t patience, float amt, std::unordered_set weights_list) : - lbann_callback_learning_rate(weights_list), m_patience(patience), m_amt(amt) {} + int64_t patience, float amt, std::vector weights_list) : + lbann_callback_learning_rate(std::move(weights_list)), + m_patience(patience), m_amt(amt) {} float lbann_callback_adaptive_learning_rate::global_schedule(model *m) { // Determine behavior the first time this is called in an epoch @@ -143,20 +156,21 @@ float lbann_callback_adaptive_learning_rate::global_schedule(model *m) { // Adjust learning rate if needed if (m_adjust_learning_rate) { - return m_cur_global_lr * m_amt; + return get_current_global_learning_rate() * m_amt; } else { - return m_cur_global_lr; + return get_current_global_learning_rate(); } } lbann_callback_drop_fixed_learning_rate::lbann_callback_drop_fixed_learning_rate( std::vector drop_epochs, float amt) : lbann_callback_drop_fixed_learning_rate(std::move(drop_epochs), amt, - std::unordered_set()) {} + std::vector()) {} lbann_callback_drop_fixed_learning_rate::lbann_callback_drop_fixed_learning_rate( - std::vector drop_epochs, float amt, std::unordered_set weights_list) : - lbann_callback_learning_rate(weights_list), m_amt(amt), m_drop_epochs(std::move(drop_epochs)) { + std::vector drop_epochs, float amt, std::vector weights_names) : + lbann_callback_learning_rate(std::move(weights_names)), + m_amt(amt), m_drop_epochs(std::move(drop_epochs)) { // Sort in reverse order. std::sort(m_drop_epochs.rbegin(), m_drop_epochs.rend()); } @@ -170,46 +184,47 @@ float lbann_callback_drop_fixed_learning_rate::global_schedule(model* m) { // Adjust learning rate if at a drop epoch if (!m_drop_epochs.empty() && m->get_epoch() == m_drop_epochs.back()) { - return m_cur_global_lr * m_amt; + return get_current_global_learning_rate() * m_amt; } else { - return m_cur_global_lr; + return get_current_global_learning_rate(); } } lbann_callback_linear_growth_learning_rate::lbann_callback_linear_growth_learning_rate( float target, int64_t num_epochs) : lbann_callback_linear_growth_learning_rate(target, num_epochs, 0, - std::unordered_set()) {} + std::vector()) {} lbann_callback_linear_growth_learning_rate::lbann_callback_linear_growth_learning_rate( float target, int64_t num_epochs, int64_t delay) : lbann_callback_linear_growth_learning_rate(target, num_epochs, delay, - std::unordered_set()) {} + std::vector()) {} lbann_callback_linear_growth_learning_rate::lbann_callback_linear_growth_learning_rate( float target, int64_t num_epochs, int64_t delay, - std::unordered_set weights_list) : - lbann_callback_learning_rate(weights_list), m_target(target), m_inc(0), + std::vector weights_names) : + lbann_callback_learning_rate(std::move(weights_names)), + m_target(target), m_inc(0), m_num_epochs(num_epochs), m_delay(delay) {} void lbann_callback_linear_growth_learning_rate::setup(model *m) { lbann_callback_learning_rate::setup(m); // Compute the learning rate increase. - if (!m_weights.empty()) { + if (!this->get_weights().empty()) { // Assumes all optimizers have the same initial learning rate. 
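    // The increment computed below makes global_schedule() realize
    //   lr(epoch) = m_base_lr + m_inc * (epoch - m_delay)
    // for m_delay <= epoch <= m_delay + m_num_epochs: linear growth from
    // the base rate to m_target, after an initial m_delay-epoch hold.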
- m_base_lr = m_cur_global_lr; + m_base_lr = get_current_global_learning_rate(); m_inc = (m_target - m_base_lr) / m_num_epochs; } } float lbann_callback_linear_growth_learning_rate::global_schedule(model *m) { if (m->get_epoch() < m_delay) { - return m_cur_global_lr; + return get_current_global_learning_rate(); } else if (m->get_epoch() <= m_num_epochs + m_delay) { int num_left = m_num_epochs + m_delay - m->get_epoch(); return m_base_lr + m_inc*(m_num_epochs - num_left); } else { - return m_cur_global_lr; + return get_current_global_learning_rate(); } } @@ -221,14 +236,14 @@ float lbann_callback_linear_growth_learning_rate::global_schedule(model *m) { */ lbann_callback_poly_learning_rate::lbann_callback_poly_learning_rate( double p, uint64_t n_epochs, uint64_t max_iter) - : lbann_callback_learning_rate(std::unordered_set()), + : lbann_callback_learning_rate(std::vector()), m_p(p), m_num_epochs(n_epochs), m_max_iter(max_iter), m_end_lr(0.0f), m_lr(1.0f), m_last_epoch_lr(1.0f) {} lbann_callback_poly_learning_rate::lbann_callback_poly_learning_rate( - double p, uint64_t n_epochs, uint64_t max_iter, double end_lr, std::unordered_set weights_list) - : lbann_callback_learning_rate(weights_list), + double p, uint64_t n_epochs, uint64_t max_iter, double end_lr, std::vector weights_names) + : lbann_callback_learning_rate(std::move(weights_names)), m_p(p), m_num_epochs(n_epochs), m_max_iter(max_iter), m_end_lr(end_lr), m_lr(1.0f), m_last_epoch_lr(1.0f) {} @@ -250,7 +265,7 @@ void lbann_callback_poly_learning_rate::setup(model *m) { float lbann_callback_poly_learning_rate::global_schedule(model *m) { const float scale = m_lr / m_last_epoch_lr; m_last_epoch_lr = m_lr; - return (m_cur_global_lr - m_end_lr) * scale + m_end_lr; + return (get_current_global_learning_rate() - m_end_lr) * scale + m_end_lr; } /** @@ -262,17 +277,20 @@ float lbann_callback_poly_learning_rate::optimizer_schedule(model *m, optimizer m_lr = static_cast(std::pow(static_cast(m_max_iter - cur_iter)/m_max_iter, m_p)); } const float scale = m_lr / m_last_epoch_lr; - return (m_cur_global_lr - m_end_lr) * scale + m_end_lr; + return (get_current_global_learning_rate() - m_end_lr) * scale + m_end_lr; } -lbann_callback_optimizerwise_adaptive_learning_rate::lbann_callback_optimizerwise_adaptive_learning_rate( +lbann_callback_optimizerwise_adaptive_learning_rate:: +lbann_callback_optimizerwise_adaptive_learning_rate( float scale) : - lbann_callback_optimizerwise_adaptive_learning_rate(scale, - std::unordered_set()) {} + lbann_callback_optimizerwise_adaptive_learning_rate( + scale, + std::vector()) {} -lbann_callback_optimizerwise_adaptive_learning_rate::lbann_callback_optimizerwise_adaptive_learning_rate( - float scale, std::unordered_set weights_list) : - lbann_callback_learning_rate(weights_list), m_scale(scale) {} +lbann_callback_optimizerwise_adaptive_learning_rate:: +lbann_callback_optimizerwise_adaptive_learning_rate( + float scale, std::vector weights_names) : + lbann_callback_learning_rate(std::move(weights_names)), m_scale(scale) {} float lbann_callback_optimizerwise_adaptive_learning_rate::optimizer_schedule( model *m, optimizer &opt) { @@ -280,7 +298,7 @@ float lbann_callback_optimizerwise_adaptive_learning_rate::optimizer_schedule( DataType param_grad_norm = El::Nrm2(opt.get_gradient()); if (param_norm > DataType(0) && param_grad_norm > DataType(0)) { // TODO: Should incorporate weight decay, etc. here. 
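    // The branch below scales the global rate by ||w|| / ||grad(w)||
    // (times m_scale), a LARS-style layer-wise rule: weights whose
    // gradients are small relative to their magnitude take
    // proportionally larger steps, keeping the relative update size
    // roughly uniform across weights.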
- return m_cur_global_lr * m_scale * param_norm / param_grad_norm; + return get_current_global_learning_rate() * m_scale * param_norm / param_grad_norm; } else { return opt.get_learning_rate(); } @@ -292,12 +310,10 @@ build_callback_step_learning_rate_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - //auto&& w = select_from_list(params.weights(), - // weights_list); - std::unordered_set selected_weights;//(w.begin(), w.end()); - return make_unique(params.step(), - params.amt(), - selected_weights); + return make_unique( + params.step(), + params.amt(), + parse_list(params.weights())); } // FIXME TRB @@ -306,12 +322,10 @@ build_callback_adaptive_learning_rate_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - //auto&& w = select_from_list(params.weights(), - // weights_list); - std::unordered_set selected_weights;//(w.begin(), w.end()); - return make_unique(params.patience(), - params.amt(), - selected_weights); + return make_unique( + params.patience(), + params.amt(), + parse_list(params.weights())); } // FIXME TRB @@ -324,13 +338,10 @@ build_callback_drop_fixed_learning_rate_from_pbuf( for (int i = 0; i < params.drop_epoch_size(); ++i) { drop_epochs.push_back(params.drop_epoch(i)); } - //auto&& w = select_from_list(params.weights(), - // weights_list); - std::unordered_set selected_weights;//(w.begin(), w.end()); return make_unique( std::move(drop_epochs), params.amt(), - selected_weights); + parse_list(params.weights())); } // FIXME TRB @@ -341,13 +352,10 @@ build_callback_linear_growth_learning_rate_from_pbuf( using CallbackType = lbann_callback_linear_growth_learning_rate; const auto& params = dynamic_cast(proto_msg); - //auto&& w = select_from_list(params.weights(), - // weights_list); - std::unordered_set selected_weights;//(w.begin(), w.end()); return make_unique(params.target(), params.num_epochs(), params.delay(), - selected_weights); + parse_list(params.weights())); } // FIXME TRB @@ -357,10 +365,8 @@ build_callback_optimizerwise_adaptive_learning_rate_from_pbuf( using MsgType = lbann_data::CallbackOptimizerwiseAdaptiveLearningRate; using CallbackType = lbann_callback_optimizerwise_adaptive_learning_rate; const auto& params = dynamic_cast(proto_msg); - //auto&& w = select_from_list(params.weights(), - // weights_list); - std::unordered_set selected_weights;//(w.begin(), w.end()); - return make_unique(params.scale(), selected_weights); + return make_unique(params.scale(), + parse_list(params.weights())); } // FIXME TRB @@ -369,14 +375,12 @@ build_callback_poly_learning_rate_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - //auto&& w = select_from_list(params.weights(), - // weights_list); - std::unordered_set selected_weights;//(w.begin(), w.end()); - return make_unique(params.power(), - params.num_epochs(), - params.max_iter(), - params.end_lr(), - selected_weights); + return make_unique( + params.power(), + params.num_epochs(), + params.max_iter(), + params.end_lr(), + parse_list(params.weights())); } } // namespace lbann diff --git a/src/callbacks/callback_replace_weights.cpp b/src/callbacks/callback_replace_weights.cpp index 0dd9ba1f22e..5aa820aae1b 100644 --- a/src/callbacks/callback_replace_weights.cpp +++ b/src/callbacks/callback_replace_weights.cpp @@ -25,9 +25,22 @@ //////////////////////////////////////////////////////////////////////////////// #include 
"lbann/callbacks/callback_replace_weights.hpp" +#include "lbann/proto/proto_common.hpp" + +#include "callback_helpers.hpp" namespace lbann { +void lbann_callback_replace_weights::setup(model *m) { + auto const layers = m->get_layers(); + m_src_layers = select_things_by_name(layers, m_src_layer_names); + m_dst_layers = select_things_by_name(layers, m_dst_layer_names); + + // Pretend the extra storage space matters + std::vector().swap(m_src_layer_names); + std::vector().swap(m_dst_layer_names); +} + void lbann_callback_replace_weights::on_batch_end(model *m) { const auto& step = m->get_step(execution_mode::training); if(step % m_batch_interval == 0) { @@ -42,16 +55,9 @@ build_callback_replace_weights_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - /* - auto&& src_layers = select_from_list(params.source_layers(), - layer_list); - auto&& dst_layers = select_from_list(params.destination_layers(), - layer_list); - */ - std::vector src_layers, dst_layers;// FIXME TRB return make_unique( - src_layers, - dst_layers, + parse_list(params.source_layers()), + parse_list(params.destination_layers()), params.batch_interval()); } From 6bacbb5fe31be563dc96ca62085a878bf39f748f Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Sun, 21 Jul 2019 16:47:01 -0700 Subject: [PATCH 138/634] make early_stopping, check_init, and timeline callbacks constructible via protobuf and the callback factory --- include/lbann/callbacks/callback_check_init.hpp | 4 ++++ include/lbann/callbacks/callback_checknan.hpp | 2 +- include/lbann/callbacks/callback_checksmall.hpp | 2 +- .../lbann/callbacks/callback_early_stopping.hpp | 5 +++++ include/lbann/callbacks/callback_timeline.hpp | 5 +++++ src/callbacks/callback_early_stopping.cpp | 8 ++++++++ src/callbacks/callback_timeline.cpp | 8 ++++++++ src/proto/callbacks.proto | 14 ++++++++++++++ src/proto/factories/callback_factory.cpp | 6 ++++++ 9 files changed, 52 insertions(+), 2 deletions(-) diff --git a/include/lbann/callbacks/callback_check_init.hpp b/include/lbann/callbacks/callback_check_init.hpp index 6d5572379fb..6acafc01cc2 100644 --- a/include/lbann/callbacks/callback_check_init.hpp +++ b/include/lbann/callbacks/callback_check_init.hpp @@ -53,6 +53,10 @@ class lbann_callback_check_init : public lbann_callback { bool check_equal(const AbsMat& x, const AbsMat& y) const; }; +// Builder function +ADD_DEFAULT_CALLBACK_BUILDER( + lbann_callback_check_init, build_callback_check_init_from_pbuf) + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_checknan.hpp b/include/lbann/callbacks/callback_checknan.hpp index 30e90b92806..b76c4e9d382 100644 --- a/include/lbann/callbacks/callback_checknan.hpp +++ b/include/lbann/callbacks/callback_checknan.hpp @@ -63,7 +63,7 @@ class lbann_callback_checknan : public lbann_callback { // Builder function ADD_DEFAULT_CALLBACK_BUILDER( - lbann_callback_checknan, build_callback_check_nan_from_pbuf); + lbann_callback_checknan, build_callback_check_nan_from_pbuf) } // namespace lbann diff --git a/include/lbann/callbacks/callback_checksmall.hpp b/include/lbann/callbacks/callback_checksmall.hpp index 58ddf67faea..c0ea4e3203c 100644 --- a/include/lbann/callbacks/callback_checksmall.hpp +++ b/include/lbann/callbacks/callback_checksmall.hpp @@ -69,7 +69,7 @@ class lbann_callback_checksmall : public lbann_callback { // Builder function ADD_DEFAULT_CALLBACK_BUILDER( - lbann_callback_checksmall, 
build_callback_check_small_from_pbuf); + lbann_callback_checksmall, build_callback_check_small_from_pbuf) } // namespace lbann diff --git a/include/lbann/callbacks/callback_early_stopping.hpp b/include/lbann/callbacks/callback_early_stopping.hpp index e02fe4d3601..dc1050d291c 100644 --- a/include/lbann/callbacks/callback_early_stopping.hpp +++ b/include/lbann/callbacks/callback_early_stopping.hpp @@ -62,6 +62,11 @@ class lbann_callback_early_stopping : public lbann_callback { int64_t m_wait = 0; }; +// Builder function +std::unique_ptr +build_callback_early_stopping_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_timeline.hpp b/include/lbann/callbacks/callback_timeline.hpp index 8bf84dd787d..76566fc107b 100644 --- a/include/lbann/callbacks/callback_timeline.hpp +++ b/include/lbann/callbacks/callback_timeline.hpp @@ -87,6 +87,11 @@ class lbann_callback_timeline : public lbann_callback { std::unordered_map>> m_opt_times; }; +// Builder function +std::unique_ptr +build_callback_timeline_from_pbuf( + const google::protobuf::Message&, lbann_summary*); + } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED diff --git a/src/callbacks/callback_early_stopping.cpp b/src/callbacks/callback_early_stopping.cpp index d7af962290b..9b78d7904c6 100644 --- a/src/callbacks/callback_early_stopping.cpp +++ b/src/callbacks/callback_early_stopping.cpp @@ -60,4 +60,12 @@ void lbann_callback_early_stopping::on_validation_end(model *m) { } } +std::unique_ptr +build_callback_early_stopping_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.patience()); +} + } // namespace lbann diff --git a/src/callbacks/callback_timeline.cpp b/src/callbacks/callback_timeline.cpp index c4701078d29..b1eb8919a59 100644 --- a/src/callbacks/callback_timeline.cpp +++ b/src/callbacks/callback_timeline.cpp @@ -98,4 +98,12 @@ void lbann_callback_timeline::on_optimize_end(model *m, weights *w) { m_opt_times[w->get_name()].emplace_back(m_opt_start_time, end); } +std::unique_ptr +build_callback_timeline_from_pbuf( + const google::protobuf::Message& proto_msg, lbann_summary*) { + const auto& params = + dynamic_cast(proto_msg); + return make_unique(params.directory()); +} + } // namespace lbann diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index dd59ac1537e..e42f76de2b5 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -72,6 +72,9 @@ message Callback { CallbackPerturbDropout perturb_dropout = 39; CallbackSaveTopKModels save_topk_models = 40; CallbackMixup mixup = 41; + CallbackCheckInit init = 42; + CallbackEarlyStopping early_stopping = 43; + CallbackTimeline timeline = 44; } } @@ -331,3 +334,14 @@ message CallbackMixup { string layers = 1; float alpha = 2; } + +message CallbackCheckInit { +} + +message CallbackEarlyStopping { + int64 patience = 1; +} + +message CallbackTimeline { + string directory = 1; +} diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 1527e95d057..a613f8dc303 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -96,6 +96,8 @@ void register_default_builders(factory_type& factory) build_callback_check_dataset_from_pbuf); factory.register_builder("CallbackCheckGradients", build_callback_check_gradients_from_pbuf); + 
factory.register_builder("CallbackCheckInit", + build_callback_check_init_from_pbuf); factory.register_builder("CallbackCheckMetric", build_callback_check_metric_from_pbuf); factory.register_builder("CallbackCheckNaN", @@ -124,6 +126,8 @@ void register_default_builders(factory_type& factory) build_callback_dump_outputs_from_pbuf); factory.register_builder("CallbackDumpWeights", build_callback_dump_weights_from_pbuf); + factory.register_builder("CallbackEarlyStopping", + build_callback_early_stopping_from_pbuf); factory.register_builder("CallbackGPUMemoryUsage", build_callback_gpu_memory_usage_from_pbuf); factory.register_builder("CallbackHang", @@ -170,6 +174,8 @@ void register_default_builders(factory_type& factory) build_callback_sync_layers_from_pbuf); factory.register_builder("CallbackSyncSelected", build_callback_sync_selected_from_pbuf); + factory.register_builder("CallbackTimeline", + build_callback_timeline_from_pbuf); factory.register_builder("CallbackTimer", build_callback_timer_from_pbuf); } From dfc8294a409f4f9be1036f6d913bb2120b2dc278 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 22 Jul 2019 08:39:29 -0700 Subject: [PATCH 139/634] add missing file --- src/proto/callback.proto | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/proto/callback.proto diff --git a/src/proto/callback.proto b/src/proto/callback.proto new file mode 100644 index 00000000000..e69de29bb2d From edaf5ea0121c00555414274e4adae73588c045f3 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 22 Jul 2019 09:14:09 -0700 Subject: [PATCH 140/634] remove unused function --- src/proto/factories/callback_factory.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index a613f8dc303..9d6b24d5f72 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -180,11 +180,6 @@ void register_default_builders(factory_type& factory) build_callback_timer_from_pbuf); } -bool is_initialized(factory_type const& factory) -{ - return (factory.get_num_registered_builders() > 0); -} - // Manage a global factory struct factory_manager { From 5e2a4a1b0a2931447cf85b9160e107644541bf94 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Mon, 22 Jul 2019 10:28:00 -0700 Subject: [PATCH 141/634] changed "int" to "off_t" for computing shared memory segment size (so we don't over-run INT_MAX); made an exception message more informative. 
--- src/data_store/data_store_conduit.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 57915d5687c..37bca740d4a 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -1132,11 +1132,15 @@ void data_store_conduit::compute_image_offsets(std::unordered_map &size void data_store_conduit::allocate_shared_segment(std::unordered_map &sizes, std::vector> &indices) { - int size = 0; + off_t size = 0; + for (auto &&t : sizes) { size += t.second; } m_mem_seg_length = size; + if (m_world_master) { + std::cout << "size of shared memory segment: " << m_mem_seg_length << std::endl; + } //need to ensure name is unique across all data readers m_seg_name = "/our_town_" + m_reader->get_role(); @@ -1177,7 +1181,7 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map &si } int v = ftruncate(shm_fd, size); if (v != 0) { - LBANN_ERROR("ftruncate failed"); + LBANN_ERROR("ftruncate failed for size: " + std::to_string(size)); } void *m = mmap(0, size, PROT_WRITE | PROT_READ, MAP_SHARED, shm_fd, 0); if (m == MAP_FAILED) { From e71dc135140d481309ed356bff6d9fd2137db0a2 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Mon, 22 Jul 2019 10:37:42 -0700 Subject: [PATCH 142/634] Add method for getting a packed communicator. --- include/lbann/comm.hpp | 12 ++++++++++++ src/comm.cpp | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/lbann/comm.hpp b/include/lbann/comm.hpp index 2ab72fe1273..db8c93cf145 100644 --- a/include/lbann/comm.hpp +++ b/include/lbann/comm.hpp @@ -998,6 +998,16 @@ class lbann_comm { return node_comm; } + /** + * Return a communicator containing num_per_group processors. + * + * This will attempt to pack processes so that the processes in each group + * are physically close together on the system. + * + * num_per_group must evenly divide the number of processors in the world. + */ + const El::mpi::Comm& get_packed_group_comm(int num_per_group) const; + /** Return true if rank (in comm) is on the local node. */ bool is_rank_node_local(int rank, const El::mpi::Comm& comm) const { // Translating to COMM_WORLD is typically constant time. @@ -1017,6 +1027,8 @@ class lbann_comm { El::mpi::Comm intertrainer_comm; /** Communicator for every process in the same compute node. */ El::mpi::Comm node_comm; + /** Packed group communicators. */ + mutable std::unordered_map group_communicators; /** Grid for this trainer. */ Grid *grid; /** Number of trainers. */ diff --git a/src/comm.cpp b/src/comm.cpp index 24c545064ea..295bca89459 100644 --- a/src/comm.cpp +++ b/src/comm.cpp @@ -514,6 +514,28 @@ ::Al::ReductionOperator lbann_comm::mpi_op_to_al_op(El::mpi::Op op) { } #endif +const El::mpi::Comm& lbann_comm::get_packed_group_comm(int num_per_group) const { + if (group_communicators.count(num_per_group) == 0) { + // Ensure we can get an even number of groups. + if (get_procs_in_world() % num_per_group != 0) { + std::stringstream err; + err << "Cannot create a packed group comm with group size " + << num_per_group + << " out of " << get_procs_in_world() + << " processes"; + LBANN_ERROR(err.str()); + } + MPI_Comm comm; + MPI_Comm_split( + get_world_comm().GetMPIComm(), + get_rank_in_world() / (get_procs_in_world() / num_per_group), + 0, &comm); + group_communicators.emplace(num_per_group, comm); + MPI_Comm_free(&comm); // El::mpi::Comm duplicates internally. 
+ } + return group_communicators[num_per_group]; +} + void lbann_comm::lbann_comm_abort(std::string msg) { throw lbann_exception(msg); } From 9c3582364bb89a915a1fb030c8d07d5d952883ff Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Mon, 22 Jul 2019 10:38:17 -0700 Subject: [PATCH 143/634] Add support for grouped batchnorm statistics. --- .../regularizers/batch_normalization.hpp | 74 ++++++++----------- .../regularizers/batch_normalization.cpp | 53 ++++++------- .../regularizers/batch_normalization.cu | 53 ++++++------- src/proto/factories/layer_factory.cpp | 33 +++++---- src/proto/lbann.proto | 4 +- 5 files changed, 100 insertions(+), 117 deletions(-) diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index b439c0e7ded..06e150a2cfc 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -63,8 +63,12 @@ class batch_normalization_layer : public regularizer_layer { DataType m_decay; /** Small number to avoid division by zero. */ DataType m_epsilon; - /** Type of statistics aggregation to use. */ - batch_normalization_stats_aggregation m_stats_aggregation; + /** @brief Size of group to aggregate statistics over. + * + * If this is 1, the group consists of one process and aggregation + * is local. If it is 0, statistics are aggregated globally. + */ + int m_statistics_group_size; /** * Cache of node-local num_per_sum results for node-local stats. * Indexed by effective mini-batch size. @@ -101,22 +105,22 @@ class batch_normalization_layer : public regularizer_layer { * @param decay Controls the momentum of the running mean/standard * deviation averages. * @param epsilon A small number to avoid division by zero. - * @param stats_aggregation The type of statistics to use when training. + * @param statistics_group_size Number of processors to aggregate + * statistics over. Defaults to 1 (i.e. local aggregation). */ batch_normalization_layer(lbann_comm *comm, DataType decay=0.9, DataType epsilon=1e-5, - batch_normalization_stats_aggregation stats_aggregation = - batch_normalization_stats_aggregation::local) + int statistics_group_size=1) : regularizer_layer(comm), m_decay(decay), m_epsilon(epsilon), - m_stats_aggregation(stats_aggregation) { + m_statistics_group_size(statistics_group_size) { static_assert(T_layout == data_layout::DATA_PARALLEL, "batch normalization only supports DATA_PARALLEL"); #ifdef LBANN_DETERMINISTIC // Force global computation. - m_stats_aggregation = batch_normalization_stats_aggregation::global; + m_statistics_group_size = 0; #endif } @@ -124,7 +128,7 @@ class batch_normalization_layer : public regularizer_layer { : regularizer_layer(other), m_decay(other.m_decay), m_epsilon(other.m_epsilon), - m_stats_aggregation(other.m_stats_aggregation), + m_statistics_group_size(other.m_statistics_group_size), m_num_per_sum_cache(other.m_num_per_sum_cache), m_mean_and_var(other.m_mean_and_var ? 
other.m_mean_and_var->Copy() : nullptr), @@ -145,7 +149,7 @@ class batch_normalization_layer : public regularizer_layer { regularizer_layer::operator=(other); m_decay = other.m_decay; m_epsilon = other.m_epsilon; - m_stats_aggregation = other.m_stats_aggregation; + m_statistics_group_size = other.m_statistics_group_size; m_num_per_sum_cache = other.m_num_per_sum_cache; // Deep copy matrices @@ -178,17 +182,7 @@ class batch_normalization_layer : public regularizer_layer { auto&& desc = regularizer_layer::get_description(); desc.add("Decay", m_decay); desc.add("Epsilon", m_epsilon); - switch (m_stats_aggregation) { - case batch_normalization_stats_aggregation::local: - desc.add("Statistics aggregation", "local"); - break; - case batch_normalization_stats_aggregation::node_local: - desc.add("Statistics aggregation", "node-local"); - break; - case batch_normalization_stats_aggregation::global: - desc.add("Statistics aggregation", "global"); - break; - } + desc.add("Statistics group size", m_statistics_group_size); return desc; } @@ -220,38 +214,28 @@ class batch_normalization_layer : public regularizer_layer { const auto& output = get_activations(); const auto& mini_batch_size = output.Width(); const auto& local_mini_batch_size = mini_batch_size / output.DistSize(); - if (m_stats_aggregation == batch_normalization_stats_aggregation::global - && mini_batch_size <= 4) { - std::stringstream err; - err << "LBANN warning: " - << get_type() << " layer \"" << get_name() << "\" " - << "is using global statistics and " - << "the mini-batch size (" << mini_batch_size << ") " - << "may be too small to get good statistics"; + if (m_statistics_group_size == 0 && mini_batch_size <= 4) { if (output.DistRank() == 0) { + std::stringstream err; + err << "LBANN warning: " + << get_type() << " layer \"" << get_name() << "\" " + << "is using global statistics and " + << "the mini-batch size (" << mini_batch_size << ") " + << "may be too small to get good statistics"; std::cerr << err.str() << std::endl; } - } else if (m_stats_aggregation == batch_normalization_stats_aggregation::node_local - && local_mini_batch_size*m_comm->get_procs_per_node() <= 4) { - std::stringstream err; - err << "LBANN warning: " - << get_type() << " layer \"" << get_name() << "\" " - << "is using node-local statistics and " - << "the node-local mini-batch size (" - << (local_mini_batch_size*m_comm->get_procs_per_node()) << ") " - << "may be too small to get good statistics"; + } else if (m_statistics_group_size*local_mini_batch_size <= 4) { + // This possibly underestimates the aggregation size for processors with + // smaller local mini-batch sizes. 
if (output.DistRank() == 0) {
-      std::cerr << err.str() << std::endl;
-    }
-  } else if (m_stats_aggregation == batch_normalization_stats_aggregation::local
-             && local_mini_batch_size <= 4) {
-    std::stringstream err;
+      std::stringstream err;
       err << "LBANN warning: "
           << get_type() << " layer \"" << get_name() << "\" "
-        << "is using local statistics and "
-        << "the local mini-batch size (" << local_mini_batch_size << ") "
+          << "is aggregating statistics over "
+          << m_statistics_group_size
+          << " processors and the aggregated mini-batch size ("
+          << (m_statistics_group_size*local_mini_batch_size) << ") "
           << "may be too small to get good statistics";
-    if (output.DistRank() == 0) {
       std::cerr << err.str() << std::endl;
     }
   }
diff --git a/src/layers/regularizers/batch_normalization.cpp b/src/layers/regularizers/batch_normalization.cpp
index 051617e6850..2d5cd8cc95f 100644
--- a/src/layers/regularizers/batch_normalization.cpp
+++ b/src/layers/regularizers/batch_normalization.cpp
@@ -73,29 +73,27 @@ void batch_normalization_layer::fp_
     local_var(channel, 0) = sqsum;
   }
   El::Int num_per_sum;
-  switch (m_stats_aggregation) {
-  case batch_normalization_stats_aggregation::global:
-    // Allreduce on fused buffer.
+  if (m_statistics_group_size == 0) {
+    // Global statistics aggregation; allreduce on fused buffer.
     m_comm->allreduce(*m_mean_and_var,
                       m_mean_and_var->RedundantComm(),
                       El::mpi::SUM);
     num_per_sum = channel_size * width;
-    break;
-  case batch_normalization_stats_aggregation::node_local:
-    // Allreduce on fused buffer.
-    m_comm->allreduce(*m_mean_and_var, m_comm->get_node_comm(), El::mpi::SUM);
+  } else if (m_statistics_group_size == 1) {
+    // Local aggregation, no allreduce needed.
+    num_per_sum = channel_size * local_width;
+  } else {
+    // Grouped batchnorm. Allreduce on fused buffer.
+    m_comm->allreduce(*m_mean_and_var,
+                      m_comm->get_packed_group_comm(m_statistics_group_size),
+                      El::mpi::SUM);
     if (m_num_per_sum_cache.count(width) == 0) {
       num_per_sum = channel_size * local_width;
-      num_per_sum = m_comm->allreduce(num_per_sum, m_comm->get_node_comm());
+      num_per_sum = m_comm->allreduce(
+        num_per_sum, m_comm->get_packed_group_comm(m_statistics_group_size));
       m_num_per_sum_cache[width] = num_per_sum;
     } else {
       num_per_sum = m_num_per_sum_cache[width];
     }
-    break;
-  case batch_normalization_stats_aggregation::local:
-    num_per_sum = channel_size * local_width;
-    break;
-  default:
-    LBANN_ERROR("Unknown batch normalization stats aggregation");
   }

   // Compute minibatch statistics
@@ -225,15 +223,15 @@ void batch_normalization_layer::bp_

   // Accumulate gradients
   if (is_training) {
-    if (m_stats_aggregation == batch_normalization_stats_aggregation::global) {
-      // Allreduce on fused buffer.
+    if (m_statistics_group_size == 0) {
+      // Global aggregation; allreduce on fused buffer.
       m_comm->allreduce(*m_mean_and_var_gradient,
                         m_mean_and_var_gradient->RedundantComm(),
                         El::mpi::SUM);
-    } else if (m_stats_aggregation == batch_normalization_stats_aggregation::node_local) {
-      // Allreduce on fused buffer.
+    } else if (m_statistics_group_size > 1) {
+      // Grouped batchnorm; allreduce on fused buffer.
       m_comm->allreduce(*m_mean_and_var_gradient,
-                        m_comm->get_node_comm(),
+                        m_comm->get_packed_group_comm(m_statistics_group_size),
                         El::mpi::SUM);
     }
   } else {
@@ -255,18 +253,15 @@

   // Compute error signal
   El::Int num_per_sum;
-  switch (m_stats_aggregation) {
-  case batch_normalization_stats_aggregation::global:
+  if (m_statistics_group_size == 0) {
+    // Global statistics aggregation.
num_per_sum = channel_size * width; - break; - case batch_normalization_stats_aggregation::node_local: - num_per_sum = m_num_per_sum_cache[width]; // This was computed in FP. - break; - case batch_normalization_stats_aggregation::local: + } else if (m_statistics_group_size == 1) { + // Local aggregation. num_per_sum = channel_size * local_width; - break; - default: - LBANN_ERROR("Unknown batch normalization stats aggregation"); + } else { + // Grouped batchnorm. + num_per_sum = m_num_per_sum_cache[width]; // This was computed in FP. } if (num_per_sum <= 1) { El::Zero(local_gradient_wrt_input); diff --git a/src/layers/regularizers/batch_normalization.cu b/src/layers/regularizers/batch_normalization.cu index b9ab2cedbfd..02f6b6071f2 100644 --- a/src/layers/regularizers/batch_normalization.cu +++ b/src/layers/regularizers/batch_normalization.cu @@ -339,29 +339,27 @@ void batch_normalization_layer::fp_ local_mean.Buffer(), local_var.Buffer()); } El::Int num_per_sum; - switch (m_stats_aggregation) { - case batch_normalization_stats_aggregation::global: - // Allreduce on fused buffer. + if (m_statistics_group_size == 0) { + // Global statistics aggregation; allreduce on fused buffer. m_comm->allreduce(*m_mean_and_var, m_mean_and_var->RedundantComm(), El::mpi::SUM); num_per_sum = channel_size * width; - break; - case batch_normalization_stats_aggregation::node_local: - // Allreduce on fused buffer. - m_comm->allreduce(*m_mean_and_var, m_comm->get_node_comm(), El::mpi::SUM); + } else if (m_statistics_group_size == 1) { + // Local aggregation, no allreduce needed. + num_per_sum = channel_size * local_width; + } else { + // Grouped batchnorm. Allreduce on fused buffer. + m_comm->allreduce(*m_mean_and_var, + m_comm->get_packed_group_comm(m_statistics_group_size), + El::mpi::SUM); if (m_num_per_sum_cache.count(width) == 0) { num_per_sum = channel_size * local_width; - num_per_sum = m_comm->allreduce(num_per_sum, m_comm->get_node_comm()); + num_per_sum = m_comm->allreduce( + num_per_sum, m_comm->get_packed_group_comm(m_statistics_group_size)); m_num_per_sum_cache[width] = num_per_sum; } else { num_per_sum = m_num_per_sum_cache[width]; } - break; - case batch_normalization_stats_aggregation::local: - num_per_sum = channel_size * local_width; - break; - default: - LBANN_ERROR("Unknown batch normalization stats aggregation"); } // Compute minibatch statistics @@ -464,15 +462,15 @@ void batch_normalization_layer::bp_ // Accumulate gradients if (is_training) { - if (m_stats_aggregation == batch_normalization_stats_aggregation::global) { - // Allreduce on fused buffer. + if (m_statistics_group_size == 0) { + // Global aggregation; allreduce on fused buffer. m_comm->allreduce(*m_mean_and_var_gradient, m_mean_and_var_gradient->RedundantComm(), El::mpi::SUM); - } else if (m_stats_aggregation == batch_normalization_stats_aggregation::node_local) { - // Allreduce on fused buffer. + } else if (m_statistics_group_size > 1) { + // Grouped batchnorm; allreduce on fused buffer. m_comm->allreduce(*m_mean_and_var_gradient, - m_comm->get_node_comm(), + m_comm->get_packed_group_comm(m_statistics_group_size), El::mpi::SUM); } } else { @@ -494,18 +492,15 @@ void batch_normalization_layer::bp_ // Compute error signal El::Int num_per_sum; - switch (m_stats_aggregation) { - case batch_normalization_stats_aggregation::global: + if (m_statistics_group_size == 0) { + // Global statistics aggregation. 
num_per_sum = channel_size * width; - break; - case batch_normalization_stats_aggregation::node_local: - num_per_sum = m_num_per_sum_cache[width]; // This was computed in FP. - break; - case batch_normalization_stats_aggregation::local: + } else if (m_statistics_group_size == 1) { + // Local aggregation. num_per_sum = channel_size * local_width; - break; - default: - LBANN_ERROR("Unknown batch normalization stats aggregation"); + } else { + // Grouped batchnorm. + num_per_sum = m_num_per_sum_cache[width]; // This was computed in FP. } if (num_per_sum <= 1) { El::Zero(local_gradient_wrt_input); diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 7c946758719..f02fef5b1a8 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -421,19 +421,26 @@ std::unique_ptr construct_layer( if (proto_layer.has_batch_normalization()) { const auto& params = proto_layer.batch_normalization(); if (Layout == data_layout::DATA_PARALLEL) { + int statistics_group_size = params.statistics_group_size(); + if (params.global_statistics()) { + statistics_group_size = 0; + } else if (statistics_group_size == 0) { + statistics_group_size = 1; // Default to local. + } const auto& aggr_str = params.stats_aggregation(); - batch_normalization_stats_aggregation aggr = - batch_normalization_stats_aggregation::local; - if (aggr_str == "local" || aggr_str.empty()) { - aggr = batch_normalization_stats_aggregation::local; - } else if (aggr_str == "node_local") { - aggr = batch_normalization_stats_aggregation::node_local; - } else if (aggr_str == "global") { - aggr = batch_normalization_stats_aggregation::global; - } else { - err << "Invalid batch normalization stats aggregation " << aggr_str; - LBANN_ERROR(err.str()); - return nullptr; + if (!aggr_str.empty()) { + LBANN_WARNING("stats_aggregation field for BatchNormalization is deprecated"); + if (aggr_str == "local") { + statistics_group_size = 1; + } else if (aggr_str == "node_local") { + statistics_group_size = comm->get_procs_per_node(); + } else if (aggr_str == "global") { + statistics_group_size = 0; + } else { + err << "Invalid batch normalization stats aggregation " << aggr_str; + LBANN_ERROR(err.str()); + return nullptr; + } } // Set defaults if not given. auto decay = params.decay(); @@ -448,7 +455,7 @@ std::unique_ptr construct_layer( comm, decay, epsilon, - aggr); + statistics_group_size); } else { LBANN_ERROR("batch normalization layer is only supported with " "a data-parallel layout"); diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 25478afb88c..78559cd7d25 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -1060,7 +1060,9 @@ message BatchNormalization { double scale_init = 2; //default: 1.0 double bias_init = 3; //default: 0.0 double epsilon = 4; //default: 1e-5 - string stats_aggregation = 5; // default: local + string stats_aggregation = 5; // default: local; deprecated + int64 statistics_group_size = 6; // default: 1 (local) + bool global_statistics = 7; } message SeluDropout { From 7fd90fb1b735a8d07fe4a906af5916102f6af54d Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Mon, 22 Jul 2019 10:38:43 -0700 Subject: [PATCH 144/634] Update Python interface. 
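
The Python front end now takes an integer bn_statistics_group_size in
place of the old bn_stats_aggregation string, mirroring the C++ change:
'local' corresponds to a group size of 1 (the default), 'global' to 0,
and 'node_local' to the number of processes per node. A minimal,
illustrative sketch of the new interface (not part of this patch; it
assumes an lbann Python package exposing the layers and models touched
below, and x stands in for any upstream layer):

    import lbann

    # Aggregate batchnorm statistics over groups of 2 processes;
    # statistics_group_size=1 keeps statistics local, 0 makes them global.
    bn = lbann.BatchNormalization(x, statistics_group_size=2)

    # Models forward the same knob through their constructors:
    resnet = lbann.models.ResNet50(1000, bn_statistics_group_size=2)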
---
 model_zoo/vision/resnet.py | 29 +++++--
 python/lbann/contrib/models/wide_resnet.py | 8 +-
 python/lbann/models/resnet.py | 91 +++++++++++-----------
 3 files changed, 73 insertions(+), 55 deletions(-)

diff --git a/model_zoo/vision/resnet.py b/model_zoo/vision/resnet.py
index 463cb019482..964ad3bb60c 100755
--- a/model_zoo/vision/resnet.py
+++ b/model_zoo/vision/resnet.py
@@ -38,9 +38,13 @@
     '--block-channels', action='store', default=None, type=str,
     help='Internal channels in each ResNet block (comma-separated list)')
 parser.add_argument(
-    '--bn-stats-aggregation', action='store', default='local', type=str,
+    '--bn-stats-aggregation', action='store', type=str,
     help=('aggregation mode for batch normalization statistics '
-          '(default: "local")'))
+          '(default: "local") (DEPRECATED)'))
+parser.add_argument(
+    '--bn-statistics-group-size', action='store', default=1, type=int,
+    help=('Group size for aggregating batch normalization statistics '
+          '(default: 1)'))
 parser.add_argument(
     '--warmup', action='store_true', help='use a linear warmup')
 parser.add_argument(
@@ -70,6 +74,19 @@
 # hardcoded to 1000 labels for ImageNet.
 imagenet_labels = 1000

+# Handle old-style batchnorm aggregation.
+if args.bn_stats_aggregation is not None:
+    print('--bn-stats-aggregation is deprecated, use --bn-statistics-group-size')
+    if args.bn_stats_aggregation == 'local':
+        args.bn_statistics_group_size = 1
+    elif args.bn_stats_aggregation == 'node_local':
+        raise RuntimeError('Cannot translate node_local stats aggregation')
+    elif args.bn_stats_aggregation == 'global':
+        args.bn_statistics_group_size = 0
+    else:
+        raise RuntimeError('Unknown stats aggregation ' +
+                           args.bn_stats_aggregation)
+
 # Choose ResNet variant
 resnet_variant_dict = {18: lbann.models.ResNet18,
                        34: lbann.models.ResNet34,
@@ -93,24 +110,24 @@
         list(map(int, args.blocks.split(','))),
         list(map(int, args.block_channels.split(','))),
         zero_init_residual=True,
-        bn_stats_aggregation=args.bn_stats_aggregation,
+        bn_statistics_group_size=args.bn_statistics_group_size,
         name='custom_resnet',
         width=args.width)
 elif args.width == 1:
     # Vanilla ResNet.
     resnet = resnet_variant_dict[args.resnet](
         imagenet_labels,
-        bn_stats_aggregation=args.bn_stats_aggregation)
+        bn_statistics_group_size=args.bn_statistics_group_size)
 elif args.width == 2 and args.resnet == 50:
     # Use pre-defined WRN-50-2.
     resnet = wide_resnet_variant_dict[args.resnet](
         imagenet_labels,
-        bn_stats_aggregation=args.bn_stats_aggregation)
+        bn_statistics_group_size=args.bn_statistics_group_size)
 else:
     # Some other Wide ResNet.
     resnet = resnet_variant_dict[args.resnet](
         imagenet_labels,
-        bn_stats_aggregation=args.bn_stats_aggregation,
+        bn_statistics_group_size=args.bn_statistics_group_size,
         width=args.width)

 # Construct layer graph
diff --git a/python/lbann/contrib/models/wide_resnet.py b/python/lbann/contrib/models/wide_resnet.py
index 349c32c6883..392087625a4 100644
--- a/python/lbann/contrib/models/wide_resnet.py
+++ b/python/lbann/contrib/models/wide_resnet.py
@@ -13,7 +13,7 @@ class WideResNet50_2(lbann.models.resnet.ResNet):

     def __init__(self, output_size,
                  zero_init_residual=True,
-                 bn_stats_aggregation='local',
+                 bn_statistics_group_size=1,
                  name=None):
         """Initialize WRN-50-2.

@@ -22,8 +22,8 @@ def __init__(self, output_size,
             zero_init_residual (bool, optional): Whether to initialize
                 the final batch normalization in residual branches
                 with zeros.
-            bn_stats_aggregation (str, optional): Aggregation mode for
-                batch normalization statistics.
+ bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. name (str, optional): Module name. (default: 'wide_resnet50_module') @@ -33,5 +33,5 @@ def __init__(self, output_size, WideResNet50_2.global_count) super().__init__(lbann.models.resnet.BottleneckBlock, output_size, (3,4,6,3), (64,128,256,512), - zero_init_residual, bn_stats_aggregation, name, + zero_init_residual, bn_statistics_group_size, name, width=2) diff --git a/python/lbann/models/resnet.py b/python/lbann/models/resnet.py index 3becd020348..997a5a91dd7 100644 --- a/python/lbann/models/resnet.py +++ b/python/lbann/models/resnet.py @@ -13,7 +13,7 @@ class ConvBNRelu(lbann.modules.Module): """ def __init__(self, out_channels, kernel_size, stride, padding, - bn_zero_init, bn_stats_aggregation, + bn_zero_init, bn_statistics_group_size, relu, name): """Initialize ConvBNRelu module. @@ -25,8 +25,8 @@ def __init__(self, out_channels, kernel_size, stride, padding, padding (int): Convolution padding. bn_zero_init (bool): Zero-initialize batch normalization scale. - bn_stats_aggregation (str): Aggregation mode for batch - normalization statistics. + bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. relu (bool): Apply ReLU activation. name (str): Module name. @@ -51,7 +51,7 @@ def __init__(self, out_channels, kernel_size, stride, padding, initializer=lbann.ConstantInitializer(value=0.0), name=self.name + '_bn_bias') self.bn_weights = [bn_scale, bn_bias] - self.bn_stats_aggregation = bn_stats_aggregation + self.bn_statistics_group_size = bn_statistics_group_size # Initialize ReLU self.relu = relu @@ -61,7 +61,8 @@ def forward(self, x): conv = self.conv(x) bn = lbann.BatchNormalization( conv, weights=self.bn_weights, - stats_aggregation=self.bn_stats_aggregation, + statistics_group_size=self.bn_statistics_group_size, + global_statistics=True if self.bn_statistics_group_size == 0 else None, name='{0}_bn_instance{1}'.format(self.name,self.instance)) if self.relu: return lbann.Relu( @@ -80,7 +81,7 @@ class BasicBlock(lbann.modules.Module): def __init__(self, in_channels, mid_channels, downsample, zero_init_residual, - bn_stats_aggregation, name, width=1): + bn_statistics_group_size, name, width=1): """Initialize residual block. Args: @@ -90,8 +91,8 @@ def __init__(self, in_channels, mid_channels, factor of 2 in each spatial dimension). zero_init_residual (bool): Zero-initialize the scale in the final batch normalization in the residual branch. - bn_stats_aggregation (str): Aggregation mode for batch - normalization statistics. + bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. name (str): Module name. width (float, optional): Width growth factor for 3x3 convolutions. 
@@ -106,11 +107,11 @@ def __init__(self, in_channels, mid_channels, # Skip connection if downsample: self.branch1 = ConvBNRelu(self.out_channels, 1, 2, 0, - False, bn_stats_aggregation, + False, bn_statistics_group_size, False, self.name + '_branch1') elif in_channels != self.out_channels: self.branch1 = ConvBNRelu(self.out_channels, 1, 1, 0, - False, bn_stats_aggregation, + False, bn_statistics_group_size, False, self.name + '_branch1') else: self.branch1 = None @@ -118,11 +119,11 @@ def __init__(self, in_channels, mid_channels, # Residual branch self.branch2a = ConvBNRelu(mid_channels, 3, (2 if downsample else 1), 1, - False, bn_stats_aggregation, + False, bn_statistics_group_size, True, self.name + '_branch2a') self.branch2b = ConvBNRelu(self.out_channels, 3, 1, 1, zero_init_residual, - bn_stats_aggregation, + bn_statistics_group_size, False, self.name + '_branch2b') def forward(self, x): @@ -144,7 +145,7 @@ class BottleneckBlock(lbann.modules.Module): def __init__(self, in_channels, mid_channels, downsample, zero_init_residual, - bn_stats_aggregation, name, width=1): + bn_statistics_group_size, name, width=1): """Initialize residual block. Args: @@ -154,8 +155,8 @@ def __init__(self, in_channels, mid_channels, factor of 2 in each spatial dimension). zero_init_residual (bool): Zero-initialize the scale in the final batch normalization in the residual branch. - bn_stats_aggregation (str): Aggregation mode for batch - normalization statistics. + bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. name (str): Module name. width (float, optional): Width growth factor for 3x3 convolutions. @@ -171,26 +172,26 @@ def __init__(self, in_channels, mid_channels, # Skip connection if downsample: self.branch1 = ConvBNRelu(self.out_channels, 1, 2, 0, - False, bn_stats_aggregation, + False, bn_statistics_group_size, False, self.name + '_branch1') elif in_channels != self.out_channels: self.branch1 = ConvBNRelu(self.out_channels, 1, 1, 0, - False, bn_stats_aggregation, + False, bn_statistics_group_size, False, self.name + '_branch1') else: self.branch1 = None # Residual branch self.branch2a = ConvBNRelu(mid_channels, 1, 1, 0, - False, bn_stats_aggregation, + False, bn_statistics_group_size, True, self.name + '_branch2a') self.branch2b = ConvBNRelu(mid_channels, 3, (2 if downsample else 1), 1, - False, bn_stats_aggregation, + False, bn_statistics_group_size, True, self.name + '_branch2b') self.branch2c = ConvBNRelu(self.out_channels, 1, 1, 0, zero_init_residual, - bn_stats_aggregation, + bn_statistics_group_size, False, self.name + '_branch2c') def forward(self, x): @@ -228,7 +229,7 @@ class ResNet(lbann.modules.Module): def __init__(self, block, output_size, layer_sizes, layer_channels, - zero_init_residual, bn_stats_aggregation, + zero_init_residual, bn_statistics_group_size, name, width=1): """Initialize ResNet. @@ -242,8 +243,8 @@ def __init__(self, block, output_size, internal channels in each ResNet layer. zero_init_residual (bool): Whether to initialize the final batch normalization in residual branches with zeros. - bn_stats_aggregation (str): Aggregation mode for batch - normalization statistics. + bn_statistics_group_size (int): Group size for aggregating + batch normalization statistics. name (str): Module name. width (float, optional): Width growth factor. 
@@ -252,7 +253,7 @@ def __init__(self, block, output_size, self.name = name self.instance = 0 self.conv1 = ConvBNRelu(layer_channels[0], 7, 2, 3, - False, bn_stats_aggregation, + False, bn_statistics_group_size, True, self.name + '_conv1') self.blocks = [] for layer in range(len(layer_sizes)): @@ -264,7 +265,7 @@ def __init__(self, block, output_size, downsample = (i == 0 and layer > 0) b = block(in_channels, mid_channels, downsample, zero_init_residual, - bn_stats_aggregation, + bn_statistics_group_size, '{0}_layer{1}_block{2}'.format(self.name, layer, i), width=width) self.blocks.append(b) @@ -300,7 +301,7 @@ class ResNet18(ResNet): def __init__(self, output_size, zero_init_residual=True, - bn_stats_aggregation='local', + bn_statistics_group_size=1, name=None, width=1): """Initialize ResNet-18. @@ -309,8 +310,8 @@ def __init__(self, output_size, zero_init_residual (bool, optional): Whether to initialize the final batch normalization in residual branches with zeros. - bn_stats_aggregation (str, optional): Aggregation mode for - batch normalization statistics. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. name (str, optional): Module name (default: 'resnet18_module') width (float, optional): Width growth factor. @@ -321,7 +322,7 @@ def __init__(self, output_size, name = 'resnet18_module{0}'.format(ResNet18.global_count) super().__init__(BasicBlock, output_size, (2,2,2,2), (64,128,256,512), - zero_init_residual, bn_stats_aggregation, + zero_init_residual, bn_statistics_group_size, name, width=width) class ResNet34(ResNet): @@ -341,7 +342,7 @@ class ResNet34(ResNet): def __init__(self, output_size, zero_init_residual=True, - bn_stats_aggregation='local', + bn_statistics_group_size=1, name=None, width=1): """Initialize ResNet-34. @@ -350,8 +351,8 @@ def __init__(self, output_size, zero_init_residual (bool, optional): Whether to initialize the final batch normalization in residual branches with zeros. - bn_stats_aggregation (str, optional): Aggregation mode for - batch normalization statistics. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. name (str, optional): Module name (default: 'resnet34_module') width (float, optional): Width growth factor. @@ -362,7 +363,7 @@ def __init__(self, output_size, name = 'resnet34_module{0}'.format(ResNet34.global_count) super().__init__(BasicBlock, output_size, (3,4,6,3), (64,128,256,512), - zero_init_residual, bn_stats_aggregation, + zero_init_residual, bn_statistics_group_size, name, width=width) class ResNet50(ResNet): @@ -382,7 +383,7 @@ class ResNet50(ResNet): def __init__(self, output_size, zero_init_residual=True, - bn_stats_aggregation='local', + bn_statistics_group_size=1, name=None, width=1): """Initialize ResNet-50. @@ -391,8 +392,8 @@ def __init__(self, output_size, zero_init_residual (bool, optional): Whether to initialize the final batch normalization in residual branches with zeros. - bn_stats_aggregation (str, optional): Aggregation mode for - batch normalization statistics. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. name (str, optional): Module name (default: 'resnet50_module') width (float, optional): Width growth factor. 
@@ -403,7 +404,7 @@ def __init__(self, output_size, name = 'resnet50_module{0}'.format(ResNet50.global_count) super().__init__(BottleneckBlock, output_size, (3,4,6,3), (64,128,256,512), - zero_init_residual, bn_stats_aggregation, + zero_init_residual, bn_statistics_group_size, name, width=width) class ResNet101(ResNet): @@ -423,7 +424,7 @@ class ResNet101(ResNet): def __init__(self, output_size, zero_init_residual=True, - bn_stats_aggregation='local', + bn_statistics_group_size=1, name=None, width=1): """Initialize ResNet-101. @@ -432,8 +433,8 @@ def __init__(self, output_size, zero_init_residual (bool, optional): Whether to initialize the final batch normalization in residual branches with zeros. - bn_stats_aggregation (str, optional): Aggregation mode for - batch normalization statistics. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. name (str, optional): Module name (default: 'resnet101_module') width (float, optional): Width growth factor. @@ -444,7 +445,7 @@ def __init__(self, output_size, name = 'resnet101_module{0}'.format(ResNet101.global_count) super().__init__(BottleneckBlock, output_size, (3,4,23,3), (64,128,256,512), - zero_init_residual, bn_stats_aggregation, + zero_init_residual, bn_statistics_group_size, name, width=width) class ResNet152(ResNet): @@ -464,7 +465,7 @@ class ResNet152(ResNet): def __init__(self, output_size, zero_init_residual=True, - bn_stats_aggregation='local', + bn_statistics_group_size=1, name=None, width=1): """Initialize ResNet-152. @@ -473,8 +474,8 @@ def __init__(self, output_size, zero_init_residual (bool, optional): Whether to initialize the final batch normalization in residual branches with zeros. - bn_stats_aggregation (str, optional): Aggregation mode for - batch normalization statistics. + bn_statistics_group_size (str, optional): Group size for + aggregating batch normalization statistics. name (str, optional): Module name (default: 'resnet152_module') width (float, optional): Width growth factor. @@ -485,5 +486,5 @@ def __init__(self, output_size, name = 'resnet152_module{0}'.format(ResNet152.global_count) super().__init__(BottleneckBlock, output_size, (3,8,36,3), (64,128,256,512), - zero_init_residual, bn_stats_aggregation, + zero_init_residual, bn_statistics_group_size, name, width=width) From 32b702e721aa997ccc93d87cda4e5971857a8ace Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Mon, 22 Jul 2019 11:44:52 -0700 Subject: [PATCH 145/634] rename all callbacks to not have redundant prefixes --- include/lbann/callbacks/callback.hpp | 26 +-- .../callbacks/callback_check_dataset.hpp | 28 +-- .../callbacks/callback_check_gradients.hpp | 18 +- .../lbann/callbacks/callback_check_init.hpp | 22 +-- .../lbann/callbacks/callback_check_metric.hpp | 30 ++-- include/lbann/callbacks/callback_checknan.hpp | 34 ++-- .../lbann/callbacks/callback_checkpoint.hpp | 37 ++-- .../lbann/callbacks/callback_checksmall.hpp | 34 ++-- .../callbacks/callback_confusion_matrix.hpp | 18 +- include/lbann/callbacks/callback_debug.hpp | 34 ++-- include/lbann/callbacks/callback_debug_io.hpp | 36 ++-- .../callbacks/callback_dump_error_signals.hpp | 18 +- .../callbacks/callback_dump_gradients.hpp | 30 ++-- ...callback_dump_minibatch_sample_indices.hpp | 32 ++-- .../lbann/callbacks/callback_dump_outputs.hpp | 26 +-- .../lbann/callbacks/callback_dump_weights.hpp | 30 ++-- .../callbacks/callback_early_stopping.hpp | 22 +-- .../callbacks/callback_gpu_memory_usage.hpp | 23 +-- include/lbann/callbacks/callback_hang.hpp | 20 ++- include/lbann/callbacks/callback_imcomm.hpp | 28 +-- include/lbann/callbacks/callback_io.hpp | 24 +-- .../callbacks/callback_learning_rate.hpp | 160 +++++++++--------- include/lbann/callbacks/callback_ltfb.hpp | 16 +- include/lbann/callbacks/callback_mixup.hpp | 12 +- .../lbann/callbacks/callback_perturb_adam.hpp | 12 +- .../callbacks/callback_perturb_dropout.hpp | 12 +- include/lbann/callbacks/callback_print.hpp | 22 +-- .../callbacks/callback_replace_weights.hpp | 26 +-- .../lbann/callbacks/callback_save_images.hpp | 20 ++- .../lbann/callbacks/callback_save_model.hpp | 45 +++-- .../callbacks/callback_save_topk_models.hpp | 22 +-- include/lbann/callbacks/callback_summary.hpp | 24 +-- .../lbann/callbacks/callback_sync_layers.hpp | 28 +-- .../callbacks/callback_sync_selected.hpp | 34 ++-- include/lbann/callbacks/callback_timeline.hpp | 32 ++-- include/lbann/callbacks/callback_timer.hpp | 20 ++- .../callbacks/callback_variable_minibatch.hpp | 54 +++--- include/lbann/callbacks/profiler.hpp | 22 +-- .../layers/io/input/generic_input_layer.hpp | 2 +- include/lbann/layers/layer.hpp | 9 +- include/lbann/models/model.hpp | 8 +- include/lbann/optimizers/adam.hpp | 5 +- include/lbann/proto/factories.hpp | 2 +- model_zoo/lbann2.cpp | 3 +- model_zoo/lbann_inf.cpp | 9 +- src/callbacks/callback_check_dataset.cpp | 18 +- src/callbacks/callback_check_gradients.cpp | 14 +- src/callbacks/callback_check_init.cpp | 10 +- src/callbacks/callback_check_metric.cpp | 14 +- src/callbacks/callback_checknan.cpp | 12 +- src/callbacks/callback_checkpoint.cpp | 32 ++-- src/callbacks/callback_checksmall.cpp | 14 +- src/callbacks/callback_confusion_matrix.cpp | 36 ++-- src/callbacks/callback_debug.cpp | 32 ++-- src/callbacks/callback_debug_io.cpp | 28 +-- src/callbacks/callback_dump_error_signals.cpp | 12 +- src/callbacks/callback_dump_gradients.cpp | 14 +- ...callback_dump_minibatch_sample_indices.cpp | 19 ++- src/callbacks/callback_dump_outputs.cpp | 22 +-- src/callbacks/callback_dump_weights.cpp | 22 +-- src/callbacks/callback_early_stopping.cpp | 16 +- src/callbacks/callback_gpu_memory_usage.cpp | 6 +- src/callbacks/callback_hang.cpp | 12 +- src/callbacks/callback_helpers.hpp | 4 +- src/callbacks/callback_imcomm.cpp | 38 +++-- src/callbacks/callback_io.cpp | 16 +- src/callbacks/callback_learning_rate.cpp | 130 +++++++------- src/callbacks/callback_ltfb.cpp | 34 ++-- src/callbacks/callback_mixup.cpp | 9 
+-
 src/callbacks/callback_perturb_adam.cpp | 20 ++-
 src/callbacks/callback_perturb_dropout.cpp | 18 +-
 src/callbacks/callback_print.cpp | 24 +--
 src/callbacks/callback_replace_weights.cpp | 14 +-
 src/callbacks/callback_save_images.cpp | 16 +-
 src/callbacks/callback_save_model.cpp | 31 ++--
 src/callbacks/callback_save_topk_models.cpp | 18 +-
 src/callbacks/callback_summary.cpp | 28 +--
 src/callbacks/callback_sync_layers.cpp | 16 +-
 src/callbacks/callback_sync_selected.cpp | 74 ++++----
 src/callbacks/callback_timeline.cpp | 26 +--
 src/callbacks/callback_timer.cpp | 18 +-
 src/callbacks/callback_variable_minibatch.cpp | 40 ++---
 src/callbacks/profiler.cpp | 70 ++++----
 src/models/model.cpp | 9 +-
 src/proto/factories/callback_factory.cpp | 93 +++++-----
 src/proto/factories/model_factory.cpp | 2 +-
 src/utils/lbann_library.cpp | 2 +-
 87 files changed, 1211 insertions(+), 1041 deletions(-)

diff --git a/include/lbann/callbacks/callback.hpp b/include/lbann/callbacks/callback.hpp
index 3392d352915..6370bdd2092 100644
--- a/include/lbann/callbacks/callback.hpp
+++ b/include/lbann/callbacks/callback.hpp
@@ -23,11 +23,11 @@
 // implied. See the License for the specific language governing
 // permissions and limitations under the license.
 //
-// lbann_callback .hpp - Base class for LBANN callbacks
+// callback .hpp - Base class for LBANN callbacks
 ////////////////////////////////////////////////////////////////////////////////

-#ifndef __LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED
-#define __LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED
+#ifndef LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED
+#define LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED

 #include "lbann/base.hpp"
 #include "lbann/utils/summary.hpp"
@@ -37,14 +37,14 @@
 // A utility macro for easily adding default-constructed sub-class
 // builders.
 #define ADD_DEFAULT_CALLBACK_BUILDER(Class, FunctionName) \
-  inline std::unique_ptr<lbann_callback> FunctionName( \
+  inline std::unique_ptr<callback_base> FunctionName( \
     const google::protobuf::Message&, lbann_summary*) { \
     return make_unique<Class>(); \
   }

 namespace lbann {

-/** @class lbann_callback
+/** @class callback_base
  *  @brief Base class for callbacks during training/testing.
  *
  *  The method of each callback is called at a given point during
  *  training or testing by the model. Implement whichever ones you
  *  care about. Callbacks may be passed a lbann_summary instance,
  *  which they can use to log any relevant information.
  */
-class lbann_callback {
+class callback_base {
 public:

   /** @name Constructors and destructor */
   ///@{

   /** @brief Initialize a callback with an optional batch interval and
    *         summarizer.
    */
-  lbann_callback(int batch_interval = 1,
+  callback_base(int batch_interval = 1,
                 lbann_summary *summarizer = nullptr) :
     m_batch_interval(std::max(batch_interval, 1)),
     m_summarizer(summarizer) {}
-  lbann_callback(const lbann_callback&) = default;
-  virtual ~lbann_callback() {}
+  callback_base(const callback_base&) = default;
+  virtual ~callback_base() {}

   ///@}
   /** @name Polymorphic copy */
   ///@{

-  virtual lbann_callback* copy() const = 0;
+  virtual callback_base* copy() const = 0;

   ///@}
   /** @name Modifiers */
@@ -182,10 +182,10 @@
    *
    *  Performs a shallow (pointer) copy of the summarizer.
    */
-  lbann_callback& operator=(const lbann_callback&) = default;
+  callback_base& operator=(const callback_base&) = default;

 protected:
-  /** @todo Make lbann_callback data private */
+  /** @todo Make callback data private */

   /** @brief Batch methods should run once every this many steps.
*/ int m_batch_interval; @@ -195,4 +195,4 @@ class lbann_callback { } // namespace lbann -#endif // __LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED +#endif // LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_dataset.hpp b/include/lbann/callbacks/callback_check_dataset.hpp index d8b513eb9bf..34e34f30e37 100644 --- a/include/lbann/callbacks/callback_check_dataset.hpp +++ b/include/lbann/callbacks/callback_check_dataset.hpp @@ -31,24 +31,25 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * Save the sample indices for each mini-batch to ordered set. * Check to make sure that all samples were properly processed. */ -class lbann_callback_check_dataset : public lbann_callback { +class check_dataset : public callback_base { public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_evaluate_forward_prop_end; + using callback_base::on_forward_prop_end; + using callback_base::on_evaluate_forward_prop_end; - lbann_callback_check_dataset() : - lbann_callback() {} - lbann_callback_check_dataset( - const lbann_callback_check_dataset&) = default; - lbann_callback_check_dataset& operator=( - const lbann_callback_check_dataset&) = default; - lbann_callback_check_dataset* copy() const override { - return new lbann_callback_check_dataset(*this); + check_dataset() : + callback_base() {} + check_dataset( + const check_dataset&) = default; + check_dataset& operator=( + const check_dataset&) = default; + check_dataset* copy() const override { + return new check_dataset(*this); } void on_forward_prop_end(model *m, Layer *l) override; void on_evaluate_forward_prop_end(model *m, Layer *l) override; @@ -70,8 +71,9 @@ class lbann_callback_check_dataset : public lbann_callback { // Builder function ADD_DEFAULT_CALLBACK_BUILDER( - lbann_callback_check_dataset, build_callback_check_dataset_from_pbuf); + check_dataset, build_check_dataset_callback_from_pbuf); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECK_DATASET_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_gradients.hpp b/include/lbann/callbacks/callback_check_gradients.hpp index 1a46d7ea986..a76d81183f8 100644 --- a/include/lbann/callbacks/callback_check_gradients.hpp +++ b/include/lbann/callbacks/callback_check_gradients.hpp @@ -30,6 +30,7 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** @brief Gradient checking callback. * @@ -40,7 +41,7 @@ namespace lbann { * derivative computed during backprop, the gradient check has * failed. */ -class lbann_callback_check_gradients : public lbann_callback { +class check_gradients : public callback_base { public: /** @@ -53,11 +54,11 @@ class lbann_callback_check_gradients : public lbann_callback { * @param error_on_failure Whether to throw an exception for * large gradient errors. 
*/ - lbann_callback_check_gradients(DataType step_size = DataType(0), + check_gradients(DataType step_size = DataType(0), bool verbose = false, bool error_on_failure = false); - lbann_callback_check_gradients* copy() const override { - return new lbann_callback_check_gradients(*this); + check_gradients* copy() const override { + return new check_gradients(*this); } void on_test_end(model *m) override; std::string name() const override { return "check gradients"; } @@ -74,10 +75,11 @@ class lbann_callback_check_gradients : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_check_gradients_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*); +std::unique_ptr +build_check_gradients_callback_from_pbuf( + const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECK_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_init.hpp b/include/lbann/callbacks/callback_check_init.hpp index 6acafc01cc2..2b670f79390 100644 --- a/include/lbann/callbacks/callback_check_init.hpp +++ b/include/lbann/callbacks/callback_check_init.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_check_init .hpp .cpp - Check multi-model init +// check_init .hpp .cpp - Check multi-model init //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED @@ -32,18 +32,19 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * Verify that every model uses the same initialization. */ -class lbann_callback_check_init : public lbann_callback { +class check_init : public callback_base { public: - lbann_callback_check_init() : lbann_callback() {} - lbann_callback_check_init(const lbann_callback_check_init&) = default; - lbann_callback_check_init& operator=( - const lbann_callback_check_init&) = default; - lbann_callback_check_init* copy() const override { - return new lbann_callback_check_init(*this); + check_init() : callback_base() {} + check_init(const check_init&) = default; + check_init& operator=( + const check_init&) = default; + check_init* copy() const override { + return new check_init(*this); } /** Check initializations. */ void on_train_begin(model *m) override; @@ -55,8 +56,9 @@ class lbann_callback_check_init : public lbann_callback { // Builder function ADD_DEFAULT_CALLBACK_BUILDER( - lbann_callback_check_init, build_callback_check_init_from_pbuf) + check_init, build_check_init_callback_from_pbuf) -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECK_INIT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_check_metric.hpp b/include/lbann/callbacks/callback_check_metric.hpp index 0652629c89c..23b1ebfd317 100644 --- a/include/lbann/callbacks/callback_check_metric.hpp +++ b/include/lbann/callbacks/callback_check_metric.hpp @@ -31,26 +31,27 @@ #include namespace lbann { +namespace callback { /** Metric checking callback. * Checks if a metric value falls within an expected range. 
*/ -class lbann_callback_check_metric : public lbann_callback { +class check_metric : public callback_base { public: - lbann_callback_check_metric(std::string metric_name, - std::set modes, - EvalType lower_bound, - EvalType upper_bound, - bool error_on_failure); - lbann_callback_check_metric* copy() const override { - return new lbann_callback_check_metric(*this); + check_metric(std::string metric_name, + std::set modes, + EvalType lower_bound, + EvalType upper_bound, + bool error_on_failure); + check_metric* copy() const override { + return new check_metric(*this); } std::string name() const override { return "check metric"; } - void on_epoch_end(model* m) override { check_metric(*m); } - void on_validation_end(model* m) override { check_metric(*m); } - void on_test_end(model* m) override { check_metric(*m); } + void on_epoch_end(model* m) override { do_check_metric(*m); } + void on_validation_end(model* m) override { do_check_metric(*m); } + void on_test_end(model* m) override { do_check_metric(*m); } private: @@ -71,15 +72,16 @@ class lbann_callback_check_metric : public lbann_callback { /** Perform metric check. * Does nothing if current execution mode is not in m_modes; */ - void check_metric(const model& m) const; + void do_check_metric(const model& m) const; }; // Builder function -std::unique_ptr -build_callback_check_metric_from_pbuf( +std::unique_ptr +build_check_metric_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); +} // namespace callback } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECK_METRIC_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_checknan.hpp b/include/lbann/callbacks/callback_checknan.hpp index b76c4e9d382..fe72402e395 100644 --- a/include/lbann/callbacks/callback_checknan.hpp +++ b/include/lbann/callbacks/callback_checknan.hpp @@ -23,31 +23,32 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_checknan .hpp .cpp - Check matrices for invalid numbers +// check_nan .hpp .cpp - Check matrices for invalid numbers //////////////////////////////////////////////////////////////////////////////// -#ifndef LBANN_CALLBACKS_CALLBACK_CHECKNAN_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECKNAN_HPP_INCLUDED +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_NAN_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_NAN_HPP_INCLUDED #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * Check matrices for whether they include any NaNs or infs to help debugging. * This will kill the rank if such values are discovered. */ -class lbann_callback_checknan : public lbann_callback { +class check_nan : public callback_base { public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_end; + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_end; - lbann_callback_checknan() : lbann_callback() {} - lbann_callback_checknan(const lbann_callback_checknan&) = default; - lbann_callback_checknan& operator=( - const lbann_callback_checknan&) = default; - lbann_callback_checknan* copy() const override { - return new lbann_callback_checknan(*this); + check_nan() : callback_base() {} + check_nan(const check_nan&) = default; + check_nan& operator=( + const check_nan&) = default; + check_nan* copy() const override { + return new check_nan(*this); } /** Check that activations are good. 
   */
  void on_forward_prop_end(model *m, Layer *l) override;
  /** Check that gradients are good. */
  void on_backward_prop_end(model *m) override;
  /** Check that weights are good. */
  void on_batch_end(model *m) override;
-  std::string name() const override { return "checknan"; }
+  std::string name() const override { return "check_nan"; }
 };

 // Builder function
 ADD_DEFAULT_CALLBACK_BUILDER(
-  lbann_callback_checknan, build_callback_check_nan_from_pbuf)
+  check_nan, build_check_nan_callback_from_pbuf)

-} // namespace lbann
+} // namespace callback
+} // namespace lbann

-#endif // LBANN_CALLBACKS_CALLBACK_CHECKNAN_HPP_INCLUDED
+#endif // LBANN_CALLBACKS_CALLBACK_CHECK_NAN_HPP_INCLUDED
diff --git a/include/lbann/callbacks/callback_checkpoint.hpp b/include/lbann/callbacks/callback_checkpoint.hpp
index 0638accc64e..b96d85d1d00 100644
--- a/include/lbann/callbacks/callback_checkpoint.hpp
+++ b/include/lbann/callbacks/callback_checkpoint.hpp
@@ -23,7 +23,7 @@
 // implied. See the License for the specific language governing
 // permissions and limitations under the license.
 //
-// lbann_callback_checkpoint .hpp .cpp - Callback hooks to checkpoint model
+// checkpoint .hpp .cpp - Callback hooks to checkpoint model
 ////////////////////////////////////////////////////////////////////////////////
 #ifndef LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED
 #define LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED
@@ -32,9 +32,10 @@
 #include "lbann/io/persist.hpp"

 namespace lbann {
+namespace callback {

 /** @brief Checkpoint at given interval in given directory */
-class lbann_callback_checkpoint : public lbann_callback {
+class checkpoint : public callback_base {
 public:

  /** @brief Construct the checkpoint callback
@@ -51,14 +52,14 @@
   * @param ckpt_dist_epochs The frequency of distributed checkpoints in epochs
   * @param ckpt_dist_steps The frequency of distributed checkpoints in steps
   */
-  lbann_callback_checkpoint(std::string checkpoint_dir,
-                            int checkpoint_epochs,
-                            int checkpoint_steps,
-                            int checkpoint_secs,
-                            std::string per_rank_dir,
-                            int ckpt_dist_epochs,
-                            int ckpt_dist_steps) :
-    lbann_callback(),
+  checkpoint(std::string checkpoint_dir,
+             int checkpoint_epochs,
+             int checkpoint_steps,
+             int checkpoint_secs,
+             std::string per_rank_dir,
+             int ckpt_dist_epochs,
+             int ckpt_dist_steps) :
+    callback_base(),
     m_checkpoint_dir(checkpoint_dir),
     m_checkpoint_epochs(checkpoint_epochs),
     m_checkpoint_steps(checkpoint_steps),
     m_checkpoint_secs(checkpoint_secs),
     m_per_rank_dir(per_rank_dir),
     m_ckpt_dist_epochs(ckpt_dist_epochs),
     m_ckpt_dist_steps(ckpt_dist_steps) {}
-  lbann_callback_checkpoint(const lbann_callback_checkpoint&) = default;
-  lbann_callback_checkpoint& operator=(const lbann_callback_checkpoint&) = default;
-  lbann_callback_checkpoint* copy() const override { return new lbann_callback_checkpoint(*this); }
+  checkpoint(const checkpoint&) = default;
+  checkpoint& operator=(const checkpoint&) = default;
+  checkpoint* copy() const override { return new checkpoint(*this); }
  void setup(model *m) override;
  void on_epoch_end(model *m) override;
  void on_batch_end(model *m) override;
@@ -103,10 +104,11 @@
  }
  bool need_checkpoint(model *m);
-  bool checkpoint(model *m);
  bool restart(model *m);
  std::string name() const override { return "checkpoint"; }
 protected:
+  bool do_checkpoint(model *m);
+
 private:
  std::string
m_checkpoint_dir; int m_checkpoint_epochs; int m_checkpoint_steps; @@ -203,10 +205,11 @@ static inline bool read_latest(std::string filename, int *epochLast, int *trainL } // Builder function -std::unique_ptr -build_callback_checkpoint_from_pbuf( +std::unique_ptr +build_checkpoint_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_checksmall.hpp b/include/lbann/callbacks/callback_checksmall.hpp index c0ea4e3203c..c74e59319ad 100644 --- a/include/lbann/callbacks/callback_checksmall.hpp +++ b/include/lbann/callbacks/callback_checksmall.hpp @@ -23,15 +23,16 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_checksmall .hpp .cpp - Check matrices for small values +// check_small .hpp .cpp - Check matrices for small values //////////////////////////////////////////////////////////////////////////////// -#ifndef LBANN_CALLBACKS_CALLBACK_CHECKSMALL_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_CHECKSMALL_HPP_INCLUDED +#ifndef LBANN_CALLBACKS_CALLBACK_CHECK_SMALL_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_CHECK_SMALL_HPP_INCLUDED #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * Check matrices for whether they include any very small values to avoid @@ -41,17 +42,17 @@ namespace lbann { * smallest floating point value. * This will kill the rank if such values are discovered. */ -class lbann_callback_checksmall : public lbann_callback { +class check_small : public callback_base { public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_end; + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_end; - lbann_callback_checksmall() : lbann_callback() {} - lbann_callback_checksmall(const lbann_callback_checksmall&) = default; - lbann_callback_checksmall& operator=( - const lbann_callback_checksmall&) = default; - lbann_callback_checksmall* copy() const override { - return new lbann_callback_checksmall(*this); + check_small() : callback_base() {} + check_small(const check_small&) = default; + check_small& operator=( + const check_small&) = default; + check_small* copy() const override { + return new check_small(*this); } /** Check that activations are good. */ void on_forward_prop_end(model *m, Layer *l) override; @@ -59,7 +60,7 @@ class lbann_callback_checksmall : public lbann_callback { void on_backward_prop_end(model *m) override; /** Check that weights are good. */ void on_batch_end(model *m) override; - std::string name() const override { return "checksmall"; } + std::string name() const override { return "check_small"; } private: /** Smallest allowable value. 
*/ static const DataType m_threshold; @@ -69,8 +70,9 @@ class lbann_callback_checksmall : public lbann_callback { // Builder function ADD_DEFAULT_CALLBACK_BUILDER( - lbann_callback_checksmall, build_callback_check_small_from_pbuf) + check_small, build_check_small_callback_from_pbuf) -} // namespace lbann +} // namespace callback +} // namespace lbann -#endif // LBANN_CALLBACKS_CALLBACK_CHECKSMALL_HPP_INCLUDED +#endif // LBANN_CALLBACKS_CALLBACK_CHECK_SMALL_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_confusion_matrix.hpp b/include/lbann/callbacks/callback_confusion_matrix.hpp index c86b1b155b6..a0a723084cc 100644 --- a/include/lbann/callbacks/callback_confusion_matrix.hpp +++ b/include/lbann/callbacks/callback_confusion_matrix.hpp @@ -30,6 +30,7 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** Compute confusion matrix. * Confusion matrices are saved in CSV files of the form @@ -37,16 +38,16 @@ namespace lbann { * with prediction i and label j. The prediction and label layers are * assumed to output one-hot vectors for each mini-batch sample. */ -class lbann_callback_confusion_matrix : public lbann_callback { +class confusion_matrix : public callback_base { public: - lbann_callback_confusion_matrix(std::string prediction_layer, + confusion_matrix(std::string prediction_layer, std::string label_layer, std::string prefix); - lbann_callback_confusion_matrix(const lbann_callback_confusion_matrix&); - lbann_callback_confusion_matrix& operator=(const lbann_callback_confusion_matrix&); - lbann_callback_confusion_matrix* copy() const override { - return new lbann_callback_confusion_matrix(*this); + confusion_matrix(const confusion_matrix&); + confusion_matrix& operator=(const confusion_matrix&); + confusion_matrix* copy() const override { + return new confusion_matrix(*this); } std::string name() const override { return "confusion matrix"; } @@ -111,10 +112,11 @@ class lbann_callback_confusion_matrix : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_confusion_matrix_from_pbuf( +std::unique_ptr +build_confusion_matrix_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); +} // namespace callback } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_CONFUSION_MATRIX_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_debug.hpp b/include/lbann/callbacks/callback_debug.hpp index 3eaf65a1485..455e8b16c4b 100644 --- a/include/lbann/callbacks/callback_debug.hpp +++ b/include/lbann/callbacks/callback_debug.hpp @@ -30,6 +30,7 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * @brief Phase specific "printf debugging" @@ -41,7 +42,7 @@ namespace lbann { * if \ will print messages for all phases * */ -class lbann_callback_debug : public lbann_callback { +class debug : public callback_base { public: /** @brief Constructor. @@ -49,12 +50,12 @@ class lbann_callback_debug : public lbann_callback { * If modes is empty, status updates will be printed for all * execution modes. 
*/ - lbann_callback_debug(std::set modes, + debug(std::set modes, lbann_summary *summarizer = nullptr) : - lbann_callback(1, summarizer), m_modes(std::move(modes)) {} - lbann_callback_debug(const lbann_callback_debug&) = default; - lbann_callback_debug& operator=(const lbann_callback_debug&) = default; - lbann_callback_debug* copy() const override { return new lbann_callback_debug(*this); } + callback_base(1, summarizer), m_modes(std::move(modes)) {} + debug(const debug&) = default; + debug& operator=(const debug&) = default; + debug* copy() const override { return new debug(*this); } std::string name() const override { return "debug"; } /** @brief Print that a batch is beginning. */ @@ -66,14 +67,14 @@ class lbann_callback_debug : public lbann_callback { /** @brief Print that a layer's forward prop is ending. */ void on_batch_evaluate_end(model *m) override; - using lbann_callback::on_forward_prop_begin; - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_begin; - using lbann_callback::on_backward_prop_end; - using lbann_callback::on_evaluate_forward_prop_begin; - using lbann_callback::on_evaluate_forward_prop_end; - using lbann_callback::on_optimize_begin; - using lbann_callback::on_optimize_end; + using callback_base::on_forward_prop_begin; + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_begin; + using callback_base::on_backward_prop_end; + using callback_base::on_evaluate_forward_prop_begin; + using callback_base::on_evaluate_forward_prop_end; + using callback_base::on_optimize_begin; + using callback_base::on_optimize_end; /** @brief Print that a layer's forward prop is beginning. */ void on_forward_prop_begin(model *m, Layer *l) override; @@ -104,10 +105,11 @@ class lbann_callback_debug : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_debug_from_pbuf( +std::unique_ptr +build_debug_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); +} // namespace callback } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DEBUG_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_debug_io.hpp b/include/lbann/callbacks/callback_debug_io.hpp index aaefc9662b3..6d2b11866bb 100644 --- a/include/lbann/callbacks/callback_debug_io.hpp +++ b/include/lbann/callbacks/callback_debug_io.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_debug .hpp .cpp - Callback hooks to debug LBANN +// debug .hpp .cpp - Callback hooks to debug LBANN //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED @@ -35,30 +35,31 @@ #include "lbann/layers/io/input/input_layer.hpp" namespace lbann { +namespace callback { /** * Print status updates on where training is. 
*/ -class lbann_callback_debug_io : public lbann_callback { +class debug_io : public callback_base { public: - using lbann_callback::on_forward_prop_begin; - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_begin; - using lbann_callback::on_backward_prop_end; - using lbann_callback::on_evaluate_forward_prop_begin; - using lbann_callback::on_evaluate_forward_prop_end; + using callback_base::on_forward_prop_begin; + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_begin; + using callback_base::on_backward_prop_end; + using callback_base::on_evaluate_forward_prop_begin; + using callback_base::on_evaluate_forward_prop_end; /** * Debug a particular phase; use invalid to debug every phase. */ - lbann_callback_debug_io(execution_mode phase = execution_mode::invalid, + debug_io(execution_mode phase = execution_mode::invalid, int debug_lvl = 0, lbann_summary *summarizer = nullptr) : - lbann_callback(1, summarizer), m_debug_phase(phase), m_debug_lvl(debug_lvl) {} - lbann_callback_debug_io(const lbann_callback_debug_io&) = default; - lbann_callback_debug_io& operator=( - const lbann_callback_debug_io&) = default; - lbann_callback_debug_io* copy() const override { return new lbann_callback_debug_io(*this); } + callback_base(1, summarizer), m_debug_phase(phase), m_debug_lvl(debug_lvl) {} + debug_io(const debug_io&) = default; + debug_io& operator=( + const debug_io&) = default; + debug_io* copy() const override { return new debug_io(*this); } /** Print that a training epoch is being started. */ void on_epoch_begin(model *m) override; /** Print that forward prop for a layer is beginning. */ @@ -85,10 +86,11 @@ class lbann_callback_debug_io : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_debug_io_from_pbuf( +std::unique_ptr +build_debug_io_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DEBUG_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_error_signals.hpp b/include/lbann/callbacks/callback_dump_error_signals.hpp index 456cb90f884..38644bdac52 100644 --- a/include/lbann/callbacks/callback_dump_error_signals.hpp +++ b/include/lbann/callbacks/callback_dump_error_signals.hpp @@ -30,22 +30,23 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** Dump gradients w.r.t. inputs to file. * After each layer performs a backward prop step, this callback will * dump the gradients w.r.t. inputs (the "error signals") to a * human-readable ASCII file. This is slow and produces a lot of output. */ -class lbann_callback_dump_error_signals : public lbann_callback { +class dump_error_signals : public callback_base { public: /** Constructor. * @param basename The basename for output files. 
*/ - lbann_callback_dump_error_signals(std::string basename = "") - : lbann_callback(), m_basename(basename) {} - lbann_callback_dump_error_signals* copy() const override { - return new lbann_callback_dump_error_signals(*this); + dump_error_signals(std::string basename = "") + : callback_base(), m_basename(basename) {} + dump_error_signals* copy() const override { + return new dump_error_signals(*this); } std::string name() const override { return "dump error signals"; } @@ -59,10 +60,11 @@ class lbann_callback_dump_error_signals : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_dump_error_signals_from_pbuf( +std::unique_ptr +build_dump_error_signals_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DUMP_ERROR_SIGNALS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_gradients.hpp b/include/lbann/callbacks/callback_dump_gradients.hpp index 4c2947feedd..ddbad213c14 100644 --- a/include/lbann/callbacks/callback_dump_gradients.hpp +++ b/include/lbann/callbacks/callback_dump_gradients.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_dump_gradients .hpp .cpp - Callbacks to dump gradients +// dump_gradients .hpp .cpp - Callbacks to dump gradients //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED @@ -34,6 +34,7 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * @brief Dump gradient matrices to files. @@ -44,22 +45,22 @@ namespace lbann { * that isn't easily done in LBANN. Note this dumps matrices during * each mini-batch. This will be slow and produce a lot of output. */ -class lbann_callback_dump_gradients : public lbann_callback { +class dump_gradients : public callback_base { public: - using lbann_callback::on_backward_prop_end; + using callback_base::on_backward_prop_end; /** * @param basename The basename for writing files. 
* @param batch_interval The frequency at which to dump the gradients */ - lbann_callback_dump_gradients(std::string basename, int batch_interval = 1) : - lbann_callback(batch_interval), m_basename(std::move(basename)) {} - lbann_callback_dump_gradients( - const lbann_callback_dump_gradients&) = default; - lbann_callback_dump_gradients& operator=( - const lbann_callback_dump_gradients&) = default; - lbann_callback_dump_gradients* copy() const override { - return new lbann_callback_dump_gradients(*this); + dump_gradients(std::string basename, int batch_interval = 1) : + callback_base(batch_interval), m_basename(std::move(basename)) {} + dump_gradients( + const dump_gradients&) = default; + dump_gradients& operator=( + const dump_gradients&) = default; + dump_gradients* copy() const override { + return new dump_gradients(*this); } void on_backward_prop_end(model *m) override; std::string name() const override { return "dump gradients"; } @@ -69,10 +70,11 @@ class lbann_callback_dump_gradients : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_dump_gradients_from_pbuf( +std::unique_ptr +build_dump_gradients_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DUMP_GRADIENTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp b/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp index 0ad5e12b58c..f3f6ddd1e48 100644 --- a/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp +++ b/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_dump_minibatch_sample_indices .hpp .cpp - Callbacks +// dump_minibatch_sample_indices .hpp .cpp - Callbacks // to dump the list of indices per minibatch //////////////////////////////////////////////////////////////////////////////// @@ -35,6 +35,7 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * @brief Dump sample indices for each minibatch to files. @@ -43,24 +44,24 @@ namespace lbann { * vectors during each mini-batch. This will be slow and produce a lot * of output. */ -class lbann_callback_dump_minibatch_sample_indices : public lbann_callback { +class dump_minibatch_sample_indices : public callback_base { public: - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_evaluate_forward_prop_end; + using callback_base::on_forward_prop_end; + using callback_base::on_evaluate_forward_prop_end; /** * @param basename The basename for writing files. 
* @param batch_interval The frequency at which to dump sample indices */ - lbann_callback_dump_minibatch_sample_indices(std::string basename, + dump_minibatch_sample_indices(std::string basename, int batch_interval = 1) : - lbann_callback(batch_interval), m_basename(std::move(basename)) {} - lbann_callback_dump_minibatch_sample_indices( - const lbann_callback_dump_minibatch_sample_indices&) = default; - lbann_callback_dump_minibatch_sample_indices& operator=( - const lbann_callback_dump_minibatch_sample_indices&) = default; - lbann_callback_dump_minibatch_sample_indices* copy() const override { - return new lbann_callback_dump_minibatch_sample_indices(*this); + callback_base(batch_interval), m_basename(std::move(basename)) {} + dump_minibatch_sample_indices( + const dump_minibatch_sample_indices&) = default; + dump_minibatch_sample_indices& operator=( + const dump_minibatch_sample_indices&) = default; + dump_minibatch_sample_indices* copy() const override { + return new dump_minibatch_sample_indices(*this); } void on_forward_prop_end(model *m, Layer *l) override; void on_evaluate_forward_prop_end(model *m, Layer *l) override; @@ -74,10 +75,11 @@ class lbann_callback_dump_minibatch_sample_indices : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_dump_mb_indices_from_pbuf( +std::unique_ptr +build_dump_mb_indices_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DUMP_MINIBATCH_SAMPLE_INDICES_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_outputs.hpp b/include/lbann/callbacks/callback_dump_outputs.hpp index 44ff4f8b1a3..babc685ffad 100644 --- a/include/lbann/callbacks/callback_dump_outputs.hpp +++ b/include/lbann/callbacks/callback_dump_outputs.hpp @@ -33,6 +33,7 @@ #include namespace lbann { +namespace callback { /** @brief Dump layer output tensors to files. * @@ -50,7 +51,7 @@ namespace lbann { * * CNPY is required to export to NumPy file formats (npy and npz). */ -class lbann_callback_dump_outputs : public lbann_callback { +class dump_outputs : public callback_base { public: /** @brief Construct a callback to dump outputs. @@ -66,23 +67,25 @@ class lbann_callback_dump_outputs : public lbann_callback { * @param file_format Output file format. Options are csv, tsv, * npy, npz (default: csv). */ - lbann_callback_dump_outputs( + dump_outputs( std::set layer_names,// = std::set(), std::set modes, // = std::set(), El::Int batch_interval = 0, std::string directory = "", std::string file_format = ""); - lbann_callback_dump_outputs* copy() const override { - return new lbann_callback_dump_outputs(*this); + dump_outputs* copy() const override { + return new dump_outputs(*this); } std::string name() const override { return "dump outputs"; } - void on_forward_prop_end(model* m, Layer* l) override { dump_outputs(*m, *l); } + void on_forward_prop_end(model* m, Layer* l) override { + do_dump_outputs(*m, *l); + } void on_evaluate_forward_prop_end(model* m, Layer* l) override { - if(m->get_step() % m_batch_interval == 0) { - dump_outputs(*m, *l); - } + if(m->get_step() % m_batch_interval == 0) { + do_dump_outputs(*m, *l); + } } private: @@ -108,15 +111,16 @@ class lbann_callback_dump_outputs : public lbann_callback { /** @brief Dump outputs to file. * @details Returns immediately if an output dump is not needed. 
*/ - void dump_outputs(const model& m, const Layer& l); + void do_dump_outputs(const model& m, const Layer& l); }; // Builder function -std::unique_ptr -build_callback_dump_outputs_from_pbuf( +std::unique_ptr +build_dump_outputs_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); +} // namespace callback } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DUMP_OUTPUTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_dump_weights.hpp b/include/lbann/callbacks/callback_dump_weights.hpp index 462bd1f6ba9..c28b7e4c461 100644 --- a/include/lbann/callbacks/callback_dump_weights.hpp +++ b/include/lbann/callbacks/callback_dump_weights.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_dump_weights .hpp .cpp - Callbacks to dump weight matrices +// dump_weights .hpp .cpp - Callbacks to dump weight matrices //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED @@ -34,6 +34,7 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * Dump weight matrices to files. @@ -42,18 +43,18 @@ namespace lbann { * is not meant for checkpointing, but for exporting weight matrices for * analysis that isn't easily done in LBANN. */ -class lbann_callback_dump_weights : public lbann_callback { +class dump_weights : public callback_base { public: /** * @param basename The basename for writing files. */ - lbann_callback_dump_weights(std::string basename) : - lbann_callback(), m_basename(std::move(basename)) {} - lbann_callback_dump_weights(const lbann_callback_dump_weights&) = default; - lbann_callback_dump_weights& operator=( - const lbann_callback_dump_weights&) = default; - lbann_callback_dump_weights* copy() const override { - return new lbann_callback_dump_weights(*this); + dump_weights(std::string basename) : + callback_base(), m_basename(std::move(basename)) {} + dump_weights(const dump_weights&) = default; + dump_weights& operator=( + const dump_weights&) = default; + dump_weights* copy() const override { + return new dump_weights(*this); } void on_train_begin(model *m) override; void on_epoch_end(model *m) override; @@ -62,14 +63,15 @@ class lbann_callback_dump_weights : public lbann_callback { /** Basename for writing files. */ std::string m_basename; /// Dump weights from learning layers. - void dump_weights(model *m, std::string s = ""); + void do_dump_weights(model *m, std::string s = ""); }; // Builder function -std::unique_ptr -build_callback_dump_weights_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*); +std::unique_ptr +build_dump_weights_callback_from_pbuf( + const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_DUMP_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_early_stopping.hpp b/include/lbann/callbacks/callback_early_stopping.hpp index dc1050d291c..8445d20c790 100644 --- a/include/lbann/callbacks/callback_early_stopping.hpp +++ b/include/lbann/callbacks/callback_early_stopping.hpp @@ -34,21 +34,22 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * Stop training after validation error stops improving. 
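 *
 * A minimal construction sketch (the patience value is arbitrary and the
 * usual model::add_callback registration is assumed):
 * @code{.cpp}
 * // Stop training if the validation score has not improved for 5 epochs.
 * auto* cb = new lbann::callback::early_stopping(5);
 * m->add_callback(cb);  // m is an lbann::model*
 * @endcode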
*/ -class lbann_callback_early_stopping : public lbann_callback { +class early_stopping : public callback_base { public: /** * Continue training until score has not improved for patience epochs. */ - lbann_callback_early_stopping(int64_t patience); - lbann_callback_early_stopping(const lbann_callback_early_stopping&) = default; - lbann_callback_early_stopping& operator=( - const lbann_callback_early_stopping&) = default; - lbann_callback_early_stopping* copy() const override { - return new lbann_callback_early_stopping(*this); + early_stopping(int64_t patience); + early_stopping(const early_stopping&) = default; + early_stopping& operator=( + const early_stopping&) = default; + early_stopping* copy() const override { + return new early_stopping(*this); } /** Update validation score and check for early stopping. */ void on_validation_end(model *m) override; @@ -63,10 +64,11 @@ class lbann_callback_early_stopping : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_early_stopping_from_pbuf( +std::unique_ptr +build_early_stopping_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_EARLY_STOPPING_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_gpu_memory_usage.hpp b/include/lbann/callbacks/callback_gpu_memory_usage.hpp index 610d7d0bc66..c594c787f3e 100644 --- a/include/lbann/callbacks/callback_gpu_memory_usage.hpp +++ b/include/lbann/callbacks/callback_gpu_memory_usage.hpp @@ -26,30 +26,33 @@ // callback_gpu_memory_usage .hpp .cpp - Callbacks for printing GPU memory usage //////////////////////////////////////////////////////////////////////////////// -#ifndef __LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED -#define __LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED +#ifndef LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { + /** Callback hooks for printing GPU memory usage. */ -class lbann_callback_gpu_memory_usage : public lbann_callback { +class gpu_memory_usage : public callback_base { public: /** Constructor. 
*/ - lbann_callback_gpu_memory_usage() = default; - lbann_callback_gpu_memory_usage(const lbann_callback_gpu_memory_usage&) = default; - lbann_callback_gpu_memory_usage& operator=(const lbann_callback_gpu_memory_usage&) = default; - lbann_callback_gpu_memory_usage* copy() const override { return new lbann_callback_gpu_memory_usage(*this); } + gpu_memory_usage() = default; + gpu_memory_usage(const gpu_memory_usage&) = default; + gpu_memory_usage& operator=(const gpu_memory_usage&) = default; + gpu_memory_usage* copy() const override { return new gpu_memory_usage(*this); } void on_epoch_begin(model *m) override; std::string name() const override { return "GPU memory usage"; } }; // Builder function ADD_DEFAULT_CALLBACK_BUILDER( - lbann_callback_gpu_memory_usage, build_callback_gpu_memory_usage_from_pbuf); + gpu_memory_usage, build_gpu_memory_usage_callback_from_pbuf); -} // namespace lbann +} // namespace callback +} // namespace lbann -#endif // __LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED +#endif // LBANN_CALLBACKS_CALLBACK_GPU_MEMORY_USAGE_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_hang.hpp b/include/lbann/callbacks/callback_hang.hpp index afa6fccbd77..d408a110a61 100644 --- a/include/lbann/callbacks/callback_hang.hpp +++ b/include/lbann/callbacks/callback_hang.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_hang .hpp .cpp - Callback to hang LBANN for debuggers +// hang .hpp .cpp - Callback to hang LBANN for debuggers //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED @@ -32,6 +32,7 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * Hang LBANN as training starts so debuggers can attach. @@ -39,16 +40,16 @@ namespace lbann { * Attach to the hung ranks and set the hang flag to false with a debugger to * proceed. */ -class lbann_callback_hang : public lbann_callback { +class hang : public callback_base { public: /** * @param rank_to_hang The rank to hang; -1 for every rank (default). */ - lbann_callback_hang(int rank_to_hang = -1) : + hang(int rank_to_hang = -1) : m_rank_to_hang(rank_to_hang) {} - lbann_callback_hang(const lbann_callback_hang&) = default; - lbann_callback_hang& operator=(const lbann_callback_hang&) = default; - lbann_callback_hang* copy() const override { return new lbann_callback_hang(*this); } + hang(const hang&) = default; + hang& operator=(const hang&) = default; + hang* copy() const override { return new hang(*this); } void setup(model* m) override; @@ -68,10 +69,11 @@ class lbann_callback_hang : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_hang_from_pbuf( +std::unique_ptr +build_hang_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_HANG_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_imcomm.hpp b/include/lbann/callbacks/callback_imcomm.hpp index d75d0bb5e44..e906d68a0ea 100644 --- a/include/lbann/callbacks/callback_imcomm.hpp +++ b/include/lbann/callbacks/callback_imcomm.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. 
// -// lbann_callback_imcomm .hpp .cpp - Send gradient updates between models +// imcomm .hpp .cpp - Send gradient updates between models //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED @@ -35,14 +35,15 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * Support inter-model communication after each mini-batch to synchronize * gradient updates. */ -class lbann_callback_imcomm : public lbann_callback { +class imcomm : public callback_base { public: - using lbann_callback::on_backward_prop_end; + using callback_base::on_backward_prop_end; enum comm_type { NONE, /** Do no gradient updates. */ @@ -52,18 +53,18 @@ class lbann_callback_imcomm : public lbann_callback { /** * Initialize with ct being used for all weights. */ - lbann_callback_imcomm(comm_type ct = NORMAL, + imcomm(comm_type ct = NORMAL, lbann_summary *summarizer = nullptr); - lbann_callback_imcomm(const lbann_callback_imcomm&) = default; - lbann_callback_imcomm& operator=(const lbann_callback_imcomm&) = default; - lbann_callback_imcomm* copy() const override { - return new lbann_callback_imcomm(*this); + imcomm(const imcomm&) = default; + imcomm& operator=(const imcomm&) = default; + imcomm* copy() const override { + return new imcomm(*this); } /** * Convenience initialization to do one update type for specific weights. * Implies no inter-model updates for other weights. */ - lbann_callback_imcomm(comm_type ct, std::unordered_set weights_list, + imcomm(comm_type ct, std::unordered_set weights_list, lbann_summary *summarizer = nullptr); /** Choose comm type ct for weights. */ @@ -95,13 +96,14 @@ class lbann_callback_imcomm : public lbann_callback { /** returns a string representation of the weight_initialization */ -std::string get_comm_type_name(lbann_callback_imcomm::comm_type m); +std::string get_comm_type_name(imcomm::comm_type m); // Builder function -std::unique_ptr -build_callback_imcomm_from_pbuf( +std::unique_ptr +build_imcomm_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_IMCOMM_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_io.hpp b/include/lbann/callbacks/callback_io.hpp index 0149617693e..58c9a2e668d 100644 --- a/include/lbann/callbacks/callback_io.hpp +++ b/include/lbann/callbacks/callback_io.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_io .hpp .cpp - Callback hooks for I/O monitoring +// io .hpp .cpp - Callback hooks for I/O monitoring //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_IO_HPP_INCLUDED @@ -37,21 +37,22 @@ #include namespace lbann { +namespace callback { /** * Print information on the amount of IO that layers do. */ -class lbann_callback_io : public lbann_callback { +class io : public callback_base { public: - lbann_callback_io() = default; + io() = default; /** Only apply to specific layers. 
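 *
 * A minimal construction sketch (the layer name is hypothetical and
 * model::add_callback registration is assumed):
 * @code{.cpp}
 * // Report I/O statistics only for the layer named "input".
 * auto* cb = new lbann::callback::io({"input"});
 * m->add_callback(cb);  // m is an lbann::model*
 * @endcode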
*/ - lbann_callback_io(std::vector const& layers) + io(std::vector const& layers) : m_layers(layers.begin(), layers.end()) {} - lbann_callback_io(const lbann_callback_io&) = default; - lbann_callback_io& operator=(const lbann_callback_io&) = default; - lbann_callback_io* copy() const override { - return new lbann_callback_io(*this); + io(const io&) = default; + io& operator=(const io&) = default; + io* copy() const override { + return new io(*this); } /** Report how much I/O has occured per data reader */ void on_epoch_end(model *m) override; @@ -63,10 +64,11 @@ class lbann_callback_io : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_disp_io_stats_from_pbuf( +std::unique_ptr +build_disp_io_stats_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_IO_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_learning_rate.hpp b/include/lbann/callbacks/callback_learning_rate.hpp index 43a3c57b073..77ff3d486ae 100644 --- a/include/lbann/callbacks/callback_learning_rate.hpp +++ b/include/lbann/callbacks/callback_learning_rate.hpp @@ -34,27 +34,28 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { -// Different schedules should inherit from lbann_callback_learning_rate. +// Different schedules should inherit from learning_rate. /** * Base class for learning rate schedules. * Child classes should implement the schedule method to make changes. */ -class lbann_callback_learning_rate : public lbann_callback { +class learning_rate : public callback_base { public: - lbann_callback_learning_rate(); - lbann_callback_learning_rate(const lbann_callback_learning_rate&) = default; - lbann_callback_learning_rate& operator=( - const lbann_callback_learning_rate&) = default; + learning_rate(); + learning_rate(const learning_rate&) = default; + learning_rate& operator=( + const learning_rate&) = default; /** Only apply to specific weights. */ - lbann_callback_learning_rate(std::vector weights_names); + learning_rate(std::vector weights_names); /** Do some initialization. */ void setup(model *m) override; /** Apply global learning rate schedules. */ void on_epoch_end(model *m) override; - using lbann_callback::on_backward_prop_end; + using callback_base::on_backward_prop_end; /** Apply local/per-optimizer learning rate schedules. */ void on_backward_prop_end(model *m) override; protected: @@ -109,18 +110,18 @@ class lbann_callback_learning_rate : public lbann_callback { /** * Decrease the learning rate by a fixed proportion every X epochs. */ -class lbann_callback_step_learning_rate : public lbann_callback_learning_rate { +class step_learning_rate : public learning_rate { public: /** Decrease the learning rate by amt every step epochs. 
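 *
 * A worked sketch of the schedule (assuming amt is the factor applied to
 * the current rate, and the usual model::add_callback registration):
 * @code{.cpp}
 * // Halve the learning rate every 10 epochs: lr(e) = lr0 * 0.5^floor(e/10).
 * auto* cb = new lbann::callback::step_learning_rate(10, 0.5f);
 * m->add_callback(cb);  // m is an lbann::model*
 * @endcode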
*/ - lbann_callback_step_learning_rate(int step, float amt); - lbann_callback_step_learning_rate(int step, float amt, + step_learning_rate(int step, float amt); + step_learning_rate(int step, float amt, std::vector weights_names); - lbann_callback_step_learning_rate( - const lbann_callback_step_learning_rate&) = default; - lbann_callback_step_learning_rate& operator=( - const lbann_callback_step_learning_rate&) = default; - lbann_callback_step_learning_rate* copy() const override { - return new lbann_callback_step_learning_rate(*this); + step_learning_rate( + const step_learning_rate&) = default; + step_learning_rate& operator=( + const step_learning_rate&) = default; + step_learning_rate* copy() const override { + return new step_learning_rate(*this); } std::string name() const override { return "step learning rate"; } protected: @@ -133,29 +134,29 @@ class lbann_callback_step_learning_rate : public lbann_callback_learning_rate { }; // Builder function -std::unique_ptr -build_callback_step_learning_rate_from_pbuf( +std::unique_ptr +build_step_learning_rate_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); /** * Decrease the learning rate by a fixed proportion when validation error stops * improving. */ -class lbann_callback_adaptive_learning_rate : public lbann_callback_learning_rate { +class adaptive_learning_rate : public learning_rate { public: /** * Decrease the learning rate by amt if accuracy does not improve for patience * epochs. */ - lbann_callback_adaptive_learning_rate(int64_t patience, float amt); - lbann_callback_adaptive_learning_rate(int64_t patience, float amt, + adaptive_learning_rate(int64_t patience, float amt); + adaptive_learning_rate(int64_t patience, float amt, std::vector weights_names); - lbann_callback_adaptive_learning_rate( - const lbann_callback_adaptive_learning_rate&) = default; - lbann_callback_adaptive_learning_rate& operator=( - const lbann_callback_adaptive_learning_rate&) = default; - lbann_callback_adaptive_learning_rate* copy() const override { - return new lbann_callback_adaptive_learning_rate(*this); + adaptive_learning_rate( + const adaptive_learning_rate&) = default; + adaptive_learning_rate& operator=( + const adaptive_learning_rate&) = default; + adaptive_learning_rate* copy() const override { + return new adaptive_learning_rate(*this); } std::string name() const override { return "adaptive learning rate"; } protected: @@ -176,31 +177,31 @@ class lbann_callback_adaptive_learning_rate : public lbann_callback_learning_rat }; // Builder function -std::unique_ptr -build_callback_adaptive_learning_rate_from_pbuf( +std::unique_ptr +build_adaptive_learning_rate_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); /** * Decrease learning rate by a fixed amount at fixed times. */ -class lbann_callback_drop_fixed_learning_rate : - public lbann_callback_learning_rate { +class drop_fixed_learning_rate : + public learning_rate { public: /** * Decrease the learning rate by amt when each epoch in drop_epochs is * reached. 
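 *
 * A sketch of the classic staircase schedule (assuming amt multiplies the
 * current rate at each listed epoch):
 * @code{.cpp}
 * // Scale the learning rate by 0.1 at epochs 30, 60, and 90.
 * auto* cb = new lbann::callback::drop_fixed_learning_rate({30, 60, 90}, 0.1f);
 * m->add_callback(cb);  // m is an lbann::model*
 * @endcode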
*/ - lbann_callback_drop_fixed_learning_rate( + drop_fixed_learning_rate( std::vector drop_epochs, float amt); - lbann_callback_drop_fixed_learning_rate( + drop_fixed_learning_rate( std::vector drop_epochs, float amt, std::vector weights_names); - lbann_callback_drop_fixed_learning_rate( - const lbann_callback_drop_fixed_learning_rate&) = default; - lbann_callback_drop_fixed_learning_rate& operator=( - const lbann_callback_drop_fixed_learning_rate&) = default; - lbann_callback_drop_fixed_learning_rate* copy() const override { - return new lbann_callback_drop_fixed_learning_rate(*this); + drop_fixed_learning_rate( + const drop_fixed_learning_rate&) = default; + drop_fixed_learning_rate& operator=( + const drop_fixed_learning_rate&) = default; + drop_fixed_learning_rate* copy() const override { + return new drop_fixed_learning_rate(*this); } std::string name() const override { return "drop fixed learning rate"; } protected: @@ -216,8 +217,8 @@ class lbann_callback_drop_fixed_learning_rate : }; // Builder function -std::unique_ptr -build_callback_drop_fixed_learning_rate_from_pbuf( +std::unique_ptr +build_drop_fixed_learning_rate_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); /** @@ -227,25 +228,25 @@ build_callback_drop_fixed_learning_rate_from_pbuf( * learning rate. This also *forces* its schedule and will stomp over * other changes. */ -class lbann_callback_linear_growth_learning_rate : - public lbann_callback_learning_rate { +class linear_growth_learning_rate : + public learning_rate { public: /** * Linearly increase the learning rate to reach target after num_epochs. */ - lbann_callback_linear_growth_learning_rate( + linear_growth_learning_rate( float target, int64_t num_epochs); - lbann_callback_linear_growth_learning_rate( + linear_growth_learning_rate( float target, int64_t num_epochs, int64_t delay); - lbann_callback_linear_growth_learning_rate( + linear_growth_learning_rate( float target, int64_t num_epochs, int64_t delay, std::vector weights_names); - lbann_callback_linear_growth_learning_rate( - const lbann_callback_linear_growth_learning_rate&) = default; - lbann_callback_linear_growth_learning_rate& operator=( - const lbann_callback_linear_growth_learning_rate&) = default; - lbann_callback_linear_growth_learning_rate* copy() const override { - return new lbann_callback_linear_growth_learning_rate(*this); } + linear_growth_learning_rate( + const linear_growth_learning_rate&) = default; + linear_growth_learning_rate& operator=( + const linear_growth_learning_rate&) = default; + linear_growth_learning_rate* copy() const override { + return new linear_growth_learning_rate(*this); } void setup(model *m) override; std::string name() const override { return "linear growth learning rate"; } protected: @@ -264,8 +265,8 @@ class lbann_callback_linear_growth_learning_rate : }; // Builder function -std::unique_ptr -build_callback_linear_growth_learning_rate_from_pbuf( +std::unique_ptr +build_linear_growth_learning_rate_callback_from_pbuf( const google::protobuf::Message&,lbann_summary*); /** @@ -274,17 +275,17 @@ build_callback_linear_growth_learning_rate_from_pbuf( * base_lr is the initial learning rate, i_cur is the current iteration, * i_max is the maximum iteration, and p is a parameter. 
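 *
 * One common form of this polynomial decay is (a sketch; the end learning
 * rate accepted by the second constructor is omitted here):
 * @f[
 *   \mathrm{lr}(i_{\mathrm{cur}}) = \mathrm{base\_lr}
 *     \left(1 - \frac{i_{\mathrm{cur}}}{i_{\mathrm{max}}}\right)^{p}
 * @f]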
*/ -class lbann_callback_poly_learning_rate : public lbann_callback_learning_rate { +class poly_learning_rate : public learning_rate { public: - lbann_callback_poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter); - lbann_callback_poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter, double endl_r, + poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter); + poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter, double endl_r, std::vector weights_names); - lbann_callback_poly_learning_rate( - const lbann_callback_poly_learning_rate&) = default; - lbann_callback_poly_learning_rate& operator=( - const lbann_callback_poly_learning_rate&) = default; - lbann_callback_poly_learning_rate* copy() const override { - return new lbann_callback_poly_learning_rate(*this); + poly_learning_rate( + const poly_learning_rate&) = default; + poly_learning_rate& operator=( + const poly_learning_rate&) = default; + poly_learning_rate* copy() const override { + return new poly_learning_rate(*this); } void setup(model *m) override; std::string name() const override { return "poly learning rate"; } @@ -307,9 +308,9 @@ class lbann_callback_poly_learning_rate : public lbann_callback_learning_rate { }; // Builder function -std::unique_ptr -build_callback_poly_learning_rate_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*); +std::unique_ptr +build_poly_learning_rate_callback_from_pbuf( + const google::protobuf::Message&, lbann_summary*); /** * This implements an adaptive scheme for adjust each optimizer's @@ -318,17 +319,17 @@ build_callback_poly_learning_rate_from_pbuf( * See: You et al. "Scaling SGD Batch Size to 32K for ImageNet * Training", 2017. */ -class lbann_callback_optimizerwise_adaptive_learning_rate : public lbann_callback_learning_rate { +class optimizerwise_adaptive_learning_rate : public learning_rate { public: - lbann_callback_optimizerwise_adaptive_learning_rate(float scale); - lbann_callback_optimizerwise_adaptive_learning_rate( + optimizerwise_adaptive_learning_rate(float scale); + optimizerwise_adaptive_learning_rate( float scale, std::vector weights_names); - lbann_callback_optimizerwise_adaptive_learning_rate( - const lbann_callback_optimizerwise_adaptive_learning_rate&) = default; - lbann_callback_optimizerwise_adaptive_learning_rate& operator=( - const lbann_callback_optimizerwise_adaptive_learning_rate&) = default; - lbann_callback_optimizerwise_adaptive_learning_rate* copy() const override { - return new lbann_callback_optimizerwise_adaptive_learning_rate(*this); } + optimizerwise_adaptive_learning_rate( + const optimizerwise_adaptive_learning_rate&) = default; + optimizerwise_adaptive_learning_rate& operator=( + const optimizerwise_adaptive_learning_rate&) = default; + optimizerwise_adaptive_learning_rate* copy() const override { + return new optimizerwise_adaptive_learning_rate(*this); } std::string name() const override { return "optimizerwise adaptive learning rate"; } protected: float optimizer_schedule(model *m, optimizer &opt) override; @@ -337,10 +338,11 @@ class lbann_callback_optimizerwise_adaptive_learning_rate : public lbann_callbac }; // Builder function -std::unique_ptr -build_callback_optimizerwise_adaptive_learning_rate_from_pbuf( +std::unique_ptr +build_optimizerwise_adaptive_learning_rate_callback_from_pbuf( const google::protobuf::Message&,lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_LEARNING_RATE_HPP_INCLUDED diff --git 
a/include/lbann/callbacks/callback_ltfb.hpp b/include/lbann/callbacks/callback_ltfb.hpp index 4c4d915db09..f6a4adbd06e 100644 --- a/include/lbann/callbacks/callback_ltfb.hpp +++ b/include/lbann/callbacks/callback_ltfb.hpp @@ -33,6 +33,7 @@ #include namespace lbann { +namespace callback { /** @brief Tournament training. * @@ -57,7 +58,7 @@ namespace lbann { * @todo Exchange optimizer state. * @todo Support heterogeneous models. */ -class lbann_callback_ltfb : public lbann_callback { +class ltfb : public callback_base { public: /** Inter-trainer communication scheme for LTFB. @@ -114,7 +115,7 @@ class lbann_callback_ltfb : public lbann_callback { * @param comm_algo Inter-trainer communication scheme. * @param summarizer The summarizer to use for this callback */ - lbann_callback_ltfb( + ltfb( El::Int batch_interval, std::string metric_name, std::set weights_names = std::set(), @@ -122,9 +123,9 @@ class lbann_callback_ltfb : public lbann_callback { communication_algorithm comm_algo = communication_algorithm::sendrecv_weights, bool exchange_hyperparameters = false, lbann_summary *summarizer = nullptr); - lbann_callback_ltfb(const lbann_callback_ltfb& other); - lbann_callback_ltfb& operator=(const lbann_callback_ltfb& other); - lbann_callback_ltfb* copy() const override { return new lbann_callback_ltfb(*this); } + ltfb(const ltfb& other); + ltfb& operator=(const ltfb& other); + ltfb* copy() const override { return new ltfb(*this); } std::string name() const override { return "LTFB"; } void setup(model *m) override; @@ -169,10 +170,11 @@ class lbann_callback_ltfb : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_ltfb_from_pbuf( +std::unique_ptr +build_ltfb_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); +} // namespace callback } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_LTFB_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_mixup.hpp b/include/lbann/callbacks/callback_mixup.hpp index 5aa917f686e..9b67c08ef5e 100644 --- a/include/lbann/callbacks/callback_mixup.hpp +++ b/include/lbann/callbacks/callback_mixup.hpp @@ -33,6 +33,7 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * Apply mixup to named input layers. @@ -54,11 +55,11 @@ namespace lbann { * * The recommended default alpha (from the paper) is 0.4. */ -class callback_mixup : public lbann_callback { +class callback_mixup : public callback_base { public: /** Apply mixup to layers named in layers with mixup parameter alpha. */ callback_mixup(std::unordered_set layers, float alpha) : - lbann_callback(), m_layers(layers), m_alpha(alpha) { + callback_base(), m_layers(layers), m_alpha(alpha) { if (alpha < 0.0f) { LBANN_ERROR("Mixup alpha must be non-negative."); } @@ -77,10 +78,11 @@ class callback_mixup : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_mixup_from_pbuf( +std::unique_ptr +build_mixup_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_MIXUP_HPP diff --git a/include/lbann/callbacks/callback_perturb_adam.hpp b/include/lbann/callbacks/callback_perturb_adam.hpp index dbb49b1a645..3f6fc02db51 100644 --- a/include/lbann/callbacks/callback_perturb_adam.hpp +++ b/include/lbann/callbacks/callback_perturb_adam.hpp @@ -32,6 +32,7 @@ #include namespace lbann { +namespace callback { /** @brief Hyperparameter exploration with Adam optimizers. 
* @@ -44,7 +45,7 @@ namespace lbann { * @f$\log(\text{learning rate})@f$, @f$\log(1-\beta_1)@f$, * @f$\log(1-\beta_2)@f$, and @f$\log\epsilon@f$. */ -class lbann_callback_perturb_adam : public lbann_callback { +class perturb_adam : public callback_base { public: /** @param learning_rate_factor Standard deviation of learning rate @@ -66,7 +67,7 @@ class lbann_callback_perturb_adam : public lbann_callback { * empty, all Adam optimizers in the model are * perturbed. */ - lbann_callback_perturb_adam(DataType learning_rate_factor, + perturb_adam(DataType learning_rate_factor, DataType beta1_factor, DataType beta2_factor, DataType eps_factor = 0, @@ -74,7 +75,7 @@ class lbann_callback_perturb_adam : public lbann_callback { El::Int batch_interval = 1, std::set weights_names = std::set()); - lbann_callback_perturb_adam* copy() const override { return new lbann_callback_perturb_adam(*this); } + perturb_adam* copy() const override { return new perturb_adam(*this); } std::string name() const override { return "perturb Adam"; } void setup(model* m) override; @@ -123,10 +124,11 @@ class lbann_callback_perturb_adam : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_perturb_adam_from_pbuf( +std::unique_ptr +build_perturb_adam_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); +} // namespace callback } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_PERTURB_ADAM_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_perturb_dropout.hpp b/include/lbann/callbacks/callback_perturb_dropout.hpp index b629b0688e6..13754dd9dd1 100644 --- a/include/lbann/callbacks/callback_perturb_dropout.hpp +++ b/include/lbann/callbacks/callback_perturb_dropout.hpp @@ -32,12 +32,13 @@ #include namespace lbann { +namespace callback { /** @brief Hyperparameter exploration with dropouts. * * Goes through the dropout layers in a model and perturbs keep probability */ -class lbann_callback_perturb_dropout : public lbann_callback { +class perturb_dropout : public callback_base { public: /** @param keep_prob_factor Standard deviation of learning rate @@ -46,10 +47,10 @@ class lbann_callback_perturb_dropout : public lbann_callback { * empty, all dropout layers in the model are * perturbed. */ - lbann_callback_perturb_dropout(EvalType keep_prob_factor, + perturb_dropout(EvalType keep_prob_factor, std::set layer_names = std::set()); - lbann_callback_perturb_dropout* copy() const override { return new lbann_callback_perturb_dropout(*this); } + perturb_dropout* copy() const override { return new perturb_dropout(*this); } std::string name() const override { return "perturb dropout"; } void setup(model* m) override; @@ -77,10 +78,11 @@ class lbann_callback_perturb_dropout : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_perturb_dropout_from_pbuf( +std::unique_ptr +build_perturb_dropout_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); +} // namespace callback } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_PERTURB_DROPOUT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_print.hpp b/include/lbann/callbacks/callback_print.hpp index 95b1e142e6d..15f96c66a23 100644 --- a/include/lbann/callbacks/callback_print.hpp +++ b/include/lbann/callbacks/callback_print.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. 
// -// lbann_callback_print .hpp .cpp - Callback hooks to print information +// print .hpp .cpp - Callback hooks to print information //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED @@ -32,18 +32,19 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** Periodically print computational results. * Prints average objective function value and metric scores after * each training epoch and evaluation. */ -class lbann_callback_print : public lbann_callback { +class print : public callback_base { public: - lbann_callback_print(int batch_interval = 1, bool print_global_stat_only=false) : - lbann_callback(batch_interval), m_print_global_stat_only(print_global_stat_only) {} - lbann_callback_print(const lbann_callback_print&) = default; - lbann_callback_print& operator=(const lbann_callback_print&) = default; - lbann_callback_print* copy() const override { return new lbann_callback_print(*this); } + print(int batch_interval = 1, bool print_global_stat_only=false) : + callback_base(batch_interval), m_print_global_stat_only(print_global_stat_only) {} + print(const print&) = default; + print& operator=(const print&) = default; + print* copy() const override { return new print(*this); } void setup(model *m) override; void on_epoch_begin(model *m) override; void on_epoch_end(model *m) override; @@ -59,10 +60,11 @@ class lbann_callback_print : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_print_from_pbuf( +std::unique_ptr +build_print_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_replace_weights.hpp b/include/lbann/callbacks/callback_replace_weights.hpp index 359079de1a6..2b324ef7b55 100644 --- a/include/lbann/callbacks/callback_replace_weights.hpp +++ b/include/lbann/callbacks/callback_replace_weights.hpp @@ -33,6 +33,7 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * Weights/parameters replacement on k-batch end @@ -40,25 +41,25 @@ namespace lbann { * Can easily be extended to support replacement by weights name * Given two layers specified in prototext, weights are copied from source layer to destination layer. 
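 *
 * A minimal construction sketch (the layer names are hypothetical; the
 * source and destination lists must have the same length):
 * @code{.cpp}
 * // Copy weights from "encoder_fc" to "decoder_fc" every 100 mini-batches.
 * auto* cb = new lbann::callback::replace_weights(
 *   {"encoder_fc"}, {"decoder_fc"}, 100);
 * m->add_callback(cb);  // m is an lbann::model*
 * @endcode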
*/
-class lbann_callback_replace_weights : public lbann_callback {
+class replace_weights : public callback_base {
 public:
-  lbann_callback_replace_weights(
+  replace_weights(
     std::vector<std::string> src,
     std::vector<std::string> dst,
     int batch_interval=1)
-    : lbann_callback(batch_interval),
+    : callback_base(batch_interval),
     m_src_layer_names(std::move(src)),
     m_dst_layer_names(std::move(dst)) {
     if(m_src_layer_names.size() != m_dst_layer_names.size())
       LBANN_ERROR("In replace weights callback: number of src and dest layers does not match.");
   }
-  lbann_callback_replace_weights(
-    const lbann_callback_replace_weights&) = default;
-  lbann_callback_replace_weights& operator=(
-    const lbann_callback_replace_weights&) = default;
-  lbann_callback_replace_weights* copy() const override {
-    return new lbann_callback_replace_weights(*this);
+  replace_weights(
+    const replace_weights&) = default;
+  replace_weights& operator=(
+    const replace_weights&) = default;
+  replace_weights* copy() const override {
+    return new replace_weights(*this);
   }
   void setup(model *m) override;
   void on_batch_end(model *m) override;
@@ -70,10 +71,11 @@ class lbann_callback_replace_weights : public lbann_callback {
 };
 
 // Builder function
-std::unique_ptr<lbann_callback>
-build_callback_replace_weights_from_pbuf(
+std::unique_ptr<callback_base>
+build_replace_weights_callback_from_pbuf(
   const google::protobuf::Message&, lbann_summary*);
 
-} // namespace lbann
+} // namespace callback
+} // namespace lbann
 
 #endif // LBANN_CALLBACKS_CALLBACK_REPLACE_WEIGHTS_HPP_INCLUDED
diff --git a/include/lbann/callbacks/callback_save_images.hpp b/include/lbann/callbacks/callback_save_images.hpp
index 7f13971d1d4..b772516f630 100644
--- a/include/lbann/callbacks/callback_save_images.hpp
+++ b/include/lbann/callbacks/callback_save_images.hpp
@@ -32,12 +32,13 @@
 #include "lbann/callbacks/callback.hpp"
 
 namespace lbann {
+namespace callback {
 
 /** Save layer outputs as image files.
  *  Image files are in the form
  *  "<prefix><layer name>-<tag>.<format>".
 */
-class lbann_callback_save_images : public lbann_callback {
+class save_images : public callback_base {
 public:
 
   /** Constructor.
   * @param layer_names List of layer names to save as images.
   * @param image_format Image file format (e.g. jpg, png, pgm).
   * @param image_prefix Prefix for image file names.
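   *
   * A minimal construction sketch (the layer name and prefix are
   * hypothetical):
   * @code{.cpp}
   * // Save outputs of the "reconstruction" layer as PNG files.
   * auto* cb = new lbann::callback::save_images({"reconstruction"}, "png", "run1-");
   * m->add_callback(cb);  // m is an lbann::model*
   * @endcode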
*/
-  lbann_callback_save_images(std::vector<std::string> layer_names,
+  save_images(std::vector<std::string> layer_names,
              std::string image_format = "jpg",
              std::string image_prefix = "");
-  lbann_callback_save_images(const lbann_callback_save_images&) = default;
-  lbann_callback_save_images& operator=(
-    const lbann_callback_save_images&) = default;
-  lbann_callback_save_images* copy() const override {
-    return new lbann_callback_save_images(*this);
+  save_images(const save_images&) = default;
+  save_images& operator=(
+    const save_images&) = default;
+  save_images* copy() const override {
+    return new save_images(*this);
   }
   void on_epoch_end(model *m) override;
   void on_test_end(model *m) override;
@@ -72,10 +73,11 @@ class lbann_callback_save_images : public lbann_callback {
 };
 
 // Builder function
-std::unique_ptr<lbann_callback>
-build_callback_save_images_from_pbuf(
+std::unique_ptr<callback_base>
+build_save_images_callback_from_pbuf(
   const google::protobuf::Message&, lbann_summary*);
 
+} // namespace callback
 } // namespace lbann
 
 #endif // LBANN_CALLBACKS_CALLBACK_SAVE_IMAGES_HPP_INCLUDED
diff --git a/include/lbann/callbacks/callback_save_model.hpp b/include/lbann/callbacks/callback_save_model.hpp
index 2993ccc09fd..0d8e68f2085 100644
--- a/include/lbann/callbacks/callback_save_model.hpp
+++ b/include/lbann/callbacks/callback_save_model.hpp
@@ -23,7 +23,7 @@
 // implied. See the License for the specific language governing
 // permissions and limitations under the license.
 //
-// lbann_callback_save_model .hpp .cpp - Callbacks to save model, currently as protobuf
+// save_model .hpp .cpp - Callbacks to save model, currently as protobuf
 ////////////////////////////////////////////////////////////////////////////////
 
 #ifndef LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED
@@ -36,54 +36,65 @@
 #include 
 
 namespace lbann {
+namespace callback {
 
 /**
 * Save a model as a protobuf file and a set of weights
 */
-class lbann_callback_save_model : public lbann_callback {
+class save_model : public callback_base {
 public:
  /**
  * @param dir directory to save model
  * @param disable_save_after_training Don't save after training
  * @param extension file extension, e.g., model, state, etc.
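  *
  * A minimal construction sketch (the directory name is hypothetical):
  * @code{.cpp}
  * // Write the model under "ckpt/" in prototext form when training ends.
  * auto* cb = new lbann::callback::save_model("ckpt", false);
  * m->add_callback(cb);  // m is an lbann::model*
  * @endcode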
*/
-  lbann_callback_save_model(std::string dir,
+  save_model(std::string dir,
              bool disable_save_after_training,
              std::string extension="prototext") :
-    lbann_callback(), m_dir(std::move(dir)),
+    callback_base(), m_dir(std::move(dir)),
     m_disable_save_after_training(disable_save_after_training),
     m_extension(std::move(extension))
-    {}
-  lbann_callback_save_model(const lbann_callback_save_model&) = default;
-  lbann_callback_save_model& operator=(
-    const lbann_callback_save_model&) = default;
-  lbann_callback_save_model* copy() const override {
-    return new lbann_callback_save_model(*this);
+  {}
+  save_model(const save_model&) = default;
+  save_model& operator=(
+    const save_model&) = default;
+  save_model* copy() const override {
+    return new save_model(*this);
   }
   void on_train_end(model *m) override;
-  bool save_model(model *m);
-  bool save_model_weights(model *m);
  /* If the ckptdir_is_fullpath flag is true,
  * allow the user to specify the full path to the model weights to load,
  * and allow the system to skip appending the trainer id and the number of
  * epochs/steps to the default ckpt_dir */
-  static bool load_model_weights(std::string ckpt_dir, model *m, bool ckptdir_is_fullpath=false);
+  static bool load_model_weights(std::string ckpt_dir,
+                                 model *m,
+                                 bool ckptdir_is_fullpath=false);
 
   std::string name() const override { return "save model"; }
+
+ protected:
+  friend class lbann::model;
+
+  bool do_save_model(model *m);
+  bool do_save_model_weights(model *m);
+
 private:
   std::string m_dir; //directory to save file
-  bool m_disable_save_after_training; /// Disables the normal behavior of saving when training is complete
+  /// Disables the normal behavior of saving when training is complete
+  bool m_disable_save_after_training;
   std::string m_extension; //file extension
   persist p;
+
   void write_proto_binary(const lbann_data::Model& proto, const std::string filename);
   void write_proto_text(const lbann_data::Model& proto, const std::string filename);
 };
 
 // Builder function
-std::unique_ptr<lbann_callback>
-build_callback_save_model_from_pbuf(
+std::unique_ptr<callback_base>
+build_save_model_callback_from_pbuf(
   const google::protobuf::Message&, lbann_summary*);
 
-} // namespace lbann
+} // namespace callback
+} // namespace lbann
 
 #endif // LBANN_CALLBACKS_CALLBACK_SAVE_MODEL_HPP_INCLUDED
diff --git a/include/lbann/callbacks/callback_save_topk_models.hpp b/include/lbann/callbacks/callback_save_topk_models.hpp
index e5ed4a4f17d..a21801ffcea 100644
--- a/include/lbann/callbacks/callback_save_topk_models.hpp
+++ b/include/lbann/callbacks/callback_save_topk_models.hpp
@@ -23,7 +23,7 @@
 // implied. See the License for the specific language governing
 // permissions and limitations under the license.
 //
-// lbann_callback_save_topk_models .hpp .cpp - Callback to save top k models
+// save_topk_models .hpp .cpp - Callback to save top k models
 ////////////////////////////////////////////////////////////////////////////////
 
 #ifndef LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED
@@ -32,6 +32,7 @@
 #include "lbann/callbacks/callback_save_model.hpp"
 
 namespace lbann {
+namespace callback {
 
 /** Save the top k models (e.g., for inference and other analysis).
 * @param dir directory to save model
@@ -40,13 +41,13 @@ namespace lbann {
 * @param ascending_ordering ordering for the top k; descending order is the default
 * Note: may end up saving more than k models if multiple models (trainers) have the same metric score
 */
-class lbann_callback_save_topk_models : public lbann_callback_save_model {
+class save_topk_models : public save_model {
 public:
-  lbann_callback_save_topk_models(std::string dir, int k, std::string metric_name, bool ascending_ordering=false) :
-    lbann_callback_save_model(dir,true), m_k(k),m_metric_name(metric_name),m_ascending_ordering(ascending_ordering) {}
-  lbann_callback_save_topk_models(const lbann_callback_save_topk_models&) = default;
-  lbann_callback_save_topk_models& operator=(const lbann_callback_save_topk_models&) = default;
-  lbann_callback_save_topk_models* copy() const override { return new lbann_callback_save_topk_models(*this); }
+  save_topk_models(std::string dir, int k, std::string metric_name, bool ascending_ordering=false) :
+    save_model(dir,true), m_k(k),m_metric_name(metric_name),m_ascending_ordering(ascending_ordering) {}
+  save_topk_models(const save_topk_models&) = default;
+  save_topk_models& operator=(const save_topk_models&) = default;
+  save_topk_models* copy() const override { return new save_topk_models(*this); }
   void on_test_end(model *m) override;
   std::string name() const override { return "save_topk_models"; }
@@ -60,10 +61,11 @@ class lbann_callback_save_topk_models : public lbann_callback_save_model {
 };
 
 // Builder function
-std::unique_ptr<lbann_callback>
-build_callback_save_topk_models_from_pbuf(
+std::unique_ptr<callback_base>
+build_save_topk_models_callback_from_pbuf(
   const google::protobuf::Message&, lbann_summary*);
 
-} // namespace lbann
+} // namespace callback
+} // namespace lbann
 
 #endif // LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED
diff --git a/include/lbann/callbacks/callback_summary.hpp b/include/lbann/callbacks/callback_summary.hpp
index 010d0df9c02..c9c2512c3ef 100644
--- a/include/lbann/callbacks/callback_summary.hpp
+++ b/include/lbann/callbacks/callback_summary.hpp
@@ -23,7 +23,7 @@
 // implied. See the License for the specific language governing
 // permissions and limitations under the license.
 //
-// lbann_callback_summary .hpp .cpp - Callback hooks to summarize to Tensorboard
+// summary .hpp .cpp - Callback hooks to summarize to Tensorboard
 ////////////////////////////////////////////////////////////////////////////////
 
 #ifndef LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED
@@ -33,11 +33,12 @@
 #include "lbann/utils/summary.hpp"
 
 namespace lbann {
+namespace callback {
 
 /**
 * Summarize information to Tensorboard using LBANN's summary interface.
 */
-class lbann_callback_summary : public lbann_callback {
+class summary : public callback_base {
 public:
   /**
   * @param summarizer The summary object to write to; this callback takes
   *                   ownership of it.
   * @param batch_interval The frequency at which to summarize
   * @param mat_interval FIXME
   * @todo Document mat_interval parameter.
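   *
   * A minimal construction sketch (the log directory is hypothetical and
   * the lbann_summary constructor signature is assumed):
   * @code{.cpp}
   * // Summarize every step; summarize matrices every 25 steps.
   * auto* s = new lbann_summary("./summaries", comm);  // comm is an lbann_comm*
   * auto* cb = new lbann::callback::summary(s, 1, 25);
   * m->add_callback(cb);  // the callback takes ownership of s
   * @endcode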
*/ - lbann_callback_summary(lbann_summary *summarizer, int batch_interval = 1, + summary(lbann_summary *summarizer, int batch_interval = 1, int mat_interval = 25); - ~lbann_callback_summary() override; - lbann_callback_summary(const lbann_callback_summary&) = default; - lbann_callback_summary& operator=(const lbann_callback_summary&) = default; - lbann_callback_summary* copy() const override { - return new lbann_callback_summary(*this); + ~summary() override; + summary(const summary&) = default; + summary& operator=(const summary&) = default; + summary* copy() const override { + return new summary(*this); } void on_train_begin(model *m) override; void on_batch_end(model *m) override; @@ -67,10 +68,11 @@ class lbann_callback_summary : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_summary_from_pbuf( +std::unique_ptr +build_summary_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_SUMMARY_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_sync_layers.hpp b/include/lbann/callbacks/callback_sync_layers.hpp index 74c8b943d83..ebe377833c9 100644 --- a/include/lbann/callbacks/callback_sync_layers.hpp +++ b/include/lbann/callbacks/callback_sync_layers.hpp @@ -32,13 +32,14 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** Synchronize layers after forward and backward prop. * Additionally updates layer timing information to account for this. * Note that this callback should come before the summarizer callback to report * time correctly (otherwise it will be shifted by one mini-batch). */ -class lbann_callback_sync_layers : public lbann_callback { +class sync_layers : public callback_base { public: /** * @param sync_gpus The GPU stream will be synchronized. @@ -46,20 +47,20 @@ class lbann_callback_sync_layers : public lbann_callback { * @param only_input The only synchronization will be after the input layer in * forward prop. 
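   *
   * A minimal construction sketch (model::add_callback registration is
   * assumed):
   * @code{.cpp}
   * // Synchronize GPUs and MPI after every layer; register this callback
   * // before any summarizer callback so timings are attributed correctly.
   * auto* cb = new lbann::callback::sync_layers(true, true, false);
   * m->add_callback(cb);  // m is an lbann::model*
   * @endcode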
*/
-  lbann_callback_sync_layers(bool sync_gpus = true, bool sync_mpi = true,
+  sync_layers(bool sync_gpus = true, bool sync_mpi = true,
               bool only_input = false) :
-    lbann_callback(1), m_sync_gpus(sync_gpus), m_sync_mpi(sync_mpi),
+    callback_base(1), m_sync_gpus(sync_gpus), m_sync_mpi(sync_mpi),
     m_only_input(only_input) {}
-  lbann_callback_sync_layers(const lbann_callback_sync_layers&) = default;
-  lbann_callback_sync_layers& operator=(
-    const lbann_callback_sync_layers&) = default;
-  lbann_callback_sync_layers* copy() const override {
-    return new lbann_callback_sync_layers(*this);
+  sync_layers(const sync_layers&) = default;
+  sync_layers& operator=(
+    const sync_layers&) = default;
+  sync_layers* copy() const override {
+    return new sync_layers(*this);
   }
   std::string name() const override { return "sync_layers"; }
 
-  using lbann_callback::on_forward_prop_end;
-  using lbann_callback::on_backward_prop_end;
+  using callback_base::on_forward_prop_end;
+  using callback_base::on_backward_prop_end;
 
   void on_forward_prop_end(model *m, Layer *l) override;
   void on_backward_prop_end(model *m, Layer *l) override;
@@ -76,10 +77,11 @@ class lbann_callback_sync_layers : public lbann_callback {
 };
 
 // Builder function
-std::unique_ptr<lbann_callback>
-build_callback_sync_layers_from_pbuf(
+std::unique_ptr<callback_base>
+build_sync_layers_callback_from_pbuf(
   const google::protobuf::Message&, lbann_summary*);
 
-} // namespace lbann
+} // namespace callback
+} // namespace lbann
 
 #endif  // LBANN_CALLBACKS_CALLBACK_SYNC_LAYERS_HPP_INCLUDED
diff --git a/include/lbann/callbacks/callback_sync_selected.hpp b/include/lbann/callbacks/callback_sync_selected.hpp
index 33b140cfd27..efffc3d35ca 100644
--- a/include/lbann/callbacks/callback_sync_selected.hpp
+++ b/include/lbann/callbacks/callback_sync_selected.hpp
@@ -34,6 +34,7 @@
 #include 
 
 namespace lbann {
+namespace callback {
 
 /**
 * Synchronize at the beginning and the end of the propagation operation(s) of
@@ -45,9 +46,9 @@ namespace lbann {
 * comes after the local GPU synchronization and before the global MPI barrier
 * inserted at the end of the selected prop step(s).
 * Note that this callback should come before the summarizer callback
- * as the base callback lbann_callback_sync_layers requires.
+ * as the base callback sync_layers requires.
 */
-class lbann_callback_sync_selected : public lbann_callback_sync_layers {
+class sync_selected : public sync_layers {
 public:
  /// type of propagation to synchronize
  enum prop_t {Both = 0, Forward = 1, Backward = 2};
@@ -61,19 +62,19 @@ class lbann_callback_sync_selected : public lbann_callback_sync_layers {
  * @param async_gpus sets not to synchronize gpus. The default is false.
  * @param async_mpi sets not to synchronize mpi. The default is false.
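 *
 * A minimal construction sketch (assuming layers_t maps a layer name to
 * the prop_t to synchronize; the layer name is hypothetical):
 * @code{.cpp}
 * // Synchronize only the forward prop of the layer named "conv1".
 * lbann::callback::sync_selected::layers_t sel = {
 *   {"conv1", lbann::callback::sync_selected::Forward}};
 * auto* cb = new lbann::callback::sync_selected(sel);
 * m->add_callback(cb);  // m is an lbann::model*
 * @endcode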
*/
-  lbann_callback_sync_selected(const layers_t& layers,
+  sync_selected(const layers_t& layers,
                bool async_gpus = false, bool async_mpi = false);
 
-  lbann_callback_sync_selected(const lbann_callback_sync_selected&) = default;
+  sync_selected(const sync_selected&) = default;
 
-  lbann_callback_sync_selected& operator=(
-    const lbann_callback_sync_selected&) = default;
+  sync_selected& operator=(
+    const sync_selected&) = default;
 
-  lbann_callback_sync_selected* copy() const override {
-    return new lbann_callback_sync_selected(*this);
+  sync_selected* copy() const override {
+    return new sync_selected(*this);
   }
 
-  ~lbann_callback_sync_selected() override;
+  ~sync_selected() override;
 
   std::string name() const override { return "sync_selected"; }
   std::string get_description() const;
@@ -91,10 +92,10 @@ class lbann_callback_sync_selected : public lbann_callback_sync_layers {
   * Then, populate the layer pointers
   */
   void setup(model *m) override;
 
-  using lbann_callback::on_forward_prop_begin;
-  using lbann_callback::on_backward_prop_begin;
-  using lbann_callback_sync_layers::on_forward_prop_end;
-  using lbann_callback_sync_layers::on_backward_prop_end;
+  using callback_base::on_forward_prop_begin;
+  using callback_base::on_backward_prop_begin;
+  using sync_layers::on_forward_prop_end;
+  using sync_layers::on_backward_prop_end;
 
   /// Synchronize at the beginning of the forward prop of layer l
   void on_forward_prop_begin(model* m, Layer* l) override;
@@ -134,10 +135,11 @@ class lbann_callback_sync_selected : public lbann_callback_sync_layers {
 };
 
 // Builder function
-std::unique_ptr<lbann_callback>
-build_callback_sync_selected_from_pbuf(
+std::unique_ptr<callback_base>
+build_sync_selected_callback_from_pbuf(
   const google::protobuf::Message&, lbann_summary*);
 
-} // namespace lbann
+} // namespace callback
+} // namespace lbann
 
 #endif // LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED
diff --git a/include/lbann/callbacks/callback_timeline.hpp b/include/lbann/callbacks/callback_timeline.hpp
index 76566fc107b..ff7e99828f2 100644
--- a/include/lbann/callbacks/callback_timeline.hpp
+++ b/include/lbann/callbacks/callback_timeline.hpp
@@ -33,6 +33,7 @@
 #include "lbann/callbacks/callback.hpp"
 
 namespace lbann {
+namespace callback {
 
 /**
 * Record a timeline of training runtime on each rank and output it to a
 * text file in the given output directory (one file per rank).
 * Each line is a separate event, written as name:start-time:end-time.
 * Times are relative to the beginning of training.
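 *
 * A minimal construction sketch (the output directory is hypothetical):
 * @code{.cpp}
 * // Write one timeline file per rank under "./timings" after training;
 * // each event appears as a "name:start-time:end-time" line.
 * auto* cb = new lbann::callback::timeline("./timings");
 * m->add_callback(cb);  // m is an lbann::model*
 * @endcode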
*/ -class lbann_callback_timeline : public lbann_callback { +class timeline : public callback_base { public: - lbann_callback_timeline(std::string outdir) : lbann_callback(1), + timeline(std::string outdir) : callback_base(1), m_outdir(outdir) {} - lbann_callback_timeline(const lbann_callback_timeline&) = default; - lbann_callback_timeline& operator=(const lbann_callback_timeline&) = default; - lbann_callback_timeline* copy() const override { - return new lbann_callback_timeline(*this); + timeline(const timeline&) = default; + timeline& operator=(const timeline&) = default; + timeline* copy() const override { + return new timeline(*this); } std::string name() const override { return "timeline"; } void on_train_begin(model *m) override; void on_train_end(model *m) override; - using lbann_callback::on_forward_prop_begin; - using lbann_callback::on_forward_prop_end; - using lbann_callback::on_backward_prop_begin; - using lbann_callback::on_backward_prop_end; - using lbann_callback::on_optimize_begin; - using lbann_callback::on_optimize_end; + using callback_base::on_forward_prop_begin; + using callback_base::on_forward_prop_end; + using callback_base::on_backward_prop_begin; + using callback_base::on_backward_prop_end; + using callback_base::on_optimize_begin; + using callback_base::on_optimize_end; void on_forward_prop_begin(model *m, Layer *l) override; void on_forward_prop_end(model *m, Layer *l) override; @@ -88,10 +89,11 @@ class lbann_callback_timeline : public lbann_callback { }; // Builder function -std::unique_ptr -build_callback_timeline_from_pbuf( +std::unique_ptr +build_timeline_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_TIMELINE_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_timer.hpp b/include/lbann/callbacks/callback_timer.hpp index 90533933629..2864b075325 100644 --- a/include/lbann/callbacks/callback_timer.hpp +++ b/include/lbann/callbacks/callback_timer.hpp @@ -33,21 +33,22 @@ #include namespace lbann { +namespace callback { /** Record and report model timing results. * Reports the total time and mini-batch time statistics for training * epochs and for model evaluations. This reports times for the * master process in each model. */ -class lbann_callback_timer : public lbann_callback { +class timer : public callback_base { public: - lbann_callback_timer(lbann_summary *summarizer = nullptr) - : lbann_callback(1, summarizer) {} - lbann_callback_timer(const lbann_callback_timer&) = default; - lbann_callback_timer& operator=(const lbann_callback_timer&) = default; - lbann_callback_timer* copy() const override { - return new lbann_callback_timer(*this); + timer(lbann_summary *summarizer = nullptr) + : callback_base(1, summarizer) {} + timer(const timer&) = default; + timer& operator=(const timer&) = default; + timer* copy() const override { + return new timer(*this); } /** Start timing for a training epoch. 
*/ @@ -99,10 +100,11 @@ class lbann_callback_timer : public lbann_callback { }; // Builder function -std::unique_ptr<lbann_callback> -build_callback_timer_from_pbuf( +std::unique_ptr<callback_base> +build_timer_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); +} // namespace callback } // namespace lbann #endif // LBANN_CALLBACKS_CALLBACK_TIMER_HPP_INCLUDED diff --git a/include/lbann/callbacks/callback_variable_minibatch.hpp b/include/lbann/callbacks/callback_variable_minibatch.hpp index c05e5e1e4b1..d8a6a09c8ff 100644 --- a/include/lbann/callbacks/callback_variable_minibatch.hpp +++ b/include/lbann/callbacks/callback_variable_minibatch.hpp @@ -32,19 +32,20 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** * Support changing the mini-batch size on different schedules. * Implementations should override the abstract methods to define * concrete schedules. */ -class lbann_callback_variable_minibatch : public lbann_callback { +class variable_minibatch : public callback_base { public: - lbann_callback_variable_minibatch(int starting_mbsize); - lbann_callback_variable_minibatch( - const lbann_callback_variable_minibatch&) = default; - lbann_callback_variable_minibatch& operator=( - const lbann_callback_variable_minibatch&) = default; + variable_minibatch(int starting_mbsize); + variable_minibatch( + const variable_minibatch&) = default; + variable_minibatch& operator=( + const variable_minibatch&) = default; /// Set the initial mini-batch size. void on_train_begin(model *m) override; /// Potentially change the mini-batch size. @@ -89,15 +90,15 @@ class lbann_callback_variable_minibatch : public lbann_callback { * Double the mini-batch size every set number of epochs. * Also doubles the learning rate. */ -class lbann_callback_step_minibatch : public lbann_callback_variable_minibatch { +class step_minibatch : public variable_minibatch { public: - lbann_callback_step_minibatch(int starting_mbsize, int step, + step_minibatch(int starting_mbsize, int step, int ramp_time = 0); - lbann_callback_step_minibatch(const lbann_callback_step_minibatch&) = default; - lbann_callback_step_minibatch& operator=( - const lbann_callback_step_minibatch&) = delete; - lbann_callback_step_minibatch* copy() const override { - return new lbann_callback_step_minibatch(*this); + step_minibatch(const step_minibatch&) = default; + step_minibatch& operator=( + const step_minibatch&) = delete; + step_minibatch* copy() const override { + return new step_minibatch(*this); } std::string name() const override { return "step minibatch"; } protected: @@ -111,11 +112,11 @@ class lbann_callback_step_minibatch : public lbann_callback_variable_minibatch { }; // Builder function -std::unique_ptr<lbann_callback> -build_callback_step_minibatch_from_pbuf( +std::unique_ptr<callback_base> +build_step_minibatch_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -class lbann_callback_minibatch_schedule : public lbann_callback_variable_minibatch { +class minibatch_schedule : public variable_minibatch { public: /// Represents a step in a schedule of mini-batch sizes.
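/// For example (illustrative values; the fields follow the constructor
/// below), a step with epoch=10, mbsize=512, lr=0.2, ramp_time=5 switches
/// to a mini-batch size of 512 and a learning rate of 0.2 at epoch 10,
/// ramping the change in over 5 epochs.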
struct minibatch_step { @@ -131,14 +132,14 @@ class lbann_callback_minibatch_schedule : public lbann_callback_variable_minibat epoch(_epoch), mbsize(_mbsize), lr(_lr), ramp_time(_ramp_time) {} }; - lbann_callback_minibatch_schedule( + minibatch_schedule( int starting_mbsize, std::vector<minibatch_step> steps); - lbann_callback_minibatch_schedule( - const lbann_callback_minibatch_schedule&) = default; - lbann_callback_minibatch_schedule& operator=( - const lbann_callback_minibatch_schedule&) = delete; - lbann_callback_minibatch_schedule* copy() const override { - return new lbann_callback_minibatch_schedule(*this); + minibatch_schedule( + const minibatch_schedule&) = default; + minibatch_schedule& operator=( + const minibatch_schedule&) = delete; + minibatch_schedule* copy() const override { + return new minibatch_schedule(*this); } std::string name() const override { return "minibatch schedule"; } protected: @@ -149,10 +150,11 @@ class lbann_callback_minibatch_schedule : public lbann_callback_variable_minibat }; // Builder function -std::unique_ptr<lbann_callback> -build_callback_minibatch_schedule_from_pbuf( +std::unique_ptr<callback_base> +build_minibatch_schedule_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_VARIABLE_MINIBATCH_HPP_INCLUDED diff --git a/include/lbann/callbacks/profiler.hpp b/include/lbann/callbacks/profiler.hpp index dc62c430335..f118b8d54c9 100644 --- a/include/lbann/callbacks/profiler.hpp +++ b/include/lbann/callbacks/profiler.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_timer .hpp .cpp - Callback hooks to time training +// timer .hpp .cpp - Callback hooks to time training //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_PROFILER_HPP_INCLUDED @@ -32,16 +32,17 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { +namespace callback { /** */ -class lbann_callback_profiler : public lbann_callback { +class profiler : public callback_base { public: - lbann_callback_profiler(bool sync = false, bool skip_init = false); - lbann_callback_profiler(const lbann_callback_profiler&) = default; - lbann_callback_profiler& operator=(const lbann_callback_profiler&) = default; - lbann_callback_profiler* copy() const override { - return new lbann_callback_profiler(*this); + profiler(bool sync = false, bool skip_init = false); + profiler(const profiler&) = default; + profiler& operator=(const profiler&) = default; + profiler* copy() const override { + return new profiler(*this); } void on_epoch_begin(model *m) override; void on_epoch_end(model *m) override; @@ -80,10 +81,11 @@ class lbann_callback_profiler : public lbann_callback { }; // Builder function -std::unique_ptr<lbann_callback> -build_callback_profiler_from_pbuf( +std::unique_ptr<callback_base> +build_profiler_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); -} // namespace lbann +} // namespace callback +} // namespace lbann #endif // LBANN_CALLBACKS_PROFILER_HPP_INCLUDED diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index e3539d99638..da8c53c0243 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -208,7 +208,7 @@ class generic_input_layer : public io_layer { mini_batch_size = get_current_mini_batch_size(); int effective_mini_batch_size =
mini_batch_size; for (auto&& cb : this->m_model->get_callbacks()) { - if (dynamic_cast<lbann_callback_imcomm*>(cb) != nullptr) { + if (dynamic_cast<callback::imcomm*>(cb) != nullptr) { effective_mini_batch_size = get_current_global_mini_batch_size(); break; } } diff --git a/include/lbann/layers/layer.hpp b/include/lbann/layers/layer.hpp index 6ed9ecb096b..8ae20b69b57 100644 --- a/include/lbann/layers/layer.hpp +++ b/include/lbann/layers/layer.hpp @@ -44,7 +44,10 @@ namespace lbann { // Forward declarations class model; class weights; -class lbann_callback_sync_layers; +namespace callback { +class sync_layers; +class sync_selected; +} // namespace callback /** * @brief Neural network tensor operation. @@ -64,8 +67,8 @@ class lbann_callback_sync_layers; * the weights. */ class Layer { - friend class lbann_callback_sync_layers; - friend class lbann_callback_sync_selected; + friend class callback::sync_layers; + friend class callback::sync_selected; public: diff --git a/include/lbann/models/model.hpp b/include/lbann/models/model.hpp index 7e8671a5289..eecabcf5d59 100644 --- a/include/lbann/models/model.hpp +++ b/include/lbann/models/model.hpp @@ -47,7 +47,7 @@ namespace lbann { // Forward declarations -class lbann_callback; +class callback_base; /** @brief Abstract base class for neural network models. */ class model { @@ -120,7 +120,7 @@ class model { std::vector<weights*> get_weights(); /** @brief Get the list of callbacks for the model. */ - virtual std::vector<lbann_callback*>& get_callbacks() { + virtual std::vector<callback_base*>& get_callbacks() { return m_callbacks; } @@ -190,7 +190,7 @@ class model { void add_weights(weights *w); /** @brief Register a new callback for the model. */ - void add_callback(lbann_callback *cb); + void add_callback(callback_base *cb); /** @brief Register a new metric for the model. */ void add_metric(metric *m); @@ -489,7 +489,7 @@ class model { std::vector<metric*> m_metrics; /** @brief Current callbacks to process. */ - std::vector<lbann_callback*> m_callbacks; + std::vector<callback_base*> m_callbacks; /** @brief Threads available for I/O */ std::shared_ptr<thread_pool> m_io_thread_pool; diff --git a/include/lbann/optimizers/adam.hpp b/include/lbann/optimizers/adam.hpp index 696c8416599..020909aeaa1 100644 --- a/include/lbann/optimizers/adam.hpp +++ b/include/lbann/optimizers/adam.hpp @@ -30,6 +30,9 @@ #include "lbann/optimizers/optimizer.hpp" namespace lbann { +namespace callback { +class perturb_adam; +} // namespace callback /** @brief Adam optimizer. * @@ -141,7 +144,7 @@ class adam : public optimizer { std::unique_ptr<AbsDistMat> m_moment2; /** Hyperparameter exploration. */ - friend class lbann_callback_perturb_adam; + friend class callback::perturb_adam; /** CPU implementation of optimization step. */ void step_compute_cpu(AbsDistMat& values, const AbsDistMat& gradient); diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index 59b05bd24e2..ee51a2116f5 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -61,7 +61,7 @@ weights* construct_weights(lbann_comm* comm, const lbann_data::Weights& proto_weights); /** Construct a callback specified with prototext.
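 *
 * A minimal usage sketch (the message object and the null summarizer are
 * assumptions for illustration; only the signature below is from this
 * header):
 * @code
 * std::unique_ptr<callback_base> cb = construct_callback(proto_cb, nullptr);
 * m->add_callback(cb.release()); // model::add_callback takes a raw pointer
 * @endcode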
*/ -std::unique_ptr<lbann_callback> +std::unique_ptr<callback_base> construct_callback(const google::protobuf::Message& proto_cb, lbann_summary* summarizer); diff --git a/model_zoo/lbann2.cpp b/model_zoo/lbann2.cpp index b72ddd2a38f..0d9496873a0 100644 --- a/model_zoo/lbann2.cpp +++ b/model_zoo/lbann2.cpp @@ -63,7 +63,8 @@ int main(int argc, char *argv[]) { } // Load layer weights from checkpoint if checkpoint directory given if(opts->has_string("ckpt_dir")){ - lbann_callback_save_model::load_model_weights(opts->get_string("ckpt_dir"), model_1.get()); + callback::save_model::load_model_weights(opts->get_string("ckpt_dir"), + model_1.get()); } // Train model if (master) { diff --git a/model_zoo/lbann_inf.cpp b/model_zoo/lbann_inf.cpp index 62f5f764f73..a32185e439a 100644 --- a/model_zoo/lbann_inf.cpp +++ b/model_zoo/lbann_inf.cpp @@ -64,10 +64,11 @@ int main(int argc, char *argv[]) { // Load layer weights from checkpoint if checkpoint directory given if(opts->has_string("ckpt_dir")){ for(auto&& m : models) { - bool loaded = lbann_callback_save_model::load_model_weights(opts->get_string("ckpt_dir"), - m.get(), - opts->get_bool("ckptdir_is_fullpath")); - if(!loaded) LBANN_ERROR("Unable to reload model"); + bool loaded = callback::save_model::load_model_weights( + opts->get_string("ckpt_dir"), + m.get(), + opts->get_bool("ckptdir_is_fullpath")); + if(!loaded) LBANN_ERROR("Unable to reload model"); } }else { LBANN_ERROR("Unable to reload model"); diff --git a/src/callbacks/callback_check_dataset.cpp b/src/callbacks/callback_check_dataset.cpp index 5bf0702b54c..51c0254a30b 100644 --- a/src/callbacks/callback_check_dataset.cpp +++ b/src/callbacks/callback_check_dataset.cpp @@ -31,8 +31,9 @@ #include namespace lbann { +namespace callback { -void lbann_callback_check_dataset::add_to_set(model *m, Layer *l, int64_t step, std::set<long>& set) { +void check_dataset::add_to_set(model *m, Layer *l, int64_t step, std::set<long>& set) { if (!dynamic_cast<generic_input_layer*>(l)) { return; } @@ -57,11 +58,11 @@ void lbann_callback_check_dataset::add_to_set(model *m, Layer *l, int64_t step, } } -void lbann_callback_check_dataset::on_forward_prop_end(model *m, Layer *l) { +void check_dataset::on_forward_prop_end(model *m, Layer *l) { add_to_set(m, l, m->get_step(), training_set); } -void lbann_callback_check_dataset::on_evaluate_forward_prop_end(model *m, Layer *l) { +void check_dataset::on_evaluate_forward_prop_end(model *m, Layer *l) { switch(m->get_execution_mode()) { case execution_mode::validation: add_to_set(m, l, m->get_step(), validation_set); @@ -70,11 +71,11 @@ void lbann_callback_check_dataset::on_evaluate_forward_prop_end(model *m, Layer add_to_set(m, l, m->get_step(), testing_set); break; default: - throw lbann_exception("lbann_callback_check_dataset: invalid execution phase"); + LBANN_ERROR("check_dataset: invalid execution phase"); } } -void lbann_callback_check_dataset::on_epoch_end(model *m) { +void check_dataset::on_epoch_end(model *m) { lbann_comm* comm = m->get_comm(); std::cout << "Training [" << comm->get_rank_in_trainer() << "] : I have processed " << training_set.size() << " elements" << std::endl; @@ -142,7 +143,7 @@ void lbann_callback_check_dataset::on_epoch_end(model *m) { training_set.clear(); } -void lbann_callback_check_dataset::on_validation_end(model *m) { +void check_dataset::on_validation_end(model *m) { std::cout << "Validation [" << m->get_comm()->get_rank_in_trainer() << "] : I have processed " << validation_set.size() << " elements" << std::endl; #if 0 std::cout << "Validation [" << m->get_comm()->get_rank_in_trainer() << "] "; @@
-154,9 +155,10 @@ void lbann_callback_check_dataset::on_validation_end(model *m) { validation_set.clear(); } -void lbann_callback_check_dataset::on_test_end(model *m) { +void check_dataset::on_test_end(model *m) { std::cout << "Testing [" << m->get_comm()->get_rank_in_trainer() << "] : I have processed " << testing_set.size() << " elements" << std::endl; testing_set.clear(); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_check_gradients.cpp b/src/callbacks/callback_check_gradients.cpp index 1e9cd248696..6ae9000804a 100644 --- a/src/callbacks/callback_check_gradients.cpp +++ b/src/callbacks/callback_check_gradients.cpp @@ -31,6 +31,7 @@ #include "callbacks.pb.h" namespace lbann { +namespace callback { namespace { @@ -57,15 +58,15 @@ DataType compute_objective_function(model& m) { } // namespace -lbann_callback_check_gradients - ::lbann_callback_check_gradients(DataType step_size, +check_gradients + ::check_gradients(DataType step_size, bool verbose, bool error_on_failure) : m_step_size(step_size), m_verbose(verbose), m_error_on_failure(error_on_failure) {} -void lbann_callback_check_gradients::on_test_end(model *m) { +void check_gradients::on_test_end(model *m) { // Get objects from model lbann_comm *comm = m->get_comm(); @@ -229,14 +230,15 @@ void lbann_callback_check_gradients::on_test_end(model *m) { } // Builder function -std::unique_ptr -build_callback_check_gradients_from_pbuf( +std::unique_ptr +build_check_gradients_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.step_size(), + return make_unique(params.step_size(), params.verbose(), params.error_on_failure()); } +} // namespace callback } // namespace lbann diff --git a/src/callbacks/callback_check_init.cpp b/src/callbacks/callback_check_init.cpp index 2d50f07dad1..73c4784c88c 100644 --- a/src/callbacks/callback_check_init.cpp +++ b/src/callbacks/callback_check_init.cpp @@ -23,15 +23,16 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_check_init .hpp .cpp - Check multi-model init +// check_init .hpp .cpp - Check multi-model init //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_check_init.hpp" #include "lbann/utils/exception.hpp" namespace lbann { +namespace callback { -void lbann_callback_check_init::on_train_begin(model *m) { +void check_init::on_train_begin(model *m) { // Skip after the first epoch. 
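 // (The weight comparison below is only meaningful before any optimization
 // step has modified the initial values, so later epochs return
 // immediately.)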
if (m->get_epoch() != 0) { return; @@ -71,7 +72,7 @@ void lbann_callback_check_init::on_train_begin(model *m) { } } -bool lbann_callback_check_init::check_equal(const AbsMat& x, const AbsMat& y) const { +bool check_init::check_equal(const AbsMat& x, const AbsMat& y) const { const El::Int height = x.Height(); const El::Int width = x.Width(); if (height != y.Height() || width != y.Width() || x.LDim() != y.LDim()) { @@ -87,4 +88,5 @@ bool lbann_callback_check_init::check_equal(const AbsMat& x, const AbsMat& y) co return true; } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_check_metric.cpp b/src/callbacks/callback_check_metric.cpp index a0919854e68..c9724899378 100644 --- a/src/callbacks/callback_check_metric.cpp +++ b/src/callbacks/callback_check_metric.cpp @@ -29,8 +29,9 @@ #include "lbann/proto/factories.hpp" namespace lbann { +namespace callback { -lbann_callback_check_metric::lbann_callback_check_metric(std::string metric_name, +check_metric::check_metric(std::string metric_name, std::set modes, EvalType lower_bound, EvalType upper_bound, @@ -51,7 +52,7 @@ lbann_callback_check_metric::lbann_callback_check_metric(std::string metric_name } -void lbann_callback_check_metric::check_metric(const model& m) const { +void check_metric::do_check_metric(const model& m) const { std::stringstream err; // Return immediately if execution mode is invalid @@ -88,18 +89,19 @@ void lbann_callback_check_metric::check_metric(const model& m) const { } -std::unique_ptr -build_callback_check_metric_from_pbuf( +std::unique_ptr +build_check_metric_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); const auto& modes = parse_set(params.execution_modes()); - return make_unique(params.metric(), + return make_unique(params.metric(), modes, params.lower_bound(), params.upper_bound(), params.error_on_failure()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_checknan.cpp b/src/callbacks/callback_checknan.cpp index 143ec4ad776..b52f8db0a96 100644 --- a/src/callbacks/callback_checknan.cpp +++ b/src/callbacks/callback_checknan.cpp @@ -28,6 +28,7 @@ #include "lbann/utils/exception.hpp" namespace lbann { +namespace callback { namespace { @@ -117,7 +118,7 @@ void dump_network(model *m) { } // namespace -void lbann_callback_checknan::on_forward_prop_end(model *m, Layer *l) { +void check_nan::on_forward_prop_end(model *m, Layer *l) { std::stringstream err; const auto& num_outputs = l->get_num_children(); for (int i = 0; i < num_outputs; ++i) { @@ -144,7 +145,7 @@ void lbann_callback_checknan::on_forward_prop_end(model *m, Layer *l) { } } -void lbann_callback_checknan::on_backward_prop_end(model *m, Layer *l) { +void check_nan::on_backward_prop_end(model *m, Layer *l) { std::stringstream err; const auto& num_inputs = l->get_num_parents(); for (int i = 0; i < num_inputs; ++i) { @@ -171,7 +172,7 @@ void lbann_callback_checknan::on_backward_prop_end(model *m, Layer *l) { } } -void lbann_callback_checknan::on_backward_prop_end(model *m) { +void check_nan::on_backward_prop_end(model *m) { std::stringstream err; for (weights *w : m->get_weights()) { auto* opt = w->get_optimizer(); @@ -196,7 +197,7 @@ void lbann_callback_checknan::on_backward_prop_end(model *m) { } } -void lbann_callback_checknan::on_batch_end(model *m) { +void check_nan::on_batch_end(model *m) { std::stringstream err; for (weights *w : m->get_weights()) { El::Int row, col; 
@@ -218,4 +219,5 @@ void lbann_callback_checknan::on_batch_end(model *m) { } } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_checkpoint.cpp b/src/callbacks/callback_checkpoint.cpp index 0aac71d4633..385f7337e95 100644 --- a/src/callbacks/callback_checkpoint.cpp +++ b/src/callbacks/callback_checkpoint.cpp @@ -23,45 +23,46 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_checkpoint .hpp .cpp - Callback hooks to checkpoint model +// checkpoint .hpp .cpp - Callback hooks to checkpoint model //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_checkpoint.hpp" namespace lbann { +namespace callback { // Load from checkpoint occurs during setup callbacks -void lbann_callback_checkpoint::setup(model *m) { +void checkpoint::setup(model *m) { p.set_cb_type(callback_type::invalid); restart(m); } // Interval defined with checkpoint_epochs or ckpt_dist_epochs -void lbann_callback_checkpoint::on_epoch_end(model *m) { +void checkpoint::on_epoch_end(model *m) { p.set_cb_type(callback_type::epoch); if(need_checkpoint(m)){ - checkpoint(m); + do_checkpoint(m); } p.set_cb_type(callback_type::invalid); } // Interval defined with checkpoint_epochs or ckpt_dist_epochs -void lbann_callback_checkpoint::on_validation_end(model *m) { +void checkpoint::on_validation_end(model *m) { p.set_cb_type(callback_type::validation); if(need_checkpoint(m)){ - checkpoint(m); + do_checkpoint(m); } p.set_cb_type(callback_type::invalid); } // Interval defined with checkpoint_steps or ckpt_dist_steps -void lbann_callback_checkpoint::on_batch_end(model *m) { +void checkpoint::on_batch_end(model *m) { p.set_cb_type(callback_type::batch); if(need_checkpoint(m)){ - checkpoint(m); + do_checkpoint(m); } p.set_cb_type(callback_type::invalid); } // Decide if we need to trigger a checkpoint for either mode, based on prototext defined intervals -bool lbann_callback_checkpoint::need_checkpoint(model *m) { +bool checkpoint::need_checkpoint(model *m) { /* TODO: since we're using clocks, this requires a bcast for each call, * we could use number of samples processed to make a local decision */ // if none of our checkpoint conditions are set, assume we're not checkpointing @@ -114,7 +115,7 @@ bool lbann_callback_checkpoint::need_checkpoint(model *m) { } // Checkpoint Shared/Distributed -bool lbann_callback_checkpoint::checkpoint(model *m) { +bool checkpoint::do_checkpoint(model *m) { // if the checkpoint directory is not defined, bail if (m_checkpoint_dir.length() == 0 && m_per_rank_dir.length() == 0) { return false; @@ -204,7 +205,7 @@ bool lbann_callback_checkpoint::checkpoint(model *m) { } // Restart Shared/Distributed -bool lbann_callback_checkpoint::restart(model *m) { +bool checkpoint::restart(model *m) { // if the checkpoint directory is not defined, bail if (m_checkpoint_dir.length() == 0 && m_per_rank_dir.length() == 0) { return false; @@ -322,12 +323,12 @@ bool lbann_callback_checkpoint::restart(model *m) { return true; } -std::unique_ptr -build_callback_checkpoint_from_pbuf( +std::unique_ptr +build_checkpoint_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.checkpoint_dir(), + return make_unique(params.checkpoint_dir(), params.checkpoint_epochs(), params.checkpoint_steps(), params.checkpoint_secs(), @@ -336,4 +337,5 @@ 
build_callback_checkpoint_from_pbuf( params.ckpt_dist_steps()); } -} +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_checksmall.cpp b/src/callbacks/callback_checksmall.cpp index a4fc8e93fcc..2daf2d00aa8 100644 --- a/src/callbacks/callback_checksmall.cpp +++ b/src/callbacks/callback_checksmall.cpp @@ -28,8 +28,9 @@ #include "lbann/utils/exception.hpp" namespace lbann { +namespace callback { -void lbann_callback_checksmall::on_forward_prop_end(model *m, Layer *l) { +void check_small::on_forward_prop_end(model *m, Layer *l) { const AbsDistMat& acts = l->get_activations(); if (!is_good(acts)) { std::stringstream ss; @@ -41,7 +42,7 @@ void lbann_callback_checksmall::on_forward_prop_end(model *m, Layer *l) { } } -void lbann_callback_checksmall::on_backward_prop_end(model *m) { +void check_small::on_backward_prop_end(model *m) { for (weights *w : m->get_weights()) { optimizer *opt = w->get_optimizer(); if (opt != nullptr && !is_good(opt->get_gradient())) { @@ -55,7 +56,7 @@ void lbann_callback_checksmall::on_backward_prop_end(model *m) { } } -void lbann_callback_checksmall::on_batch_end(model *m) { +void check_small::on_batch_end(model *m) { for (weights *w : m->get_weights()) { if (!is_good(w->get_values())) { std::stringstream ss; @@ -68,7 +69,7 @@ void lbann_callback_checksmall::on_batch_end(model *m) { } } -bool lbann_callback_checksmall::is_good(const AbsDistMat& m) { +bool check_small::is_good(const AbsDistMat& m) { const AbsMat& local_mat = m.LockedMatrix(); const El::Int height = local_mat.Height(); const El::Int width = local_mat.Width(); @@ -85,7 +86,8 @@ bool lbann_callback_checksmall::is_good(const AbsDistMat& m) { return true; } -const DataType lbann_callback_checksmall::m_threshold +const DataType check_small::m_threshold = std::sqrt(std::numeric_limits::min()); -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_confusion_matrix.cpp b/src/callbacks/callback_confusion_matrix.cpp index 42d37825b36..9b4eddd0067 100644 --- a/src/callbacks/callback_confusion_matrix.cpp +++ b/src/callbacks/callback_confusion_matrix.cpp @@ -27,21 +27,22 @@ #include "lbann/callbacks/callback_confusion_matrix.hpp" namespace lbann { +namespace callback { // --------------------------------------------------------- // Constructors // --------------------------------------------------------- -lbann_callback_confusion_matrix::lbann_callback_confusion_matrix(std::string prediction_layer, +confusion_matrix::confusion_matrix(std::string prediction_layer, std::string label_layer, std::string prefix) - : lbann_callback(1, nullptr), + : callback_base(1, nullptr), m_prediction_layer(std::move(prediction_layer)), m_label_layer(std::move(label_layer)), m_prefix(std::move(prefix)) {} -lbann_callback_confusion_matrix::lbann_callback_confusion_matrix(const lbann_callback_confusion_matrix& other) - : lbann_callback(other), +confusion_matrix::confusion_matrix(const confusion_matrix& other) + : callback_base(other), m_prediction_layer(other.m_prediction_layer), m_label_layer(other.m_label_layer), m_prefix(other.m_prefix), @@ -49,8 +50,8 @@ lbann_callback_confusion_matrix::lbann_callback_confusion_matrix(const lbann_cal m_predictions_v(other.m_predictions_v ? other.m_predictions_v->Copy() : nullptr), m_labels_v(other.m_labels_v ? 
other.m_labels_v->Copy() : nullptr) {} -lbann_callback_confusion_matrix& lbann_callback_confusion_matrix::operator=(const lbann_callback_confusion_matrix& other) { - lbann_callback::operator=(other); +confusion_matrix& confusion_matrix::operator=(const confusion_matrix& other) { + callback_base::operator=(other); m_prediction_layer = other.m_prediction_layer; m_label_layer = other.m_label_layer; m_prefix = other.m_prefix; @@ -64,8 +65,8 @@ lbann_callback_confusion_matrix& lbann_callback_confusion_matrix::operator=(cons // Setup // --------------------------------------------------------- -void lbann_callback_confusion_matrix::setup(model* m) { - lbann_callback::setup(m); +void confusion_matrix::setup(model* m) { + callback_base::setup(m); // Initialize matrix views/copies const auto& predictions = get_predictions(*m); @@ -93,7 +94,7 @@ void lbann_callback_confusion_matrix::setup(model* m) { // Matrix access functions // --------------------------------------------------------- -const AbsDistMat& lbann_callback_confusion_matrix::get_predictions(const model& m) const { +const AbsDistMat& confusion_matrix::get_predictions(const model& m) const { for (const auto* l : m.get_layers()) { if (l->get_name() == m_prediction_layer) { return l->get_activations(); @@ -106,7 +107,7 @@ const AbsDistMat& lbann_callback_confusion_matrix::get_predictions(const model& return m.get_layers()[0]->get_activations(); } -const AbsDistMat& lbann_callback_confusion_matrix::get_labels(const model& m) const { +const AbsDistMat& confusion_matrix::get_labels(const model& m) const { for (const auto* l : m.get_layers()) { if (l->get_name() == m_label_layer) { return l->get_activations(); @@ -123,13 +124,13 @@ const AbsDistMat& lbann_callback_confusion_matrix::get_labels(const model& m) co // Count management functions // --------------------------------------------------------- -void lbann_callback_confusion_matrix::reset_counts(const model& m) { +void confusion_matrix::reset_counts(const model& m) { auto& counts = m_counts[m.get_execution_mode()]; const auto& num_classes = get_predictions(m).Height(); counts.assign(num_classes * num_classes, 0); } -void lbann_callback_confusion_matrix::update_counts(const model& m) { +void confusion_matrix::update_counts(const model& m) { constexpr DataType zero = 0; // Get predictions @@ -177,7 +178,7 @@ void lbann_callback_confusion_matrix::update_counts(const model& m) { } -void lbann_callback_confusion_matrix::save_confusion_matrix(const model& m) { +void confusion_matrix::save_confusion_matrix(const model& m) { // Get counts const auto& mode = m.get_execution_mode(); @@ -232,14 +233,15 @@ void lbann_callback_confusion_matrix::save_confusion_matrix(const model& m) { } -std::unique_ptr -build_callback_confusion_matrix_from_pbuf( +std::unique_ptr +build_confusion_matrix_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.prediction(), + return make_unique(params.prediction(), params.label(), params.prefix()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_debug.cpp b/src/callbacks/callback_debug.cpp index 25382a361d2..bee2fa454a8 100644 --- a/src/callbacks/callback_debug.cpp +++ b/src/callbacks/callback_debug.cpp @@ -32,6 +32,7 @@ #include "callbacks.pb.h" namespace lbann { +namespace callback { namespace { @@ -74,7 +75,7 @@ std::string batch_step_string(const model& m) { } // namespace // Status updates for batch 
beginnings/endings -void lbann_callback_debug::on_batch_begin(model *m) { +void debug::on_batch_begin(model *m) { if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " @@ -82,7 +83,7 @@ void lbann_callback_debug::on_batch_begin(model *m) { std::cerr << msg.str(); } } -void lbann_callback_debug::on_batch_end(model *m) { +void debug::on_batch_end(model *m) { if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " @@ -90,15 +91,15 @@ void lbann_callback_debug::on_batch_end(model *m) { std::cerr << msg.str(); } } -void lbann_callback_debug::on_batch_evaluate_begin(model *m) { +void debug::on_batch_evaluate_begin(model *m) { on_batch_begin(m); } -void lbann_callback_debug::on_batch_evaluate_end(model *m) { +void debug::on_batch_evaluate_end(model *m) { on_batch_end(m); } // Status updates for beginning/ending of layer forward/backward prop -void lbann_callback_debug::on_forward_prop_begin(model *m, Layer *l) { +void debug::on_forward_prop_begin(model *m, Layer *l) { if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << layer_string(*l) @@ -107,7 +108,7 @@ void lbann_callback_debug::on_forward_prop_begin(model *m, Layer *l) { std::cerr << msg.str(); } } -void lbann_callback_debug::on_forward_prop_end(model *m, Layer *l) { +void debug::on_forward_prop_end(model *m, Layer *l) { if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << layer_string(*l) @@ -116,7 +117,7 @@ void lbann_callback_debug::on_forward_prop_end(model *m, Layer *l) { std::cerr << msg.str(); } } -void lbann_callback_debug::on_backward_prop_begin(model *m, Layer *l) { +void debug::on_backward_prop_begin(model *m, Layer *l) { if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << layer_string(*l) @@ -125,7 +126,7 @@ void lbann_callback_debug::on_backward_prop_begin(model *m, Layer *l) { std::cerr << msg.str(); } } -void lbann_callback_debug::on_backward_prop_end(model *m, Layer *l) { +void debug::on_backward_prop_end(model *m, Layer *l) { if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << layer_string(*l) @@ -134,22 +135,22 @@ void lbann_callback_debug::on_backward_prop_end(model *m, Layer *l) { std::cerr << msg.str(); } } -void lbann_callback_debug::on_evaluate_forward_prop_begin(model *m, Layer *l) { +void debug::on_evaluate_forward_prop_begin(model *m, Layer *l) { on_forward_prop_begin(m, l); } -void lbann_callback_debug::on_evaluate_forward_prop_end(model *m, Layer *l) { +void debug::on_evaluate_forward_prop_end(model *m, Layer *l) { on_backward_prop_end(m, l); } // Status updates for optimization step -void lbann_callback_debug::on_optimize_begin(model *m, weights *w) { +void debug::on_optimize_begin(model *m, weights *w) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << weights_string(*w) << " is starting optimization step for " << batch_step_string(*m) << std::endl; std::cerr << msg.str(); } -void lbann_callback_debug::on_optimize_end(model *m, weights *w) { +void debug::on_optimize_end(model *m, weights *w) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << weights_string(*w) << " is ending 
optimization step for " << batch_step_string(*m) @@ -157,14 +158,15 @@ void lbann_callback_debug::on_optimize_end(model *m, weights *w) { std::cerr << msg.str(); } -std::unique_ptr -build_callback_debug_from_pbuf(const google::protobuf::Message& proto_msg, +std::unique_ptr +build_debug_callback_from_pbuf(const google::protobuf::Message& proto_msg, lbann_summary* summarizer) { const auto& params = dynamic_cast(proto_msg); const auto& modes = parse_set(params.phase()); - return make_unique(modes, summarizer); + return make_unique(modes, summarizer); } +} // namespace callback } // namespace lbann diff --git a/src/callbacks/callback_debug_io.cpp b/src/callbacks/callback_debug_io.cpp index 9f8dd5e2530..400460bb5a0 100644 --- a/src/callbacks/callback_debug_io.cpp +++ b/src/callbacks/callback_debug_io.cpp @@ -23,21 +23,22 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_debug .hpp .cpp - Callback hooks to debug LBANN +// debug .hpp .cpp - Callback hooks to debug LBANN /////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_debug_io.hpp" namespace lbann { +namespace callback { /// BVE FIXME @todo The use of execution_mode invalid needs to be reconsidered -void lbann_callback_debug_io::on_epoch_begin(model *m) { +void debug_io::on_epoch_begin(model *m) { if(m_debug_phase == execution_mode::invalid || m_debug_phase == execution_mode::training) { print_phase_start(m, execution_mode::training); } } -void lbann_callback_debug_io::on_forward_prop_begin(model *m, Layer *l) { +void debug_io::on_forward_prop_begin(model *m, Layer *l) { auto *input = dynamic_cast(l); if (input == nullptr || m_debug_lvl < 1) { return; @@ -53,7 +54,7 @@ void lbann_callback_debug_io::on_forward_prop_begin(model *m, Layer *l) { /// I think that the reset mini batch index may be off } -void lbann_callback_debug_io::print_fp_start(model *m, generic_input_layer *input) { +void debug_io::print_fp_start(model *m, generic_input_layer *input) { const auto& step = m->get_step(); std::cout << "[" << m->get_comm()->get_trainer_rank() << "." 
<< m->get_comm()->get_rank_in_trainer() @@ -73,7 +74,7 @@ void lbann_callback_debug_io::print_fp_start(model *m, generic_input_layer *inpu } // 179i @ 300s (=5m*60s) + 1i @ 100s (=5m*45s):offset <- num models -void lbann_callback_debug_io::print_phase_start(model *m, execution_mode mode) { +void debug_io::print_phase_start(model *m, execution_mode mode) { // Get data reader from first input layer in model generic_data_reader* data_reader = nullptr; @@ -124,13 +125,13 @@ void lbann_callback_debug_io::print_phase_start(model *m, execution_mode mode) { //////////////////////////////////////////////////////////////////////////////// // Evaluation phase debugging //////////////////////////////////////////////////////////////////////////////// -void lbann_callback_debug_io::on_validation_begin(model *m) { +void debug_io::on_validation_begin(model *m) { if(m_debug_phase == execution_mode::invalid || m_debug_phase == execution_mode::validation) { print_phase_start(m, execution_mode::validation); } } -void lbann_callback_debug_io::on_evaluate_forward_prop_begin(model *m, Layer *l) { +void debug_io::on_evaluate_forward_prop_begin(model *m, Layer *l) { auto *input = dynamic_cast(l); if (input == nullptr || m_debug_lvl < 1) { return; @@ -146,14 +147,14 @@ void lbann_callback_debug_io::on_evaluate_forward_prop_begin(model *m, Layer *l) //////////////////////////////////////////////////////////////////////////////// // Testing phase debugging //////////////////////////////////////////////////////////////////////////////// -void lbann_callback_debug_io::on_test_begin(model *m) { +void debug_io::on_test_begin(model *m) { if(m_debug_phase == execution_mode::invalid || m_debug_phase == execution_mode::testing) { print_phase_start(m, execution_mode::testing); } } -std::unique_ptr -build_callback_debug_io_from_pbuf( +std::unique_ptr +build_debug_io_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); @@ -163,10 +164,11 @@ build_callback_debug_io_from_pbuf( case execution_mode::training: case execution_mode::validation: case execution_mode::testing: - return make_unique(phase, lvl); + return make_unique(phase, lvl); default: - return make_unique(); + return make_unique(); } } -}// namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_dump_error_signals.cpp b/src/callbacks/callback_dump_error_signals.cpp index c4e1b5a4b2c..f65836bd707 100644 --- a/src/callbacks/callback_dump_error_signals.cpp +++ b/src/callbacks/callback_dump_error_signals.cpp @@ -29,8 +29,9 @@ #include namespace lbann { +namespace callback { -void lbann_callback_dump_error_signals::on_backward_prop_end(model *m, Layer *l) { +void dump_error_signals::on_backward_prop_end(model *m, Layer *l) { // Write each activation matrix to file for (int i = 0; i < l->get_num_parents(); ++i) { @@ -52,12 +53,13 @@ void lbann_callback_dump_error_signals::on_backward_prop_end(model *m, Layer *l) } -std::unique_ptr -build_callback_dump_error_signals_from_pbuf( +std::unique_ptr +build_dump_error_signals_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.basename()); + return make_unique(params.basename()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_dump_gradients.cpp b/src/callbacks/callback_dump_gradients.cpp index c8c0e0d34d1..e78a3358be2 100644 --- 
a/src/callbacks/callback_dump_gradients.cpp +++ b/src/callbacks/callback_dump_gradients.cpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_dump_gradients .hpp .cpp - Callbacks to dump gradients +// dump_gradients .hpp .cpp - Callbacks to dump gradients //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_dump_gradients.hpp" @@ -33,8 +33,9 @@ #include namespace lbann { +namespace callback { -void lbann_callback_dump_gradients::on_backward_prop_end(model *m) { +void dump_gradients::on_backward_prop_end(model *m) { for (weights *w : m->get_weights()) { optimizer *opt = w->get_optimizer(); if (opt != nullptr) { @@ -50,13 +51,14 @@ void lbann_callback_dump_gradients::on_backward_prop_end(model *m) { } } -std::unique_ptr -build_callback_dump_gradients_from_pbuf( +std::unique_ptr +build_dump_gradients_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.basename(), + return make_unique(params.basename(), params.interval()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_dump_minibatch_sample_indices.cpp b/src/callbacks/callback_dump_minibatch_sample_indices.cpp index 13b390e8d90..03c95b43162 100644 --- a/src/callbacks/callback_dump_minibatch_sample_indices.cpp +++ b/src/callbacks/callback_dump_minibatch_sample_indices.cpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_dump_minibatch_sample_indices .hpp .cpp - Callbacks +// dump_minibatch_sample_indices .hpp .cpp - Callbacks // to dump the list of indices per minibatch //////////////////////////////////////////////////////////////////////////////// @@ -37,8 +37,9 @@ #include namespace lbann { +namespace callback { -void lbann_callback_dump_minibatch_sample_indices::dump_to_file(model *m, Layer *l, int64_t step) { +void dump_minibatch_sample_indices::dump_to_file(model *m, Layer *l, int64_t step) { // Print minibatch sample indices of input layers auto *input = dynamic_cast(l); if (input != nullptr) { @@ -69,21 +70,23 @@ void lbann_callback_dump_minibatch_sample_indices::dump_to_file(model *m, Layer } } -void lbann_callback_dump_minibatch_sample_indices::on_forward_prop_end(model *m, Layer *l) { +void dump_minibatch_sample_indices::on_forward_prop_end(model *m, Layer *l) { dump_to_file(m, l, m->get_step()); } -void lbann_callback_dump_minibatch_sample_indices::on_evaluate_forward_prop_end(model *m, Layer *l) { +void dump_minibatch_sample_indices::on_evaluate_forward_prop_end(model *m, Layer *l) { dump_to_file(m, l, m->get_step()); } -std::unique_ptr -build_callback_dump_mb_indices_from_pbuf( +std::unique_ptr +build_dump_mb_indices_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( params.basename(), params.interval()); } -} // namespace lbann + +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_dump_outputs.cpp b/src/callbacks/callback_dump_outputs.cpp index f11ac2fbb44..1cba1b67a3e 100644 --- a/src/callbacks/callback_dump_outputs.cpp +++ b/src/callbacks/callback_dump_outputs.cpp @@ -35,6 +35,7 @@ #endif // LBANN_HAS_CNPY namespace lbann { +namespace callback { 
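// (The unnamed namespace below collects local file-writing helpers, e.g.
// save_npz, which is compiled only when LBANN_HAS_CNPY is defined; the
// dump_outputs callback delegates the actual writes to them.)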
namespace { @@ -96,12 +97,12 @@ void save_npz(const std::string& file_name, } // namespace -lbann_callback_dump_outputs::lbann_callback_dump_outputs(std::set layer_names, - std::set modes, - El::Int batch_interval, - std::string directory, - std::string file_format) - : lbann_callback(std::max(batch_interval, El::Int(1))), +dump_outputs::dump_outputs(std::set layer_names, + std::set modes, + El::Int batch_interval, + std::string directory, + std::string file_format) + : callback_base(std::max(batch_interval, El::Int(1))), m_layer_names(std::move(layer_names)), m_modes(std::move(modes)), m_directory(std::move(directory)), @@ -133,7 +134,7 @@ lbann_callback_dump_outputs::lbann_callback_dump_outputs(std::set l } -void lbann_callback_dump_outputs::dump_outputs(const model& m, const Layer& l) { +void dump_outputs::do_dump_outputs(const model& m, const Layer& l) { // Get mini-batch step information const auto& mode = m.get_execution_mode(); @@ -178,19 +179,20 @@ void lbann_callback_dump_outputs::dump_outputs(const model& m, const Layer& l) { } -std::unique_ptr -build_callback_dump_outputs_from_pbuf( +std::unique_ptr +build_dump_outputs_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); const auto& layer_names = parse_set(params.layers()); const auto& modes = parse_set(params.execution_modes()); - return make_unique(layer_names, + return make_unique(layer_names, modes, params.batch_interval(), params.directory(), params.format()); } +} // namespace callback } // namespace lbann diff --git a/src/callbacks/callback_dump_weights.cpp b/src/callbacks/callback_dump_weights.cpp index db0dae0db95..8f8ff886cd2 100644 --- a/src/callbacks/callback_dump_weights.cpp +++ b/src/callbacks/callback_dump_weights.cpp @@ -23,23 +23,24 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. 
// -// lbann_callback_dump_weights .hpp .cpp - Callbacks to dump weight matrices +// dump_weights .hpp .cpp - Callbacks to dump weight matrices //////////////////////////////////////////////////////////////////////////////// #include #include "lbann/callbacks/callback_dump_weights.hpp" namespace lbann { +namespace callback { -void lbann_callback_dump_weights::on_train_begin(model *m) { - dump_weights(m, "initial"); +void dump_weights::on_train_begin(model *m) { + do_dump_weights(m, "initial"); } -void lbann_callback_dump_weights::on_epoch_end(model *m) { - dump_weights(m); +void dump_weights::on_epoch_end(model *m) { + do_dump_weights(m); } -void lbann_callback_dump_weights::dump_weights(model *m, std::string s) { +void dump_weights::do_dump_weights(model *m, std::string s) { for (weights *w : m->get_weights()) { std::string epoch = "-epoch" + std::to_string(m->get_epoch()-1); if(s != "") { @@ -55,12 +56,13 @@ void lbann_callback_dump_weights::dump_weights(model *m, std::string s) { } } -std::unique_ptr -build_callback_dump_weights_from_pbuf( +std::unique_ptr +build_dump_weights_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.basename()); + return make_unique(params.basename()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_early_stopping.cpp b/src/callbacks/callback_early_stopping.cpp index 9b78d7904c6..0424f279623 100644 --- a/src/callbacks/callback_early_stopping.cpp +++ b/src/callbacks/callback_early_stopping.cpp @@ -29,13 +29,14 @@ #include "lbann/callbacks/callback_early_stopping.hpp" namespace lbann { +namespace callback { -lbann_callback_early_stopping::lbann_callback_early_stopping(int64_t patience) : - lbann_callback(), m_patience(patience) {} +early_stopping::early_stopping(int64_t patience) : + callback_base(), m_patience(patience) {} /// Monitor the objective function to see if the validation score /// continues to improve -void lbann_callback_early_stopping::on_validation_end(model *m) { +void early_stopping::on_validation_end(model *m) { execution_mode mode = m->get_execution_mode(); EvalType score = m->get_objective_function()->get_mean_value(mode); if (score < m_last_score) { @@ -60,12 +61,13 @@ void lbann_callback_early_stopping::on_validation_end(model *m) { } } -std::unique_ptr -build_callback_early_stopping_from_pbuf( +std::unique_ptr +build_early_stopping_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.patience()); + return make_unique(params.patience()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_gpu_memory_usage.cpp b/src/callbacks/callback_gpu_memory_usage.cpp index 4cacebf1a81..3a394e563a0 100644 --- a/src/callbacks/callback_gpu_memory_usage.cpp +++ b/src/callbacks/callback_gpu_memory_usage.cpp @@ -53,8 +53,9 @@ T get_min(const std::vector &v) { } namespace lbann { +namespace callback { -void lbann_callback_gpu_memory_usage::on_epoch_begin(model *m) { +void gpu_memory_usage::on_epoch_begin(model *m) { #ifdef LBANN_HAS_CUDA size_t available; size_t total; @@ -91,4 +92,5 @@ void lbann_callback_gpu_memory_usage::on_epoch_begin(model *m) { #endif } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_hang.cpp b/src/callbacks/callback_hang.cpp index 891310de751..254441d6c96 100644 
--- a/src/callbacks/callback_hang.cpp +++ b/src/callbacks/callback_hang.cpp @@ -29,8 +29,9 @@ #include namespace lbann { +namespace callback { -void lbann_callback_hang::setup(model* m) +void hang::setup(model* m) { if (m->get_comm()->am_world_master()) { if (m_rank_to_hang == -1) { @@ -43,12 +44,13 @@ void lbann_callback_hang::setup(model* m) } } -std::unique_ptr -build_callback_hang_from_pbuf( +std::unique_ptr +build_hang_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.rank()); + return make_unique(params.rank()); } -}// namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_helpers.hpp b/src/callbacks/callback_helpers.hpp index f4f3fc5422b..e612ef21850 100644 --- a/src/callbacks/callback_helpers.hpp +++ b/src/callbacks/callback_helpers.hpp @@ -49,5 +49,5 @@ std::vector select_things_by_name( } return out_things; } -}// namespace -}// namespace lbann +} // namespace +} // namespace lbann diff --git a/src/callbacks/callback_imcomm.cpp b/src/callbacks/callback_imcomm.cpp index d07385ab0f7..70c519bacc3 100644 --- a/src/callbacks/callback_imcomm.cpp +++ b/src/callbacks/callback_imcomm.cpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_imcomm .hpp .cpp - Send gradient updates between models +// imcomm .hpp .cpp - Send gradient updates between models //////////////////////////////////////////////////////////////////////////////// #include @@ -33,28 +33,29 @@ #include "lbann/utils/exception.hpp" namespace lbann { +namespace callback { -lbann_callback_imcomm::lbann_callback_imcomm(lbann_callback_imcomm::comm_type ct, +imcomm::imcomm(imcomm::comm_type ct, lbann_summary *summarizer) : - lbann_callback(1, summarizer), m_default_ct(ct) {} + callback_base(1, summarizer), m_default_ct(ct) {} -lbann_callback_imcomm::lbann_callback_imcomm(lbann_callback_imcomm::comm_type ct, +imcomm::imcomm(imcomm::comm_type ct, std::unordered_set weights_list, lbann_summary *summarizer) : - lbann_callback_imcomm(ct, summarizer) { + imcomm(ct, summarizer) { for (weights *w : weights_list) { m_weights_params[w] = {}; m_weights_params[w].ct = ct; } } -void lbann_callback_imcomm::set_weights_comm(weights *w, +void imcomm::set_weights_comm(weights *w, comm_type ct) { m_weights_params[w] = {}; m_weights_params[w].ct = ct; } -void lbann_callback_imcomm::setup(model *m) { +void imcomm::setup(model *m) { for (weights *w : m->get_weights()) { // Add weights if not already in list @@ -81,7 +82,7 @@ void lbann_callback_imcomm::setup(model *m) { } } -void lbann_callback_imcomm::on_train_begin(model *m) { +void imcomm::on_train_begin(model *m) { lbann_comm *comm = m->get_comm(); if (comm->get_num_trainers() == 1) { return; // No point with only one model. 
@@ -94,7 +95,7 @@ void lbann_callback_imcomm::on_train_begin(model *m) { } } -void lbann_callback_imcomm::on_backward_prop_end(model *m) { +void imcomm::on_backward_prop_end(model *m) { lbann_comm *comm = m->get_comm(); if (comm->get_num_trainers() == 1 || m->get_execution_mode() != execution_mode::training) { @@ -125,7 +126,7 @@ void lbann_callback_imcomm::on_backward_prop_end(model *m) { } } -void lbann_callback_imcomm::do_summary(model *m, weights *w, +void imcomm::do_summary(model *m, weights *w, EvalType im_time) { if (m_summarizer == nullptr) { return; @@ -150,7 +151,7 @@ static std::vector comm_type_names = { "none", "normal" }; /** returns a string representation of the weight_initialization */ -std::string get_comm_type_name(lbann_callback_imcomm::comm_type m) { +std::string get_comm_type_name(imcomm::comm_type m) { if ((int)m < 0 or (int)m >= (int)comm_type_names.size()) { throw(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: " + " Invalid comm_type"); @@ -158,24 +159,25 @@ std::string get_comm_type_name(lbann_callback_imcomm::comm_type m) { return comm_type_names[(int)m]; } -std::unique_ptr -build_callback_imcomm_from_pbuf( +std::unique_ptr +build_imcomm_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary* summarizer) { const auto& params = dynamic_cast(proto_msg); const auto& type_str = params.intertrainer_comm_method(); - lbann_callback_imcomm::comm_type type = lbann_callback_imcomm::comm_type::NONE; + imcomm::comm_type type = imcomm::comm_type::NONE; if (type_str == "none") { - type = lbann_callback_imcomm::comm_type::NONE; + type = imcomm::comm_type::NONE; } else if (type_str == "normal") { - type = lbann_callback_imcomm::comm_type::NORMAL; + type = imcomm::comm_type::NORMAL; } else { std::ostringstream err; err << "invalid inter-model communication type (" << type_str << ")"; LBANN_ERROR(err.str()); } std::unordered_set selected_weights; /// @todo Initialize weights - return make_unique(type, selected_weights, summarizer); + return make_unique(type, selected_weights, summarizer); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_io.cpp b/src/callbacks/callback_io.cpp index 57143c64bb7..57b6898e745 100644 --- a/src/callbacks/callback_io.cpp +++ b/src/callbacks/callback_io.cpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. 
// -// lbann_callback_io .hpp .cpp - Callback hooks for I/O monitoring +// io .hpp .cpp - Callback hooks for I/O monitoring //////////////////////////////////////////////////////////////////////////////// #include @@ -33,8 +33,9 @@ #include "lbann/proto/proto_common.hpp" namespace lbann { +namespace callback { -void lbann_callback_io::on_epoch_end(model *m) { +void io::on_epoch_end(model *m) { lbann_comm *comm = m->get_comm(); for (Layer *layer : m->get_layers()) { if(m_layers.size() == 0 @@ -50,7 +51,7 @@ void lbann_callback_io::on_epoch_end(model *m) { } } -void lbann_callback_io::on_test_end(model *m) { +void io::on_test_end(model *m) { lbann_comm *comm = m->get_comm(); for (Layer *layer : m->get_layers()) { if(m_layers.size() == 0 @@ -66,13 +67,14 @@ void lbann_callback_io::on_test_end(model *m) { } } -std::unique_ptr -build_callback_disp_io_stats_from_pbuf( +std::unique_ptr +build_disp_io_stats_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( parse_list(params.layers())); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_learning_rate.cpp b/src/callbacks/callback_learning_rate.cpp index ab2294fa1f3..6ba99107d87 100644 --- a/src/callbacks/callback_learning_rate.cpp +++ b/src/callbacks/callback_learning_rate.cpp @@ -41,16 +41,17 @@ #include namespace lbann { +namespace callback { -float lbann_callback_learning_rate::m_cur_global_lr = 0.0f; +float learning_rate::m_cur_global_lr = 0.0f; -lbann_callback_learning_rate::lbann_callback_learning_rate() {} +learning_rate::learning_rate() {} -lbann_callback_learning_rate::lbann_callback_learning_rate( +learning_rate::learning_rate( std::vector weights_names) : m_weights_names(std::move(weights_names)) {} -void lbann_callback_learning_rate::setup(model *m) { +void learning_rate::setup(model *m) { // Add all weights if list of weights is not initialized std::vector weights_list = @@ -74,7 +75,7 @@ void lbann_callback_learning_rate::setup(model *m) { } -void lbann_callback_learning_rate::on_epoch_end(model *m) { +void learning_rate::on_epoch_end(model *m) { const float new_lr = global_schedule(m); const float old_global_lr = m_cur_global_lr; m_cur_global_lr = new_lr; @@ -93,7 +94,7 @@ void lbann_callback_learning_rate::on_epoch_end(model *m) { } } -void lbann_callback_learning_rate::on_backward_prop_end(model *m) { +void learning_rate::on_backward_prop_end(model *m) { for (weights *w : this->get_weights()) { optimizer& opt = *w->get_optimizer(); const float old_lr = opt.get_learning_rate(); @@ -104,16 +105,16 @@ void lbann_callback_learning_rate::on_backward_prop_end(model *m) { } } -lbann_callback_step_learning_rate::lbann_callback_step_learning_rate( +step_learning_rate::step_learning_rate( int step, float amt) : - lbann_callback_learning_rate(), m_step(step), m_amt(amt) {} + learning_rate(), m_step(step), m_amt(amt) {} -lbann_callback_step_learning_rate::lbann_callback_step_learning_rate( +step_learning_rate::step_learning_rate( int step, float amt, std::vector weights_names) : - lbann_callback_learning_rate(std::move(weights_names)), + learning_rate(std::move(weights_names)), m_step(step), m_amt(amt) {} -float lbann_callback_step_learning_rate::global_schedule(model *m) { +float step_learning_rate::global_schedule(model *m) { if (m->get_epoch() % m_step == 0) { return get_current_global_learning_rate() * m_amt; } else { @@ -121,17 +122,17 @@ float 
lbann_callback_step_learning_rate::global_schedule(model *m) { } } -lbann_callback_adaptive_learning_rate::lbann_callback_adaptive_learning_rate( +adaptive_learning_rate::adaptive_learning_rate( int64_t patience, float amt) : - lbann_callback_adaptive_learning_rate(patience, amt, + adaptive_learning_rate(patience, amt, std::vector()) {} -lbann_callback_adaptive_learning_rate::lbann_callback_adaptive_learning_rate( +adaptive_learning_rate::adaptive_learning_rate( int64_t patience, float amt, std::vector weights_list) : - lbann_callback_learning_rate(std::move(weights_list)), + learning_rate(std::move(weights_list)), m_patience(patience), m_amt(amt) {} -float lbann_callback_adaptive_learning_rate::global_schedule(model *m) { +float adaptive_learning_rate::global_schedule(model *m) { // Determine behavior the first time this is called in an epoch if (m_cur_epoch != m->get_epoch()) { m_cur_epoch = m->get_epoch(); @@ -162,20 +163,20 @@ float lbann_callback_adaptive_learning_rate::global_schedule(model *m) { } } -lbann_callback_drop_fixed_learning_rate::lbann_callback_drop_fixed_learning_rate( +drop_fixed_learning_rate::drop_fixed_learning_rate( std::vector drop_epochs, float amt) : - lbann_callback_drop_fixed_learning_rate(std::move(drop_epochs), amt, + drop_fixed_learning_rate(std::move(drop_epochs), amt, std::vector()) {} -lbann_callback_drop_fixed_learning_rate::lbann_callback_drop_fixed_learning_rate( +drop_fixed_learning_rate::drop_fixed_learning_rate( std::vector drop_epochs, float amt, std::vector weights_names) : - lbann_callback_learning_rate(std::move(weights_names)), + learning_rate(std::move(weights_names)), m_amt(amt), m_drop_epochs(std::move(drop_epochs)) { // Sort in reverse order. std::sort(m_drop_epochs.rbegin(), m_drop_epochs.rend()); } -float lbann_callback_drop_fixed_learning_rate::global_schedule(model* m) { +float drop_fixed_learning_rate::global_schedule(model* m) { // Delete last drop epoch if we have already passed it while (!m_drop_epochs.empty() && m->get_epoch() > m_drop_epochs.back()) { @@ -190,25 +191,25 @@ float lbann_callback_drop_fixed_learning_rate::global_schedule(model* m) { } } -lbann_callback_linear_growth_learning_rate::lbann_callback_linear_growth_learning_rate( +linear_growth_learning_rate::linear_growth_learning_rate( float target, int64_t num_epochs) : - lbann_callback_linear_growth_learning_rate(target, num_epochs, 0, + linear_growth_learning_rate(target, num_epochs, 0, std::vector()) {} -lbann_callback_linear_growth_learning_rate::lbann_callback_linear_growth_learning_rate( +linear_growth_learning_rate::linear_growth_learning_rate( float target, int64_t num_epochs, int64_t delay) : - lbann_callback_linear_growth_learning_rate(target, num_epochs, delay, + linear_growth_learning_rate(target, num_epochs, delay, std::vector()) {} -lbann_callback_linear_growth_learning_rate::lbann_callback_linear_growth_learning_rate( +linear_growth_learning_rate::linear_growth_learning_rate( float target, int64_t num_epochs, int64_t delay, std::vector weights_names) : - lbann_callback_learning_rate(std::move(weights_names)), + learning_rate(std::move(weights_names)), m_target(target), m_inc(0), m_num_epochs(num_epochs), m_delay(delay) {} -void lbann_callback_linear_growth_learning_rate::setup(model *m) { - lbann_callback_learning_rate::setup(m); +void linear_growth_learning_rate::setup(model *m) { + learning_rate::setup(m); // Compute the learning rate increase. if (!this->get_weights().empty()) { // Assumes all optimizers have the same initial learning rate. 
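// ---------------------------------------------------------------------------
// Aside: every schedule class renamed above implements the same contract --
// learning_rate::on_epoch_end() asks the virtual global_schedule(model*) for
// the new global rate, and on_backward_prop_end() lets optimizer_schedule()
// adjust each optimizer per step. A minimal standalone sketch of the step
// schedule's arithmetic; illustrative only (this free function and its
// parameter names are not LBANN API), assuming integer epoch counts, step > 0,
// and a multiplicative factor `amt` applied once every `step` epochs:
#include <cmath>
// After `epoch` completed epochs, a schedule that multiplies the rate by
// `amt` every `step` epochs has compounded to base_lr * amt^(epoch / step).
inline float stepped_learning_rate(float base_lr, float amt, int step, int epoch) {
  return base_lr * std::pow(amt, static_cast<float>(epoch / step)); // integer division
}
// Example: base_lr = 0.1f, amt = 0.1f, step = 30 -> 0.1 for epochs 0-29,
// 0.01 for epochs 30-59, 0.001 for epochs 60-89.
// ---------------------------------------------------------------------------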
@@ -217,7 +218,7 @@ void lbann_callback_linear_growth_learning_rate::setup(model *m) { } } -float lbann_callback_linear_growth_learning_rate::global_schedule(model *m) { +float linear_growth_learning_rate::global_schedule(model *m) { if (m->get_epoch() < m_delay) { return get_current_global_learning_rate(); } else if (m->get_epoch() <= m_num_epochs + m_delay) { @@ -234,16 +235,16 @@ float lbann_callback_linear_growth_learning_rate::global_schedule(model *m) { * In case that max_iter is set to 0, it is calculated from the number of * epochs (n_epochs). n_epochs is not used otherwise. */ -lbann_callback_poly_learning_rate::lbann_callback_poly_learning_rate( +poly_learning_rate::poly_learning_rate( double p, uint64_t n_epochs, uint64_t max_iter) - : lbann_callback_learning_rate(std::vector()), + : learning_rate(std::vector()), m_p(p), m_num_epochs(n_epochs), m_max_iter(max_iter), m_end_lr(0.0f), m_lr(1.0f), m_last_epoch_lr(1.0f) {} -lbann_callback_poly_learning_rate::lbann_callback_poly_learning_rate( +poly_learning_rate::poly_learning_rate( double p, uint64_t n_epochs, uint64_t max_iter, double end_lr, std::vector weights_names) - : lbann_callback_learning_rate(std::move(weights_names)), + : learning_rate(std::move(weights_names)), m_p(p), m_num_epochs(n_epochs), m_max_iter(max_iter), m_end_lr(end_lr), m_lr(1.0f), m_last_epoch_lr(1.0f) {} @@ -252,8 +253,8 @@ lbann_callback_poly_learning_rate::lbann_callback_poly_learning_rate( * Check if the maximum number of iterations is set. If not, compute it by the * number of epochs and the number of iterations per epoch. */ -void lbann_callback_poly_learning_rate::setup(model *m) { - lbann_callback_learning_rate::setup(m); +void poly_learning_rate::setup(model *m) { + learning_rate::setup(m); if (m_max_iter == 0ull) { m_max_iter = m_num_epochs * m->get_num_iterations_per_epoch(execution_mode::training); } @@ -262,7 +263,7 @@ void lbann_callback_poly_learning_rate::setup(model *m) { /** * Keep the record of the learning rate at the end of the current epoch. */ -float lbann_callback_poly_learning_rate::global_schedule(model *m) { +float poly_learning_rate::global_schedule(model *m) { const float scale = m_lr / m_last_epoch_lr; m_last_epoch_lr = m_lr; return (get_current_global_learning_rate() - m_end_lr) * scale + m_end_lr; @@ -271,7 +272,7 @@ float lbann_callback_poly_learning_rate::global_schedule(model *m) { /** * Compute the learning rate for the next iteration. 
*/ -float lbann_callback_poly_learning_rate::optimizer_schedule(model *m, optimizer &opt) { +float poly_learning_rate::optimizer_schedule(model *m, optimizer &opt) { const uint64_t cur_iter = static_cast(m->get_step(execution_mode::training)); if (m_max_iter > cur_iter) { m_lr = static_cast(std::pow(static_cast(m_max_iter - cur_iter)/m_max_iter, m_p)); @@ -280,19 +281,19 @@ float lbann_callback_poly_learning_rate::optimizer_schedule(model *m, optimizer return (get_current_global_learning_rate() - m_end_lr) * scale + m_end_lr; } -lbann_callback_optimizerwise_adaptive_learning_rate:: -lbann_callback_optimizerwise_adaptive_learning_rate( +optimizerwise_adaptive_learning_rate:: +optimizerwise_adaptive_learning_rate( float scale) : - lbann_callback_optimizerwise_adaptive_learning_rate( + optimizerwise_adaptive_learning_rate( scale, std::vector()) {} -lbann_callback_optimizerwise_adaptive_learning_rate:: -lbann_callback_optimizerwise_adaptive_learning_rate( +optimizerwise_adaptive_learning_rate:: +optimizerwise_adaptive_learning_rate( float scale, std::vector weights_names) : - lbann_callback_learning_rate(std::move(weights_names)), m_scale(scale) {} + learning_rate(std::move(weights_names)), m_scale(scale) {} -float lbann_callback_optimizerwise_adaptive_learning_rate::optimizer_schedule( +float optimizerwise_adaptive_learning_rate::optimizer_schedule( model *m, optimizer &opt) { DataType param_norm = El::Nrm2(opt.get_weights().get_values()); DataType param_grad_norm = El::Nrm2(opt.get_gradient()); @@ -305,32 +306,32 @@ float lbann_callback_optimizerwise_adaptive_learning_rate::optimizer_schedule( } // FIXME TRB -std::unique_ptr -build_callback_step_learning_rate_from_pbuf( +std::unique_ptr +build_step_learning_rate_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( params.step(), params.amt(), parse_list(params.weights())); } // FIXME TRB -std::unique_ptr -build_callback_adaptive_learning_rate_from_pbuf( +std::unique_ptr +build_adaptive_learning_rate_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( params.patience(), params.amt(), parse_list(params.weights())); } // FIXME TRB -std::unique_ptr -build_callback_drop_fixed_learning_rate_from_pbuf( +std::unique_ptr +build_drop_fixed_learning_rate_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); @@ -338,18 +339,18 @@ build_callback_drop_fixed_learning_rate_from_pbuf( for (int i = 0; i < params.drop_epoch_size(); ++i) { drop_epochs.push_back(params.drop_epoch(i)); } - return make_unique( + return make_unique( std::move(drop_epochs), params.amt(), parse_list(params.weights())); } // FIXME TRB -std::unique_ptr -build_callback_linear_growth_learning_rate_from_pbuf( +std::unique_ptr +build_linear_growth_learning_rate_callback_from_pbuf( const google::protobuf::Message& proto_msg,lbann_summary*) { using MsgType = lbann_data::CallbackLinearGrowthLearningRate; - using CallbackType = lbann_callback_linear_growth_learning_rate; + using CallbackType = linear_growth_learning_rate; const auto& params = dynamic_cast(proto_msg); return make_unique(params.target(), @@ -359,23 +360,23 @@ build_callback_linear_growth_learning_rate_from_pbuf( } // FIXME TRB -std::unique_ptr -build_callback_optimizerwise_adaptive_learning_rate_from_pbuf( 
+std::unique_ptr +build_optimizerwise_adaptive_learning_rate_callback_from_pbuf( const google::protobuf::Message& proto_msg,lbann_summary*) { using MsgType = lbann_data::CallbackOptimizerwiseAdaptiveLearningRate; - using CallbackType = lbann_callback_optimizerwise_adaptive_learning_rate; + using CallbackType = optimizerwise_adaptive_learning_rate; const auto& params = dynamic_cast(proto_msg); return make_unique(params.scale(), parse_list(params.weights())); } // FIXME TRB -std::unique_ptr -build_callback_poly_learning_rate_from_pbuf( +std::unique_ptr +build_poly_learning_rate_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( params.power(), params.num_epochs(), params.max_iter(), @@ -383,4 +384,5 @@ build_callback_poly_learning_rate_from_pbuf( parse_list(params.weights())); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_ltfb.cpp b/src/callbacks/callback_ltfb.cpp index 2fe90186d1d..568a5880cd7 100644 --- a/src/callbacks/callback_ltfb.cpp +++ b/src/callbacks/callback_ltfb.cpp @@ -33,6 +33,7 @@ #include "lbann/proto/factories.hpp" namespace lbann { +namespace callback { namespace { @@ -321,22 +322,22 @@ EvalType evaluate(model& m, const std::string& metric_name) { } // namespace -lbann_callback_ltfb::lbann_callback_ltfb(El::Int batch_interval, +ltfb::ltfb(El::Int batch_interval, std::string metric_name, std::set weights_names, bool low_score_wins, communication_algorithm comm_algo, bool exchange_hyperparameters, lbann_summary *summarizer) - : lbann_callback(batch_interval, summarizer), + : callback_base(batch_interval, summarizer), m_metric_name(std::move(metric_name)), m_weights_names(std::move(weights_names)), m_low_score_wins(low_score_wins), m_comm_algo(comm_algo), m_exchange_hyperparameters(exchange_hyperparameters) {} -lbann_callback_ltfb::lbann_callback_ltfb(const lbann_callback_ltfb& other) : - lbann_callback(other), +ltfb::ltfb(const ltfb& other) : + callback_base(other), m_metric_name(other.m_metric_name), m_weights_names(other.m_weights_names), m_low_score_wins(other.m_low_score_wins), @@ -352,8 +353,8 @@ lbann_callback_ltfb::lbann_callback_ltfb(const lbann_callback_ltfb& other) : } -lbann_callback_ltfb& lbann_callback_ltfb::operator=(const lbann_callback_ltfb& other) { - lbann_callback::operator=(other); +ltfb& ltfb::operator=(const ltfb& other) { + callback_base::operator=(other); // Shallow copies m_metric_name = other.m_metric_name; @@ -372,7 +373,7 @@ lbann_callback_ltfb& lbann_callback_ltfb::operator=(const lbann_callback_ltfb& o return *this; } -void lbann_callback_ltfb::setup(model *m) { +void ltfb::setup(model *m) { // Create workspace objects const auto& model_weights = m->get_weights(); @@ -384,14 +385,14 @@ void lbann_callback_ltfb::setup(model *m) { // Make sure model does not have inter-trainer communication callback for (auto&& cb : m->get_callbacks()) { - if (dynamic_cast(cb) != nullptr) { + if (dynamic_cast(cb) != nullptr) { LBANN_ERROR("Detected both LTFB and imcomm callbacks. 
"); } } } -void lbann_callback_ltfb::on_train_begin(model *m) { +void ltfb::on_train_begin(model *m) { auto&& comm = *m->get_comm(); if (comm.am_world_master()) { @@ -406,7 +407,7 @@ void lbann_callback_ltfb::on_train_begin(model *m) { } } -void lbann_callback_ltfb::on_batch_begin(model *m) { +void ltfb::on_batch_begin(model *m) { auto&& comm = *m->get_comm(); // Check whether to start LTFB round @@ -507,8 +508,8 @@ void lbann_callback_ltfb::on_batch_begin(model *m) { } -lbann_callback_ltfb::communication_algorithm -lbann_callback_ltfb::string_to_comm_algo(const std::string& str) { +ltfb::communication_algorithm +ltfb::string_to_comm_algo(const std::string& str) { if (str.empty() || str == "sendrecv_weights") { return communication_algorithm::sendrecv_weights; } @@ -524,20 +525,21 @@ lbann_callback_ltfb::string_to_comm_algo(const std::string& str) { } -std::unique_ptr -build_callback_ltfb_from_pbuf( +std::unique_ptr +build_ltfb_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary* summarizer) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( params.batch_interval(), params.metric(), parse_set(params.weights()), params.low_score_wins(), - lbann_callback_ltfb::string_to_comm_algo(params.communication_algorithm()), + ltfb::string_to_comm_algo(params.communication_algorithm()), params.exchange_hyperparameters(), summarizer); } +} // namespace callback } // namespace lbann diff --git a/src/callbacks/callback_mixup.cpp b/src/callbacks/callback_mixup.cpp index f38b07a74c4..2a65e721e7b 100644 --- a/src/callbacks/callback_mixup.cpp +++ b/src/callbacks/callback_mixup.cpp @@ -36,6 +36,7 @@ #include namespace lbann { +namespace callback { void callback_mixup::on_forward_prop_end(model *m, Layer *l) { if (!m_layers.count(l->get_name())) { @@ -97,8 +98,8 @@ void callback_mixup::on_forward_prop_end(model *m, Layer *l) { } } -std::unique_ptr -build_callback_mixup_from_pbuf( +std::unique_ptr +build_mixup_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); @@ -107,4 +108,6 @@ build_callback_mixup_from_pbuf( layers_list.end()); return make_unique(layers, params.alpha()); } -} // namespace lbann + +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_perturb_adam.cpp b/src/callbacks/callback_perturb_adam.cpp index 65a32f8f0bf..5af3e9b69bc 100644 --- a/src/callbacks/callback_perturb_adam.cpp +++ b/src/callbacks/callback_perturb_adam.cpp @@ -29,15 +29,16 @@ #include "lbann/utils/random.hpp" namespace lbann { +namespace callback { -lbann_callback_perturb_adam::lbann_callback_perturb_adam(DataType learning_rate_factor, +perturb_adam::perturb_adam(DataType learning_rate_factor, DataType beta1_factor, DataType beta2_factor, DataType eps_factor, bool perturb_during_training, El::Int batch_interval, std::set weights_names) - : lbann_callback(batch_interval), + : callback_base(batch_interval), m_learning_rate_factor(learning_rate_factor), m_beta1_factor(beta1_factor), m_beta2_factor(beta2_factor), @@ -45,17 +46,17 @@ lbann_callback_perturb_adam::lbann_callback_perturb_adam(DataType learning_rate_ m_perturb_during_training(perturb_during_training), m_weights_names(std::move(weights_names)) {} -void lbann_callback_perturb_adam::setup(model* m) { +void perturb_adam::setup(model* m) { perturb(*m); } -void lbann_callback_perturb_adam::on_batch_begin(model* m) { +void perturb_adam::on_batch_begin(model* m) { if (m_perturb_during_training && m->get_step() 
> 0) { perturb(*m); } } -void lbann_callback_perturb_adam::perturb(model& m) const { +void perturb_adam::perturb(model& m) const { auto* comm = m.get_comm(); for (auto* w : m.get_weights()) { if (w == nullptr) { @@ -92,7 +93,7 @@ void lbann_callback_perturb_adam::perturb(model& m) const { } } -void lbann_callback_perturb_adam::perturb(lbann_comm& comm, adam& opt) const { +void perturb_adam::perturb(lbann_comm& comm, adam& opt) const { // Perturb hyperparameters on master process std::vector hyperparameters(4); @@ -161,12 +162,12 @@ void lbann_callback_perturb_adam::perturb(lbann_comm& comm, adam& opt) const { } -std::unique_ptr -build_callback_perturb_adam_from_pbuf( +std::unique_ptr +build_perturb_adam_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( params.learning_rate_factor(), params.beta1_factor(), params.beta2_factor(), @@ -176,4 +177,5 @@ build_callback_perturb_adam_from_pbuf( parse_set(params.weights())); } +} // namespace callback } // namespace lbann diff --git a/src/callbacks/callback_perturb_dropout.cpp b/src/callbacks/callback_perturb_dropout.cpp index 9df485628bf..5f3489391d1 100644 --- a/src/callbacks/callback_perturb_dropout.cpp +++ b/src/callbacks/callback_perturb_dropout.cpp @@ -29,24 +29,25 @@ #include "lbann/utils/random.hpp" namespace lbann { +namespace callback { -lbann_callback_perturb_dropout::lbann_callback_perturb_dropout(EvalType keep_prob_factor, +perturb_dropout::perturb_dropout(EvalType keep_prob_factor, std::set layer_names) - : lbann_callback(1), + : callback_base(1), m_keep_prob_factor(keep_prob_factor), m_layer_names(std::move(layer_names)) {} -void lbann_callback_perturb_dropout::setup(model* m) { +void perturb_dropout::setup(model* m) { perturb(*m); } template -dropout* lbann_callback_perturb_dropout::get_dropout_layer(Layer* l) { +dropout* perturb_dropout::get_dropout_layer(Layer* l) { if(auto d_layer = dynamic_cast*>(l)) return d_layer; else return nullptr; } -void lbann_callback_perturb_dropout::perturb(model& m) { +void perturb_dropout::perturb(model& m) { auto* comm = m.get_comm(); for (auto* l : m.get_layers()) { if (l == nullptr) { @@ -117,14 +118,15 @@ void lbann_callback_perturb_dropout::perturb(model& m) { } } -std::unique_ptr -build_callback_perturb_dropout_from_pbuf( +std::unique_ptr +build_perturb_dropout_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( params.keep_dropout_factor(), parse_set(params.layers())); } +} // namespace callback } // namespace lbann diff --git a/src/callbacks/callback_print.cpp b/src/callbacks/callback_print.cpp index 672cb6ec223..0f089db6b59 100644 --- a/src/callbacks/callback_print.cpp +++ b/src/callbacks/callback_print.cpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. 
// -// lbann_callback_print .hpp .cpp - Callback hooks to print information +// print .hpp .cpp - Callback hooks to print information //////////////////////////////////////////////////////////////////////////////// #include @@ -32,8 +32,9 @@ #include namespace lbann { +namespace callback { -void lbann_callback_print::setup(model *m) { +void print::setup(model *m) { #ifdef LBANN_VERSION lbann_comm *comm = m->get_comm(); if (comm->am_world_master()) { @@ -43,7 +44,7 @@ void lbann_callback_print::setup(model *m) { #endif } -void lbann_callback_print::on_epoch_begin(model *m) { +void print::on_epoch_begin(model *m) { lbann_comm *comm = m->get_comm(); if (comm->am_world_master()) { @@ -115,19 +116,19 @@ void lbann_callback_print::on_epoch_begin(model *m) { } } -void lbann_callback_print::on_epoch_end(model *m) { +void print::on_epoch_end(model *m) { report_results(m); } -void lbann_callback_print::on_validation_end(model *m) { +void print::on_validation_end(model *m) { report_results(m); } -void lbann_callback_print::on_test_end(model *m) { +void print::on_test_end(model *m) { report_results(m); } -void lbann_callback_print::report_results(model *m) { +void print::report_results(model *m) { lbann_comm *comm = m->get_comm(); // Get string for execution mode @@ -246,13 +247,14 @@ void lbann_callback_print::report_results(model *m) { } -std::unique_ptr -build_callback_print_from_pbuf( +std::unique_ptr +build_print_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.interval(), + return make_unique(params.interval(), params.print_global_stat_only()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_replace_weights.cpp b/src/callbacks/callback_replace_weights.cpp index 5aa820aae1b..cfe79ed5862 100644 --- a/src/callbacks/callback_replace_weights.cpp +++ b/src/callbacks/callback_replace_weights.cpp @@ -30,8 +30,9 @@ #include "callback_helpers.hpp" namespace lbann { +namespace callback { -void lbann_callback_replace_weights::setup(model *m) { +void replace_weights::setup(model *m) { auto const layers = m->get_layers(); m_src_layers = select_things_by_name(layers, m_src_layer_names); m_dst_layers = select_things_by_name(layers, m_dst_layer_names); @@ -41,7 +42,7 @@ void lbann_callback_replace_weights::setup(model *m) { std::vector().swap(m_dst_layer_names); } -void lbann_callback_replace_weights::on_batch_end(model *m) { +void replace_weights::on_batch_end(model *m) { const auto& step = m->get_step(execution_mode::training); if(step % m_batch_interval == 0) { for(size_t i = 0; i < m_src_layers.size(); i++) { @@ -50,15 +51,16 @@ void lbann_callback_replace_weights::on_batch_end(model *m) { } } -std::unique_ptr -build_callback_replace_weights_from_pbuf( +std::unique_ptr +build_replace_weights_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( parse_list(params.source_layers()), parse_list(params.destination_layers()), params.batch_interval()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_save_images.cpp b/src/callbacks/callback_save_images.cpp index 4d320f5c5b7..0aad9ec7889 100644 --- a/src/callbacks/callback_save_images.cpp +++ b/src/callbacks/callback_save_images.cpp @@ -34,6 +34,7 @@ #endif // LBANN_HAS_OPENCV namespace lbann { +namespace callback { namespace { @@ 
-130,10 +131,10 @@ void save_image(std::string prefix, } // namespace -lbann_callback_save_images::lbann_callback_save_images(std::vector layer_names, +save_images::save_images(std::vector layer_names, std::string image_format, std::string image_prefix) - : lbann_callback(), + : callback_base(), m_layer_names(std::move(layer_names)), m_image_format(image_format.empty() ? "jpg" : image_format), m_image_prefix(std::move(image_prefix)) { @@ -142,29 +143,30 @@ lbann_callback_save_images::lbann_callback_save_images(std::vector #endif // LBANN_HAS_OPENCV } -void lbann_callback_save_images::on_epoch_end(model *m) { +void save_images::on_epoch_end(model *m) { save_image(m_image_prefix + "epoch" + std::to_string(m->get_epoch()), m_image_format, m->get_layers(), m_layer_names); } -void lbann_callback_save_images::on_test_end(model *m) { +void save_images::on_test_end(model *m) { save_image(m_image_prefix + "test", m_image_format, m->get_layers(), m_layer_names); } -std::unique_ptr -build_callback_save_images_from_pbuf( +std::unique_ptr +build_save_images_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( parse_list<>(params.layers()), params.image_format(), params.image_prefix()); } +} // namespace callback } // namespace lbann diff --git a/src/callbacks/callback_save_model.cpp b/src/callbacks/callback_save_model.cpp index 5f2d917e883..045ae33da78 100644 --- a/src/callbacks/callback_save_model.cpp +++ b/src/callbacks/callback_save_model.cpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_save_model .hpp .cpp - Callbacks to save a models description and weights +// save_model .hpp .cpp - Callbacks to save a models description and weights //////////////////////////////////////////////////////////////////////////////// #include @@ -38,22 +38,23 @@ #include namespace lbann { +namespace callback { /// Save the model's prototext and weights -void lbann_callback_save_model::on_train_end(model *m) { +void save_model::on_train_end(model *m) { if(!m_disable_save_after_training){ - save_model(m); + do_save_model(m); } } -void lbann_callback_save_model::write_proto_binary(const lbann_data::Model& proto, +void save_model::write_proto_binary(const lbann_data::Model& proto, const std::string filename) { std::fstream output(filename.c_str(), std::ios::out | std::ios::trunc | std::ios::binary); proto.SerializeToOstream(&output); } -void lbann_callback_save_model::write_proto_text(const lbann_data::Model& proto, +void save_model::write_proto_text(const lbann_data::Model& proto, const std::string filename) { int fd = openwrite(filename.c_str()); auto output = new google::protobuf::io::FileOutputStream(fd); @@ -62,11 +63,11 @@ void lbann_callback_save_model::write_proto_text(const lbann_data::Model& proto, close(fd); } -bool lbann_callback_save_model::save_model(model *m) { +bool save_model::do_save_model(model *m) { lbann_data::Model model_param; p.set_cb_type(callback_type::inference); - save_model_weights(m); + do_save_model_weights(m); p.set_cb_type(callback_type::invalid); #if 0 /// @todo BVE FIXME this method for writing out the prototext does not seem to work @@ -82,7 +83,7 @@ bool lbann_callback_save_model::save_model(model *m) { } // Save model weights -bool lbann_callback_save_model::save_model_weights(model *m) { +bool save_model::do_save_model_weights(model *m) { // if the checkpoint 
directory is not defined, bail if (m_dir.length() == 0) { return false; @@ -133,7 +134,7 @@ bool lbann_callback_save_model::save_model_weights(model *m) { return true; } -bool lbann_callback_save_model::load_model_weights(std::string ckpt_dir, model * m, bool ckptdir_is_fullpath) { +bool save_model::load_model_weights(std::string ckpt_dir, model * m, bool ckptdir_is_fullpath) { std::vector weight_list = std::vector(); std::string active_ckpt_dir; if(ckptdir_is_fullpath) { @@ -174,23 +175,23 @@ bool lbann_callback_save_model::load_model_weights(std::string ckpt_dir, model * return true; } -std::unique_ptr -build_callback_save_model_from_pbuf( +std::unique_ptr +build_save_model_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); if(params.extension().size() != 0) { - return make_unique( + return make_unique( params.dir(), params.disable_save_after_training(), params.extension()); } else { - return make_unique( + return make_unique( params.dir(), params.disable_save_after_training()); } } - -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_save_topk_models.cpp b/src/callbacks/callback_save_topk_models.cpp index 9f3f0ed21a0..ba76cad1c27 100644 --- a/src/callbacks/callback_save_topk_models.cpp +++ b/src/callbacks/callback_save_topk_models.cpp @@ -23,23 +23,24 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_save_topk_models .hpp .cpp - Callback hooks to save_topk_models information +// save_topk_models .hpp .cpp - Callback hooks to save_topk_models information //////////////////////////////////////////////////////////////////////////////// #include #include "lbann/callbacks/callback_save_topk_models.hpp" namespace lbann { -void lbann_callback_save_topk_models::on_test_end(model *m) { +namespace callback { +void save_topk_models::on_test_end(model *m) { bool in_topk = false; if(m->get_comm()->am_trainer_master()) { in_topk = am_in_topk(m); } m->get_comm()->trainer_broadcast(0, in_topk); - if(in_topk) save_model(m); + if(in_topk) do_save_model(m); } -bool lbann_callback_save_topk_models::am_in_topk(model *m) { +bool save_topk_models::am_in_topk(model *m) { lbann_comm *comm = m->get_comm(); const int num_trainers = comm->get_num_trainers(); std::string mode_string = "test"; @@ -88,16 +89,17 @@ bool lbann_callback_save_topk_models::am_in_topk(model *m) { return false; } -std::unique_ptr -build_callback_save_topk_models_from_pbuf( +std::unique_ptr +build_save_topk_models_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( params.dir(), params.k(), params.metric(), params.ascending_ordering()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_summary.cpp b/src/callbacks/callback_summary.cpp index 6e75c93b471..4552b0180c4 100644 --- a/src/callbacks/callback_summary.cpp +++ b/src/callbacks/callback_summary.cpp @@ -23,29 +23,30 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. 
// -// lbann_callback_summary .hpp .cpp - Callback hooks to summarize to Tensorboard +// summary .hpp .cpp - Callback hooks to summarize to Tensorboard //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_summary.hpp" #include "lbann/utils/profiling.hpp" namespace lbann { +namespace callback { -lbann_callback_summary::lbann_callback_summary(lbann_summary *summarizer, +summary::summary(lbann_summary *summarizer, int batch_interval, int mat_interval) : - lbann_callback(batch_interval, summarizer), + callback_base(batch_interval, summarizer), m_mat_interval(mat_interval) {} -lbann_callback_summary::~lbann_callback_summary() { +summary::~summary() { delete m_summarizer; } -void lbann_callback_summary::on_train_begin(model *m) { +void summary::on_train_begin(model *m) { save_histograms(m); } -void lbann_callback_summary::on_batch_end(model *m) { +void summary::on_batch_end(model *m) { prof_region_begin("summary-batch", prof_colors[0], false); m->summarize_stats(*m_summarizer); if (m_mat_interval > 0 && m->get_step(execution_mode::training) % m_mat_interval == 0) { @@ -70,7 +71,7 @@ void lbann_callback_summary::on_batch_end(model *m) { prof_region_end("summary-batch", false); } -void lbann_callback_summary::on_epoch_end(model *m) { +void summary::on_epoch_end(model *m) { prof_region_begin("summary-epoch", prof_colors[0], false); for (const auto& met : m->get_metrics()) { EvalType train_score = met->get_mean_value(m->get_execution_mode()); @@ -86,7 +87,7 @@ void lbann_callback_summary::on_epoch_end(model *m) { prof_region_end("summary-epoch", false); } -void lbann_callback_summary::on_test_end(model *m) { +void summary::on_test_end(model *m) { prof_region_begin("summary-test", prof_colors[0], false); lbann_comm *comm = m->get_comm(); for (auto&& met : m->get_metrics()) { @@ -106,7 +107,7 @@ void lbann_callback_summary::on_test_end(model *m) { prof_region_end("summary-test", false); } -void lbann_callback_summary::save_histograms(model *m) { +void summary::save_histograms(model *m) { for (const auto& layer : m->get_layers()) { const std::string prefix = layer->get_name() + "/"; for (int i = 0; i < layer->get_num_children(); ++i) { @@ -132,15 +133,16 @@ void lbann_callback_summary::save_histograms(model *m) { } } -std::unique_ptr -build_callback_summary_from_pbuf( +std::unique_ptr +build_summary_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary* summarizer) { const auto& params = dynamic_cast(proto_msg); - return make_unique(summarizer, + return make_unique(summarizer, params.batch_interval(), params.mat_interval()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_sync_layers.cpp b/src/callbacks/callback_sync_layers.cpp index 6a7c674602d..230afd6d790 100644 --- a/src/callbacks/callback_sync_layers.cpp +++ b/src/callbacks/callback_sync_layers.cpp @@ -31,8 +31,9 @@ #include "lbann/utils/timer.hpp" namespace lbann { +namespace callback { -void lbann_callback_sync_layers::on_forward_prop_end(model *m, Layer *l) { +void sync_layers::on_forward_prop_end(model *m, Layer *l) { if (m_only_input && dynamic_cast(l) == nullptr) { return; // Skip non-input layers. 
} @@ -41,7 +42,7 @@ void lbann_callback_sync_layers::on_forward_prop_end(model *m, Layer *l) { l->m_fp_time += get_time() - start; } -void lbann_callback_sync_layers::on_backward_prop_end(model *m, Layer *l) { +void sync_layers::on_backward_prop_end(model *m, Layer *l) { if (m_only_input) { return; } @@ -50,7 +51,7 @@ void lbann_callback_sync_layers::on_backward_prop_end(model *m, Layer *l) { l->m_bp_time += get_time() - start; } -void lbann_callback_sync_layers::do_sync(Layer *l) { +void sync_layers::do_sync(Layer *l) { #ifdef LBANN_HAS_CUDNN if (m_sync_gpus) { El::GPUManager::SynchronizeDevice(); @@ -61,14 +62,15 @@ void lbann_callback_sync_layers::do_sync(Layer *l) { } } -std::unique_ptr -build_callback_sync_layers_from_pbuf( +std::unique_ptr +build_sync_layers_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.sync_gpus(), + return make_unique(params.sync_gpus(), params.sync_mpi(), params.only_input()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_sync_selected.cpp b/src/callbacks/callback_sync_selected.cpp index d16530d62ef..771a0cb165d 100644 --- a/src/callbacks/callback_sync_selected.cpp +++ b/src/callbacks/callback_sync_selected.cpp @@ -35,17 +35,18 @@ #endif // LBANN_NVPROF namespace lbann { - -bool lbann_callback_sync_selected::m_cuda_profiler_initialized = false; -const std::map - lbann_callback_sync_selected::m_prop_str - = {std::make_pair(lbann_callback_sync_selected::prop_t::Both, "Both"), - std::make_pair(lbann_callback_sync_selected::prop_t::Forward, "Forward"), - std::make_pair(lbann_callback_sync_selected::prop_t::Backward, "Backward")}; - -lbann_callback_sync_selected::lbann_callback_sync_selected( - const lbann_callback_sync_selected::layers_t& layers, bool async_gpus, bool async_mpi) - : lbann_callback_sync_layers(!async_gpus, !async_mpi, false), +namespace callback { + +bool sync_selected::m_cuda_profiler_initialized = false; +const std::map + sync_selected::m_prop_str + = {std::make_pair(sync_selected::prop_t::Both, "Both"), + std::make_pair(sync_selected::prop_t::Forward, "Forward"), + std::make_pair(sync_selected::prop_t::Backward, "Backward")}; + +sync_selected::sync_selected( + const sync_selected::layers_t& layers, bool async_gpus, bool async_mpi) + : sync_layers(!async_gpus, !async_mpi, false), m_layers(layers), m_all_set(false) { #ifdef LBANN_NVPROF cudaProfilerStop(); // make sure to flush out profile data @@ -64,13 +65,13 @@ lbann_callback_sync_selected::lbann_callback_sync_selected( m_bwd_ptrs.reserve(cnt_bwd); } -lbann_callback_sync_selected::~lbann_callback_sync_selected() { +sync_selected::~sync_selected() { #ifdef LBANN_NVPROF cudaProfilerStop(); // make sure to flush out profile data #endif } -std::string lbann_callback_sync_selected::get_description() const { +std::string sync_selected::get_description() const { std::string selection; for (const auto& l: m_layers) { std::map::const_iterator it = m_prop_str.find(l.second); @@ -79,11 +80,11 @@ std::string lbann_callback_sync_selected::get_description() const { return "sync_selected : { " + selection + '}'; } -void lbann_callback_sync_selected::turn_off_init_cuda_profiler() { +void sync_selected::turn_off_init_cuda_profiler() { m_cuda_profiler_initialized = true; } -bool lbann_callback_sync_selected::check_if_cuda_profiler_initialized() { +bool sync_selected::check_if_cuda_profiler_initialized() { return m_cuda_profiler_initialized; } @@ 
-98,7 +99,7 @@ bool lbann_callback_sync_selected::check_if_cuda_profiler_initialized() { * @param comm global world communicator. * The profile output will be wrttien to out_dir/layer_name.prop.rank.prof */ -void lbann_callback_sync_selected::init_cuda_profiler( +void sync_selected::init_cuda_profiler( const std::string cfg_file, const std::string out_dir, int out_mode, lbann_comm* comm) const { #ifdef LBANN_NVPROF if (check_if_cuda_profiler_initialized()) { @@ -151,7 +152,7 @@ void lbann_callback_sync_selected::init_cuda_profiler( #endif } -void lbann_callback_sync_selected::setup(model *m) { +void sync_selected::setup(model *m) { const std::vector& layers = m->get_layers(); for (auto l: layers) { populate_layer_ptrs(l, Forward); @@ -163,7 +164,7 @@ void lbann_callback_sync_selected::setup(model *m) { } -void lbann_callback_sync_selected::on_forward_prop_begin(model *m, Layer *l) { +void sync_selected::on_forward_prop_begin(model *m, Layer *l) { const layer_ptrs_t::const_iterator it = m_fwd_ptrs.find(l); if (it == m_fwd_ptrs.cend()) { @@ -175,7 +176,7 @@ void lbann_callback_sync_selected::on_forward_prop_begin(model *m, Layer *l) { do_pre_sync(l); } -void lbann_callback_sync_selected::on_forward_prop_end(model *m, Layer *l) { +void sync_selected::on_forward_prop_end(model *m, Layer *l) { const layer_ptrs_t::const_iterator it = m_fwd_ptrs.find(l); if (it == m_fwd_ptrs.cend()) { return; @@ -185,7 +186,7 @@ void lbann_callback_sync_selected::on_forward_prop_end(model *m, Layer *l) { l->m_fp_time += get_time() - start; } -void lbann_callback_sync_selected::on_backward_prop_begin(model *m, Layer *l) { +void sync_selected::on_backward_prop_begin(model *m, Layer *l) { const layer_ptrs_t::const_iterator it = m_bwd_ptrs.find(l); if (it == m_bwd_ptrs.cend()) { @@ -194,7 +195,7 @@ void lbann_callback_sync_selected::on_backward_prop_begin(model *m, Layer *l) { do_pre_sync(l); } -void lbann_callback_sync_selected::on_backward_prop_end(model *m, Layer *l) { +void sync_selected::on_backward_prop_end(model *m, Layer *l) { const layer_ptrs_t::const_iterator it = m_bwd_ptrs.find(l); if (it == m_bwd_ptrs.cend()) { return; @@ -204,7 +205,7 @@ void lbann_callback_sync_selected::on_backward_prop_end(model *m, Layer *l) { l->m_bp_time += get_time() - start; } -bool lbann_callback_sync_selected::check_if_all_accounted_for() const { +bool sync_selected::check_if_all_accounted_for() const { return (m_fwd_ptrs.size() + m_bwd_ptrs.size() == m_layers.size() + m_both_ptrs.size()); } @@ -214,9 +215,9 @@ bool lbann_callback_sync_selected::check_if_all_accounted_for() const { * to match. When the first time the match is found, save the pointer of the * selected layer and use it for the subsequent matching instead of name. */ -lbann_callback_sync_selected::layer_ptrs_t::iterator -lbann_callback_sync_selected::populate_layer_ptrs( - Layer* l, const lbann_callback_sync_selected::prop_t current_prop) { +sync_selected::layer_ptrs_t::iterator +sync_selected::populate_layer_ptrs( + Layer* l, const sync_selected::prop_t current_prop) { std::pair ret = std::make_pair(((current_prop == Forward)? 
m_fwd_ptrs.end() : m_bwd_ptrs.end()), false); @@ -253,14 +254,14 @@ lbann_callback_sync_selected::populate_layer_ptrs( } -void lbann_callback_sync_selected::do_pre_sync(Layer *l) { - lbann_callback_sync_layers::do_sync(l); +void sync_selected::do_pre_sync(Layer *l) { + sync_layers::do_sync(l); #ifdef LBANN_NVPROF cudaProfilerStart(); #endif } -void lbann_callback_sync_selected::do_sync(Layer *l) { +void sync_selected::do_sync(Layer *l) { #ifdef LBANN_NVPROF //(also deinfed LBANN_HAS_GPU) if (m_sync_gpus) { El::GPUManager::SynchronizeDevice(); @@ -273,12 +274,12 @@ void lbann_callback_sync_selected::do_sync(Layer *l) { cudaProfilerStop(); } #else - lbann_callback_sync_layers::do_sync(l); + sync_layers::do_sync(l); #endif } -std::unique_ptr -build_callback_sync_selected_from_pbuf( +std::unique_ptr +build_sync_selected_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); @@ -288,8 +289,8 @@ build_callback_sync_selected_from_pbuf( "to synchronize."); } - using layers_t = lbann_callback_sync_selected::layers_t; - using prop_t = lbann_callback_sync_selected::prop_t; + using layers_t = sync_selected::layers_t; + using prop_t = sync_selected::prop_t; layers_t selected_layers; selected_layers.reserve(num_layers); @@ -301,14 +302,14 @@ build_callback_sync_selected_from_pbuf( } auto cb_ptr - = make_unique(selected_layers, + = make_unique(selected_layers, params.async_gpus(), params.async_mpi()); #ifdef LBANN_NVPROF const auto& cp_setup = params.cuda_profiler_setup(); if (cp_setup.no_init()) { - lbann_callback_sync_selected::turn_off_init_cuda_profiler(); + sync_selected::turn_off_init_cuda_profiler(); } else { cb_ptr->init_cuda_profiler(cp_setup.config_file(), cp_setup.output_dir(), @@ -319,4 +320,5 @@ build_callback_sync_selected_from_pbuf( return cb_ptr; } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_timeline.cpp b/src/callbacks/callback_timeline.cpp index b1eb8919a59..12550b84923 100644 --- a/src/callbacks/callback_timeline.cpp +++ b/src/callbacks/callback_timeline.cpp @@ -31,8 +31,9 @@ #include "lbann/utils/timer.hpp" namespace lbann { +namespace callback { -void lbann_callback_timeline::on_train_begin(model *m) { +void timeline::on_train_begin(model *m) { // Set up layers and weights. for (const auto& l : m->get_layers()) { m_fp_times.emplace(l->get_name(), std::vector>()); @@ -46,7 +47,7 @@ void lbann_callback_timeline::on_train_begin(model *m) { m_start_time = get_time(); } -void lbann_callback_timeline::on_train_end(model *m) { +void timeline::on_train_end(model *m) { const std::string path = m_outdir + "/timeline.m" + std::to_string(m->get_comm()->get_trainer_rank()) + "." 
+ std::to_string(m->get_comm()->get_rank_in_trainer()) + ".txt"; @@ -71,39 +72,40 @@ void lbann_callback_timeline::on_train_end(model *m) { } } -void lbann_callback_timeline::on_forward_prop_begin(model *m, Layer *l) { +void timeline::on_forward_prop_begin(model *m, Layer *l) { m_fp_start_time = get_rel_time(); } -void lbann_callback_timeline::on_forward_prop_end(model *m, Layer *l) { +void timeline::on_forward_prop_end(model *m, Layer *l) { EvalType end = get_rel_time(); m_fp_times[l->get_name()].emplace_back(m_fp_start_time, end); } -void lbann_callback_timeline::on_backward_prop_begin(model *m, Layer *l) { +void timeline::on_backward_prop_begin(model *m, Layer *l) { m_bp_start_time = get_rel_time(); } -void lbann_callback_timeline::on_backward_prop_end(model *m, Layer *l) { +void timeline::on_backward_prop_end(model *m, Layer *l) { EvalType end = get_rel_time(); m_bp_times[l->get_name()].emplace_back(m_bp_start_time, end); } -void lbann_callback_timeline::on_optimize_begin(model *m, weights *w) { +void timeline::on_optimize_begin(model *m, weights *w) { m_opt_start_time = get_rel_time(); } -void lbann_callback_timeline::on_optimize_end(model *m, weights *w) { +void timeline::on_optimize_end(model *m, weights *w) { EvalType end = get_rel_time(); m_opt_times[w->get_name()].emplace_back(m_opt_start_time, end); } -std::unique_ptr -build_callback_timeline_from_pbuf( +std::unique_ptr +build_timeline_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.directory()); + return make_unique(params.directory()); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_timer.cpp b/src/callbacks/callback_timer.cpp index c12ed0b39cd..500e171e34d 100644 --- a/src/callbacks/callback_timer.cpp +++ b/src/callbacks/callback_timer.cpp @@ -29,13 +29,14 @@ #include namespace lbann { +namespace callback { -void lbann_callback_timer::batch_timing_begin(const model& m) { +void timer::batch_timing_begin(const model& m) { const auto& mode = m.get_execution_mode(); m_batch_start_times[mode] = get_time(); } -void lbann_callback_timer::batch_timing_end(const model& m) { +void timer::batch_timing_end(const model& m) { const auto& mode = m.get_execution_mode(); const auto& batch_time = get_time() - m_batch_start_times[mode]; m_batch_times[mode].push_back(batch_time); @@ -45,13 +46,13 @@ void lbann_callback_timer::batch_timing_end(const model& m) { } } -void lbann_callback_timer::timing_begin(const model& m) { +void timer::timing_begin(const model& m) { const auto& mode = m.get_execution_mode(); m_start_times[mode] = get_time(); m_batch_times[mode].clear(); } -void lbann_callback_timer::timing_end(model& m) { +void timer::timing_end(model& m) { constexpr EvalType zero = 0; // Get run time @@ -167,10 +168,11 @@ void lbann_callback_timer::timing_end(model& m) { } -std::unique_ptr -build_callback_timer_from_pbuf( +std::unique_ptr +build_timer_callback_from_pbuf( const google::protobuf::Message&, lbann_summary* summarizer) { - return make_unique(summarizer); + return make_unique(summarizer); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/callback_variable_minibatch.cpp b/src/callbacks/callback_variable_minibatch.cpp index 875aa4a071f..81e3f816bcc 100644 --- a/src/callbacks/callback_variable_minibatch.cpp +++ b/src/callbacks/callback_variable_minibatch.cpp @@ -32,12 +32,13 @@ #include "lbann/layers/io/input/input_layer.hpp" 
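// ---------------------------------------------------------------------------
// Aside: the timer callback above reduces to a simple stopwatch pattern --
// stamp get_time() when a batch (or mode) starts, record the elapsed seconds
// when it ends, and summarize in timing_end(). A self-contained sketch of
// that pattern using std::chrono in place of LBANN's get_time(); the struct
// and member names here are hypothetical, not LBANN API:
#include <chrono>
#include <vector>
struct batch_stopwatch {
  using clock = std::chrono::steady_clock;
  clock::time_point m_start;
  std::vector<double> m_times; // elapsed seconds, one entry per batch
  void batch_begin() { m_start = clock::now(); }
  void batch_end() {
    m_times.push_back(
        std::chrono::duration<double>(clock::now() - m_start).count());
  }
};
// The real callback keeps one such record per execution_mode (train,
// validate, test), which is why m_batch_start_times and m_batch_times are
// keyed by mode.
// ---------------------------------------------------------------------------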
namespace lbann { +namespace callback { -lbann_callback_variable_minibatch::lbann_callback_variable_minibatch( +variable_minibatch::variable_minibatch( int starting_mbsize) : m_starting_mbsize(starting_mbsize), m_current_mini_batch_size(starting_mbsize) {} -void lbann_callback_variable_minibatch::on_train_begin(model *m) { +void variable_minibatch::on_train_begin(model *m) { // Avoid issues with the train method being called multiple times. if (m->get_epoch() != 0) { return; } @@ -63,7 +64,7 @@ void lbann_callback_variable_minibatch::on_train_begin(model *m) { m_starting_mbsize); } -void lbann_callback_variable_minibatch::on_epoch_end(model *m) { +void variable_minibatch::on_epoch_end(model *m) { // Get first input layer in model generic_input_layer* input = nullptr; @@ -124,7 +125,7 @@ void lbann_callback_variable_minibatch::on_epoch_end(model *m) { } } -void lbann_callback_variable_minibatch::change_learning_rate( +void variable_minibatch::change_learning_rate( model *m, float new_lr) const { for (weights *w : m->get_weights()) { optimizer *opt = w->get_optimizer(); @@ -134,7 +135,7 @@ void lbann_callback_variable_minibatch::change_learning_rate( } } -float lbann_callback_variable_minibatch::get_current_learning_rate( +float variable_minibatch::get_current_learning_rate( model *m) const { for (weights *w : m->get_weights()) { optimizer *opt = w->get_optimizer(); @@ -145,12 +146,12 @@ float lbann_callback_variable_minibatch::get_current_learning_rate( return 0.0f; } -lbann_callback_step_minibatch::lbann_callback_step_minibatch( +step_minibatch::step_minibatch( int starting_mbsize, int step, int ramp_time) : - lbann_callback_variable_minibatch(starting_mbsize), m_step(step), + variable_minibatch(starting_mbsize), m_step(step), m_ramp_time(ramp_time) {} -bool lbann_callback_step_minibatch::schedule( +bool step_minibatch::schedule( model *m, int& new_mbsize, float& new_lr, int& ramp_time) { if (m->get_epoch() % m_step == 0) { new_mbsize = m_current_mini_batch_size * 2; @@ -162,16 +163,16 @@ bool lbann_callback_step_minibatch::schedule( } } -lbann_callback_minibatch_schedule::lbann_callback_minibatch_schedule( +minibatch_schedule::minibatch_schedule( int starting_mbsize, std::vector steps) : - lbann_callback_variable_minibatch(starting_mbsize), m_steps(std::move(steps)) { + variable_minibatch(starting_mbsize), m_steps(std::move(steps)) { std::sort(m_steps.rbegin(), m_steps.rend(), [] (const minibatch_step& a, const minibatch_step& b) { return a.epoch < b.epoch; }); } -bool lbann_callback_minibatch_schedule::schedule( +bool minibatch_schedule::schedule( model *m, int& new_mbsize, float& new_lr, int& ramp_time) { if (!m_steps.empty() && m->get_epoch() == m_steps.back().epoch) { new_mbsize = m_steps.back().mbsize; @@ -183,22 +184,22 @@ bool lbann_callback_minibatch_schedule::schedule( return false; } -std::unique_ptr -build_callback_step_minibatch_from_pbuf( +std::unique_ptr +build_step_minibatch_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.starting_mbsize(), + return make_unique(params.starting_mbsize(), params.step(), params.ramp_time()); } -std::unique_ptr -build_callback_minibatch_schedule_from_pbuf( +std::unique_ptr +build_minibatch_schedule_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - std::vector steps; + std::vector steps; for (int i = 0; i < params.step_size(); ++i) { const auto& proto_step = 
params.step(i); steps.emplace_back(proto_step.epoch(), @@ -206,8 +207,9 @@ build_callback_minibatch_schedule_from_pbuf( proto_step.lr(), proto_step.ramp_time()); } - return make_unique(params.starting_mbsize(), + return make_unique(params.starting_mbsize(), steps); } -} // namespace lbann +} // namespace callback +} // namespace lbann diff --git a/src/callbacks/profiler.cpp b/src/callbacks/profiler.cpp index 76e2b93d1c9..6b5a20d25b6 100644 --- a/src/callbacks/profiler.cpp +++ b/src/callbacks/profiler.cpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// lbann_callback_timer .hpp .cpp - Callback hooks to time training +// timer .hpp .cpp - Callback hooks to time training /////////////////////////////////////////////////////////////////////////////// #include @@ -37,9 +37,10 @@ #endif namespace lbann { +namespace callback { -lbann_callback_profiler::lbann_callback_profiler(bool sync, bool skip_init) : - lbann_callback(), m_sync(sync), m_skip_init(skip_init) { +profiler::profiler(bool sync, bool skip_init) : + callback_base(), m_sync(sync), m_skip_init(skip_init) { #ifdef LBANN_NVPROF nvtxNameCudaStreamA(El::GPUManager::Stream(), "Hydrogen"); #endif @@ -48,7 +49,7 @@ lbann_callback_profiler::lbann_callback_profiler(bool sync, bool skip_init) : } } -void lbann_callback_profiler::on_epoch_begin(model *m) { +void profiler::on_epoch_begin(model *m) { // Skip the first epoch if (m_skip_init && m->get_epoch() == 1) { prof_start(); @@ -57,84 +58,84 @@ void lbann_callback_profiler::on_epoch_begin(model *m) { prof_colors[0], m_sync); } -void lbann_callback_profiler::on_epoch_end(model *m) { +void profiler::on_epoch_end(model *m) { prof_region_end(("epoch " + std::to_string(m->get_epoch())).c_str(), m_sync); } -void lbann_callback_profiler::on_validation_begin(model *m) { +void profiler::on_validation_begin(model *m) { prof_region_begin(("val " + std::to_string(m->get_epoch())).c_str(), prof_colors[0], m_sync); } -void lbann_callback_profiler::on_validation_end(model *m) { +void profiler::on_validation_end(model *m) { prof_region_end(("val " + std::to_string(m->get_epoch())).c_str(), m_sync); } -void lbann_callback_profiler::on_test_begin(model *m) { +void profiler::on_test_begin(model *m) { prof_region_begin(("test " + std::to_string(m->get_epoch())).c_str(), prof_colors[0], m_sync); } -void lbann_callback_profiler::on_test_end(model *m) { +void profiler::on_test_end(model *m) { prof_region_end(("test " + std::to_string(m->get_epoch())).c_str(), m_sync); } -void lbann_callback_profiler::on_batch_begin(model *m) { +void profiler::on_batch_begin(model *m) { prof_region_begin(("batch " + std::to_string(m->get_step(execution_mode::training))).c_str(), prof_colors[1], m_sync); } -void lbann_callback_profiler::on_batch_end(model *m) { +void profiler::on_batch_end(model *m) { prof_region_end(("batch " + std::to_string(m->get_step(execution_mode::training))).c_str(), m_sync); } -void lbann_callback_profiler::on_batch_evaluate_begin(model *m) { +void profiler::on_batch_evaluate_begin(model *m) { prof_region_begin(("batch eval " + std::to_string(m->get_step(execution_mode::training))).c_str(), prof_colors[1], m_sync); } -void lbann_callback_profiler::on_batch_evaluate_end(model *m) { +void profiler::on_batch_evaluate_end(model *m) { prof_region_end(("batch eval " + std::to_string(m->get_step(execution_mode::training))).c_str(), m_sync); } -void lbann_callback_profiler::on_forward_prop_begin(model *m) { +void 
profiler::on_forward_prop_begin(model *m) { prof_region_begin("forward", prof_colors[2], m_sync); } -void lbann_callback_profiler::on_forward_prop_end(model *m) { +void profiler::on_forward_prop_end(model *m) { prof_region_end("forward", m_sync); } -void lbann_callback_profiler::on_evaluate_forward_prop_begin(model *m) { +void profiler::on_evaluate_forward_prop_begin(model *m) { prof_region_begin("forward", prof_colors[2], m_sync); } -void lbann_callback_profiler::on_evaluate_forward_prop_end(model *m) { +void profiler::on_evaluate_forward_prop_end(model *m) { prof_region_end("forward", m_sync); } -void lbann_callback_profiler::on_backward_prop_begin(model *m) { +void profiler::on_backward_prop_begin(model *m) { prof_region_begin("backward", prof_colors[3], m_sync); } -void lbann_callback_profiler::on_backward_prop_end(model *m) { +void profiler::on_backward_prop_end(model *m) { prof_region_end("backward", m_sync); } -void lbann_callback_profiler::on_optimize_begin(model *m) { +void profiler::on_optimize_begin(model *m) { prof_region_begin("optimize", prof_colors[4], m_sync); } -void lbann_callback_profiler::on_optimize_end(model *m) { +void profiler::on_optimize_end(model *m) { prof_region_end("optimize", m_sync); } -int lbann_callback_profiler::get_color(Layer *l) { +int profiler::get_color(Layer *l) { const std::string &lname = l->get_type(); int idx = 5; if (lname == "fully connected") { @@ -161,45 +162,46 @@ int lbann_callback_profiler::get_color(Layer *l) { return prof_colors[idx % num_prof_colors]; } -void lbann_callback_profiler::on_forward_prop_begin(model *m, Layer *l) { +void profiler::on_forward_prop_begin(model *m, Layer *l) { prof_region_begin(("fw " + l->get_name()).c_str(), get_color(l), m_sync); } -void lbann_callback_profiler::on_forward_prop_end(model *m, Layer *l) { +void profiler::on_forward_prop_end(model *m, Layer *l) { prof_region_end(("fw " + l->get_name()).c_str(), m_sync); } -void lbann_callback_profiler::on_evaluate_forward_prop_begin(model *m, Layer *l) { +void profiler::on_evaluate_forward_prop_begin(model *m, Layer *l) { prof_region_begin(("fw " + l->get_name()).c_str(), get_color(l), m_sync); } -void lbann_callback_profiler::on_evaluate_forward_prop_end(model *m, Layer *l) { +void profiler::on_evaluate_forward_prop_end(model *m, Layer *l) { prof_region_end(("fw " + l->get_name()).c_str(), m_sync); } -void lbann_callback_profiler::on_backward_prop_begin(model *m, Layer *l) { +void profiler::on_backward_prop_begin(model *m, Layer *l) { prof_region_begin(("bw " + l->get_name()).c_str(), get_color(l), m_sync); } -void lbann_callback_profiler::on_backward_prop_end(model *m, Layer *l) { +void profiler::on_backward_prop_end(model *m, Layer *l) { prof_region_end(("bw " + l->get_name()).c_str(), m_sync); } -void lbann_callback_profiler::on_optimize_begin(model *m, weights *w) { +void profiler::on_optimize_begin(model *m, weights *w) { prof_region_begin(("opt " + w->get_name()).c_str(), prof_colors[5], m_sync); } -void lbann_callback_profiler::on_optimize_end(model *m, weights *w) { +void profiler::on_optimize_end(model *m, weights *w) { prof_region_end(("opt " + w->get_name()).c_str(), m_sync); } -std::unique_ptr -build_callback_profiler_from_pbuf( +std::unique_ptr +build_profiler_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.sync(), + return make_unique(params.sync(), params.skip_init()); } -} // namespace lbann +} // namespace callback +} // namespace lbann 
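// ---------------------------------------------------------------------------
// Aside: each profiler hook above brackets a phase between
// prof_region_begin(name, color, sync) and prof_region_end(name, sync) with
// identical labels. Where such pairing must survive early returns or
// exceptions, an RAII guard is the usual C++ idiom. Sketch only --
// `region_guard` is not part of LBANN; it assumes nothing beyond the two
// prof_region_* functions used throughout the diffs above, declared in
// "lbann/utils/profiling.hpp":
#include <string>
#include "lbann/utils/profiling.hpp"
namespace lbann {
class region_guard {
public:
  region_guard(std::string name, int color, bool sync)
    : m_name(std::move(name)), m_sync(sync) {
    prof_region_begin(m_name.c_str(), color, m_sync);
  }
  ~region_guard() { prof_region_end(m_name.c_str(), m_sync); }
  region_guard(const region_guard&) = delete;            // one region per
  region_guard& operator=(const region_guard&) = delete; // guard instance
private:
  std::string m_name;
  bool m_sync;
};
} // namespace lbann
// Usage: { region_guard fw("forward", prof_colors[2], m_sync); /* work */ }
// ---------------------------------------------------------------------------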
diff --git a/src/models/model.cpp b/src/models/model.cpp index 7a377b6da24..75610c7057f 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -422,7 +422,7 @@ void model::add_weights(weights* w) { } -void model::add_callback(lbann_callback *cb) { +void model::add_callback(callback_base *cb) { if (cb == nullptr) { throw lbann_exception("model: Attempted to add null pointer as a callback."); } @@ -1728,12 +1728,11 @@ bool model::reload_weights(const std::string latest, const std::vector(c); - if(cb != nullptr) { - return cb->save_model(this); + if (auto *cb = dynamic_cast(c)) { + return cb->do_save_model(this); } } - if(m_comm->am_trainer_master()) { + if (m_comm->am_trainer_master()) { LBANN_WARNING("save_model was called, but the callback_save_model was not loaded"); } return false; diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 9d6b24d5f72..493f2304ea2 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -81,103 +81,104 @@ namespace { // Define the factory type. using factory_type = lbann::generic_factory< - lbann_callback, + lbann::callback_base, std::string, - generate_builder_type, default_key_error_policy>; void register_default_builders(factory_type& factory) { + using namespace callback; factory.register_builder("CallbackAdaptiveLearningRate", - build_callback_adaptive_learning_rate_from_pbuf); + build_adaptive_learning_rate_callback_from_pbuf); factory.register_builder("CallbackCheckDataset", - build_callback_check_dataset_from_pbuf); + build_check_dataset_callback_from_pbuf); factory.register_builder("CallbackCheckGradients", - build_callback_check_gradients_from_pbuf); + build_check_gradients_callback_from_pbuf); factory.register_builder("CallbackCheckInit", - build_callback_check_init_from_pbuf); + build_check_init_callback_from_pbuf); factory.register_builder("CallbackCheckMetric", - build_callback_check_metric_from_pbuf); + build_check_metric_callback_from_pbuf); factory.register_builder("CallbackCheckNaN", - build_callback_check_nan_from_pbuf); + build_check_nan_callback_from_pbuf); factory.register_builder("CallbackCheckpoint", - build_callback_checkpoint_from_pbuf); + build_checkpoint_callback_from_pbuf); factory.register_builder("CallbackCheckSmall", - build_callback_check_small_from_pbuf); + build_check_small_callback_from_pbuf); factory.register_builder("CallbackConfusionMatrix", - build_callback_confusion_matrix_from_pbuf); + build_confusion_matrix_callback_from_pbuf); factory.register_builder("CallbackDebug", - build_callback_debug_from_pbuf); + build_debug_callback_from_pbuf); factory.register_builder("CallbackDebugIO", - build_callback_debug_io_from_pbuf); + build_debug_io_callback_from_pbuf); factory.register_builder("CallbackDispIOStats", - build_callback_disp_io_stats_from_pbuf); + build_disp_io_stats_callback_from_pbuf); factory.register_builder("CallbackDropFixedLearningRate", - build_callback_drop_fixed_learning_rate_from_pbuf); + build_drop_fixed_learning_rate_callback_from_pbuf); factory.register_builder("CallbackDumpErrorSignals", - build_callback_dump_error_signals_from_pbuf); + build_dump_error_signals_callback_from_pbuf); factory.register_builder("CallbackDumpGradients", - build_callback_dump_gradients_from_pbuf); + build_dump_gradients_callback_from_pbuf); factory.register_builder("CallbackDumpMBIndices", - build_callback_dump_mb_indices_from_pbuf); + build_dump_mb_indices_callback_from_pbuf); 
   factory.register_builder("CallbackDumpOutputs",
-                           build_callback_dump_outputs_from_pbuf);
+                           build_dump_outputs_callback_from_pbuf);
   factory.register_builder("CallbackDumpWeights",
-                           build_callback_dump_weights_from_pbuf);
+                           build_dump_weights_callback_from_pbuf);
   factory.register_builder("CallbackEarlyStopping",
-                           build_callback_early_stopping_from_pbuf);
+                           build_early_stopping_callback_from_pbuf);
   factory.register_builder("CallbackGPUMemoryUsage",
-                           build_callback_gpu_memory_usage_from_pbuf);
+                           build_gpu_memory_usage_callback_from_pbuf);
   factory.register_builder("CallbackHang",
-                           build_callback_hang_from_pbuf);
+                           build_hang_callback_from_pbuf);
   factory.register_builder("CallbackImComm",
-                           build_callback_imcomm_from_pbuf);
+                           build_imcomm_callback_from_pbuf);
   factory.register_builder(
     "CallbackLinearGrowthLearningRate",
-    build_callback_linear_growth_learning_rate_from_pbuf);
+    build_linear_growth_learning_rate_callback_from_pbuf);
   factory.register_builder("CallbackLTFB",
-                           build_callback_ltfb_from_pbuf);
+                           build_ltfb_callback_from_pbuf);
   factory.register_builder("CallbackMinibatchSchedule",
-                           build_callback_minibatch_schedule_from_pbuf);
+                           build_minibatch_schedule_callback_from_pbuf);
   factory.register_builder("CallbackMixup",
-                           build_callback_mixup_from_pbuf);
+                           build_mixup_callback_from_pbuf);
   factory.register_builder(
     "CallbackOptimizerwiseAdaptiveLearningRate",
-    build_callback_optimizerwise_adaptive_learning_rate_from_pbuf);
+    build_optimizerwise_adaptive_learning_rate_callback_from_pbuf);
   factory.register_builder("CallbackPerturbAdam",
-                           build_callback_perturb_adam_from_pbuf);
+                           build_perturb_adam_callback_from_pbuf);
   factory.register_builder("CallbackPerturbDropout",
-                           build_callback_perturb_dropout_from_pbuf);
+                           build_perturb_dropout_callback_from_pbuf);
   factory.register_builder("CallbackPolyLearningRate",
-                           build_callback_poly_learning_rate_from_pbuf);
+                           build_poly_learning_rate_callback_from_pbuf);
   factory.register_builder("CallbackPrint",
-                           build_callback_print_from_pbuf);
+                           build_print_callback_from_pbuf);
   factory.register_builder("CallbackProfiler",
-                           build_callback_profiler_from_pbuf);
+                           build_profiler_callback_from_pbuf);
   factory.register_builder("CallbackReplaceWeights",
-                           build_callback_replace_weights_from_pbuf);
+                           build_replace_weights_callback_from_pbuf);
   factory.register_builder("CallbackSaveImages",
-                           build_callback_save_images_from_pbuf);
+                           build_save_images_callback_from_pbuf);
   factory.register_builder("CallbackSaveModel",
-                           build_callback_save_model_from_pbuf);
+                           build_save_model_callback_from_pbuf);
   factory.register_builder("CallbackSaveTopKModels",
-                           build_callback_save_topk_models_from_pbuf);
+                           build_save_topk_models_callback_from_pbuf);
   factory.register_builder("CallbackStepLearningRate",
-                           build_callback_step_learning_rate_from_pbuf);
+                           build_step_learning_rate_callback_from_pbuf);
   factory.register_builder("CallbackStepMinibatch",
-                           build_callback_step_minibatch_from_pbuf);
+                           build_step_minibatch_callback_from_pbuf);
   factory.register_builder("CallbackSummary",
-                           build_callback_summary_from_pbuf);
+                           build_summary_callback_from_pbuf);
   factory.register_builder("CallbackSyncLayers",
-                           build_callback_sync_layers_from_pbuf);
+                           build_sync_layers_callback_from_pbuf);
   factory.register_builder("CallbackSyncSelected",
-                           build_callback_sync_selected_from_pbuf);
+                           build_sync_selected_callback_from_pbuf);
   factory.register_builder("CallbackTimeline",
-                           build_callback_timeline_from_pbuf);
+                           build_timeline_callback_from_pbuf);
   factory.register_builder("CallbackTimer",
-                           build_callback_timer_from_pbuf);
+                           build_timer_callback_from_pbuf);
 }
 
 // Manage a global factory
@@ -198,7 +199,7 @@ factory_type const& get_callback_factory() noexcept
 
 } // namespace
 
-std::unique_ptr<lbann_callback>
+std::unique_ptr<callback_base>
 construct_callback(
   const google::protobuf::Message& proto_msg,
   lbann_summary* summarizer) {
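
The block above is one long registration list for a string-keyed builder factory. As a rough mental model (a hypothetical miniature, ignoring generic_factory's real builder signature and key-error policy), registration and construction look like this:

#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>
#include <typeinfo>

struct callback_base { virtual ~callback_base() = default; };
struct print_callback : callback_base {};

// One builder per protobuf message name; the real builders also receive the
// message and an lbann_summary*, elided here for brevity.
using builder_type = std::function<std::unique_ptr<callback_base>()>;

int main() {
  std::map<std::string, builder_type> factory;
  factory["CallbackPrint"] = [] { return std::make_unique<print_callback>(); };

  auto it = factory.find("CallbackPrint");
  if (it == factory.end()) throw std::runtime_error("unknown callback key");
  std::unique_ptr<callback_base> cb = it->second();  // build by string key
  std::cout << "built: " << typeid(*cb).name() << "\n";
}
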
diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp
index ca3d55da361..e4dbb5f6c89 100644
--- a/src/proto/factories/model_factory.cpp
+++ b/src/proto/factories/model_factory.cpp
@@ -269,7 +269,7 @@ model* construct_model(lbann_comm* comm,
   assign_layers_to_metrics(layer_pointers, metric_list, proto_model);
 
   // Construct callbacks
-  std::vector<std::unique_ptr<lbann_callback>> callback_list;
+  std::vector<std::unique_ptr<callback_base>> callback_list;
   auto&& summarizer = construct_summarizer(comm, proto_model);
   for (int i=0; i build_model_from_prototext(
     std::cout << "\n" << ret_model->get_description() << "Callbacks:" << std::endl;
-    for (lbann_callback *cb : ret_model->get_callbacks()) {
+    for (callback_base *cb : ret_model->get_callbacks()) {
       std::cout << cb->name() << std::endl;
     }
   }

From 3b1d60950009cf77fba30870776a03054a018c7d Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson"
Date: Mon, 22 Jul 2019 11:46:33 -0700
Subject: [PATCH 146/634] remove fixme comments

---
 src/callbacks/callback_learning_rate.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/callbacks/callback_learning_rate.cpp b/src/callbacks/callback_learning_rate.cpp
index ab2294fa1f3..b5da28a4168 100644
--- a/src/callbacks/callback_learning_rate.cpp
+++ b/src/callbacks/callback_learning_rate.cpp
@@ -304,7 +304,6 @@ float lbann_callback_optimizerwise_adaptive_learning_rate::optimizer_schedule(
   }
 }
 
-// FIXME TRB
 std::unique_ptr<callback_base>
 build_callback_step_learning_rate_from_pbuf(
   const google::protobuf::Message& proto_msg, lbann_summary*) {
@@ -316,7 +315,6 @@ build_callback_step_learning_rate_from_pbuf(
     parse_list<std::string>(params.weights()));
 }
 
-// FIXME TRB
 std::unique_ptr<callback_base>
 build_callback_adaptive_learning_rate_from_pbuf(
   const google::protobuf::Message& proto_msg, lbann_summary*) {
@@ -328,7 +326,6 @@ build_callback_adaptive_learning_rate_from_pbuf(
     parse_list<std::string>(params.weights()));
 }
 
-// FIXME TRB
 std::unique_ptr<callback_base>
 build_callback_drop_fixed_learning_rate_from_pbuf(
   const google::protobuf::Message& proto_msg, lbann_summary*) {
@@ -344,7 +341,6 @@ build_callback_drop_fixed_learning_rate_from_pbuf(
     parse_list<std::string>(params.weights()));
 }
 
-// FIXME TRB
 std::unique_ptr<callback_base>
 build_callback_linear_growth_learning_rate_from_pbuf(
   const google::protobuf::Message& proto_msg,lbann_summary*) {
@@ -358,7 +354,6 @@ build_callback_linear_growth_learning_rate_from_pbuf(
     parse_list<std::string>(params.weights()));
 }
 
-// FIXME TRB
 std::unique_ptr<callback_base>
 build_callback_optimizerwise_adaptive_learning_rate_from_pbuf(
   const google::protobuf::Message& proto_msg,lbann_summary*) {
@@ -369,7 +364,6 @@ build_callback_optimizerwise_adaptive_learning_rate_from_pbuf(
     parse_list<std::string>(params.weights()));
 }
 
-// FIXME TRB
 std::unique_ptr<callback_base>
 build_callback_poly_learning_rate_from_pbuf(
   const google::protobuf::Message& proto_msg, lbann_summary*) {
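
Each builder above finishes by handing the callback the list of weights it should act on, parsed out of a protobuf string field with parse_list<std::string>. A stand-alone sketch of that splitting step (assuming whitespace-separated names; the real parse_list may differ in details):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical equivalent of parse_list<std::string>: split a protobuf
// string field into one weights name per whitespace-separated token.
std::vector<std::string> parse_name_list(const std::string& field) {
  std::istringstream stream(field);
  std::vector<std::string> names;
  for (std::string token; stream >> token;) {
    names.push_back(token);
  }
  return names;
}

int main() {
  for (const auto& name : parse_name_list("conv1_kernel fc1_matrix")) {
    std::cout << name << "\n";  // prints conv1_kernel, then fc1_matrix
  }
}
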
Benson" Date: Mon, 22 Jul 2019 13:33:25 -0700 Subject: [PATCH 147/634] a few more class renaming changes --- include/lbann/layers/learning/convolution.hpp | 7 ++++++- include/lbann/layers/learning/deconvolution.hpp | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/lbann/layers/learning/convolution.hpp b/include/lbann/layers/learning/convolution.hpp index b1afa3e956d..1b7f0c23e12 100644 --- a/include/lbann/layers/learning/convolution.hpp +++ b/include/lbann/layers/learning/convolution.hpp @@ -32,6 +32,11 @@ namespace lbann { +// Forward declaration. +namespace callback { +class imcomm; +} + /** @brief Standard deep learning convolution. * * Applies convolution (more precisely, cross-correlation) to input @@ -42,7 +47,7 @@ template { private: - friend class lbann_callback_imcomm; + friend class callback::imcomm; public: diff --git a/include/lbann/layers/learning/deconvolution.hpp b/include/lbann/layers/learning/deconvolution.hpp index 56e962fb8d6..19e98369e50 100644 --- a/include/lbann/layers/learning/deconvolution.hpp +++ b/include/lbann/layers/learning/deconvolution.hpp @@ -33,14 +33,16 @@ namespace lbann { // Forward declaration. -class lbann_callback_imcomm; +namespace callback { +class imcomm; +} /** @brief Transpose of the convolution layer. */ template class deconvolution_layer : public base_convolution_layer { private: - friend class lbann_callback_imcomm; + friend class callback::imcomm; public: From b706f1c09bde69c49fa34be09e4c52d86bb4214e Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 22 Jul 2019 13:33:55 -0700 Subject: [PATCH 148/634] rename all the callback files to not have the redundant callback_ prefix. --- include/lbann/callbacks/CMakeLists.txt | 65 ++++++++-------- ...ck_check_dataset.hpp => check_dataset.hpp} | 0 ...heck_gradients.hpp => check_gradients.hpp} | 0 ...callback_check_init.hpp => check_init.hpp} | 0 ...back_check_metric.hpp => check_metric.hpp} | 0 .../{callback_checknan.hpp => check_nan.hpp} | 0 ...allback_checksmall.hpp => check_small.hpp} | 0 ...callback_checkpoint.hpp => checkpoint.hpp} | 0 ...fusion_matrix.hpp => confusion_matrix.hpp} | 0 .../{callback_debug.hpp => debug.hpp} | 0 .../{callback_debug_io.hpp => debug_io.hpp} | 0 ...ror_signals.hpp => dump_error_signals.hpp} | 0 ..._dump_gradients.hpp => dump_gradients.hpp} | 0 ....hpp => dump_minibatch_sample_indices.hpp} | 0 ...back_dump_outputs.hpp => dump_outputs.hpp} | 0 ...back_dump_weights.hpp => dump_weights.hpp} | 0 ..._early_stopping.hpp => early_stopping.hpp} | 0 ..._memory_usage.hpp => gpu_memory_usage.hpp} | 0 .../callbacks/{callback_hang.hpp => hang.hpp} | 0 .../{callback_imcomm.hpp => imcomm.hpp} | 0 .../callbacks/{callback_io.hpp => io.hpp} | 0 ...ck_learning_rate.hpp => learning_rate.hpp} | 0 .../callbacks/{callback_ltfb.hpp => ltfb.hpp} | 0 .../{callback_mixup.hpp => mixup.hpp} | 0 ...back_perturb_adam.hpp => perturb_adam.hpp} | 0 ...erturb_dropout.hpp => perturb_dropout.hpp} | 0 .../{callback_print.hpp => print.hpp} | 0 ...eplace_weights.hpp => replace_weights.hpp} | 0 ...llback_save_images.hpp => save_images.hpp} | 0 ...callback_save_model.hpp => save_model.hpp} | 0 ...e_topk_models.hpp => save_topk_models.hpp} | 2 +- .../{callback_summary.hpp => summary.hpp} | 0 ...llback_sync_layers.hpp => sync_layers.hpp} | 0 ...ck_sync_selected.hpp => sync_selected.hpp} | 2 +- .../{callback_timeline.hpp => timeline.hpp} | 0 .../{callback_timer.hpp => timer.hpp} | 0 ...e_minibatch.hpp => variable_minibatch.hpp} | 0 .../layers/io/input/generic_input_layer.hpp 
| 2 +- include/lbann/lbann.hpp | 74 +++++++++---------- src/callbacks/CMakeLists.txt | 72 +++++++++--------- ...ck_check_dataset.cpp => check_dataset.cpp} | 2 +- ...heck_gradients.cpp => check_gradients.cpp} | 2 +- ...callback_check_init.cpp => check_init.cpp} | 2 +- ...back_check_metric.cpp => check_metric.cpp} | 2 +- .../{callback_checknan.cpp => check_nan.cpp} | 2 +- ...allback_checksmall.cpp => check_small.cpp} | 2 +- ...callback_checkpoint.cpp => checkpoint.cpp} | 2 +- ...fusion_matrix.cpp => confusion_matrix.cpp} | 2 +- .../{callback_debug.cpp => debug.cpp} | 2 +- .../{callback_debug_io.cpp => debug_io.cpp} | 2 +- ...ror_signals.cpp => dump_error_signals.cpp} | 2 +- ..._dump_gradients.cpp => dump_gradients.cpp} | 2 +- ....cpp => dump_minibatch_sample_indices.cpp} | 2 +- ...back_dump_outputs.cpp => dump_outputs.cpp} | 2 +- ...back_dump_weights.cpp => dump_weights.cpp} | 2 +- ..._early_stopping.cpp => early_stopping.cpp} | 2 +- ..._memory_usage.cpp => gpu_memory_usage.cpp} | 2 +- src/callbacks/{callback_hang.cpp => hang.cpp} | 2 +- src/callbacks/helpers.hpp | 53 +++++++++++++ .../{callback_imcomm.cpp => imcomm.cpp} | 2 +- src/callbacks/{callback_io.cpp => io.cpp} | 2 +- ...ck_learning_rate.cpp => learning_rate.cpp} | 2 +- src/callbacks/{callback_ltfb.cpp => ltfb.cpp} | 4 +- .../{callback_mixup.cpp => mixup.cpp} | 2 +- ...back_perturb_adam.cpp => perturb_adam.cpp} | 2 +- ...erturb_dropout.cpp => perturb_dropout.cpp} | 2 +- .../{callback_print.cpp => print.cpp} | 2 +- ...eplace_weights.cpp => replace_weights.cpp} | 2 +- ...llback_save_images.cpp => save_images.cpp} | 2 +- ...callback_save_model.cpp => save_model.cpp} | 4 +- ...e_topk_models.cpp => save_topk_models.cpp} | 2 +- .../{callback_summary.cpp => summary.cpp} | 2 +- ...llback_sync_layers.cpp => sync_layers.cpp} | 2 +- ...ck_sync_selected.cpp => sync_selected.cpp} | 2 +- .../{callback_timeline.cpp => timeline.cpp} | 2 +- .../{callback_timer.cpp => timer.cpp} | 2 +- ...e_minibatch.cpp => variable_minibatch.cpp} | 2 +- src/models/model.cpp | 2 +- src/proto/callbacks.proto | 6 +- src/proto/factories/callback_factory.cpp | 72 +++++++++--------- src/utils/lbann_library.cpp | 2 +- 81 files changed, 244 insertions(+), 184 deletions(-) rename include/lbann/callbacks/{callback_check_dataset.hpp => check_dataset.hpp} (100%) rename include/lbann/callbacks/{callback_check_gradients.hpp => check_gradients.hpp} (100%) rename include/lbann/callbacks/{callback_check_init.hpp => check_init.hpp} (100%) rename include/lbann/callbacks/{callback_check_metric.hpp => check_metric.hpp} (100%) rename include/lbann/callbacks/{callback_checknan.hpp => check_nan.hpp} (100%) rename include/lbann/callbacks/{callback_checksmall.hpp => check_small.hpp} (100%) rename include/lbann/callbacks/{callback_checkpoint.hpp => checkpoint.hpp} (100%) rename include/lbann/callbacks/{callback_confusion_matrix.hpp => confusion_matrix.hpp} (100%) rename include/lbann/callbacks/{callback_debug.hpp => debug.hpp} (100%) rename include/lbann/callbacks/{callback_debug_io.hpp => debug_io.hpp} (100%) rename include/lbann/callbacks/{callback_dump_error_signals.hpp => dump_error_signals.hpp} (100%) rename include/lbann/callbacks/{callback_dump_gradients.hpp => dump_gradients.hpp} (100%) rename include/lbann/callbacks/{callback_dump_minibatch_sample_indices.hpp => dump_minibatch_sample_indices.hpp} (100%) rename include/lbann/callbacks/{callback_dump_outputs.hpp => dump_outputs.hpp} (100%) rename include/lbann/callbacks/{callback_dump_weights.hpp => dump_weights.hpp} (100%) rename 
include/lbann/callbacks/{callback_early_stopping.hpp => early_stopping.hpp} (100%) rename include/lbann/callbacks/{callback_gpu_memory_usage.hpp => gpu_memory_usage.hpp} (100%) rename include/lbann/callbacks/{callback_hang.hpp => hang.hpp} (100%) rename include/lbann/callbacks/{callback_imcomm.hpp => imcomm.hpp} (100%) rename include/lbann/callbacks/{callback_io.hpp => io.hpp} (100%) rename include/lbann/callbacks/{callback_learning_rate.hpp => learning_rate.hpp} (100%) rename include/lbann/callbacks/{callback_ltfb.hpp => ltfb.hpp} (100%) rename include/lbann/callbacks/{callback_mixup.hpp => mixup.hpp} (100%) rename include/lbann/callbacks/{callback_perturb_adam.hpp => perturb_adam.hpp} (100%) rename include/lbann/callbacks/{callback_perturb_dropout.hpp => perturb_dropout.hpp} (100%) rename include/lbann/callbacks/{callback_print.hpp => print.hpp} (100%) rename include/lbann/callbacks/{callback_replace_weights.hpp => replace_weights.hpp} (100%) rename include/lbann/callbacks/{callback_save_images.hpp => save_images.hpp} (100%) rename include/lbann/callbacks/{callback_save_model.hpp => save_model.hpp} (100%) rename include/lbann/callbacks/{callback_save_topk_models.hpp => save_topk_models.hpp} (98%) rename include/lbann/callbacks/{callback_summary.hpp => summary.hpp} (100%) rename include/lbann/callbacks/{callback_sync_layers.hpp => sync_layers.hpp} (100%) rename include/lbann/callbacks/{callback_sync_selected.hpp => sync_selected.hpp} (99%) rename include/lbann/callbacks/{callback_timeline.hpp => timeline.hpp} (100%) rename include/lbann/callbacks/{callback_timer.hpp => timer.hpp} (100%) rename include/lbann/callbacks/{callback_variable_minibatch.hpp => variable_minibatch.hpp} (100%) rename src/callbacks/{callback_check_dataset.cpp => check_dataset.cpp} (99%) rename src/callbacks/{callback_check_gradients.cpp => check_gradients.cpp} (99%) rename src/callbacks/{callback_check_init.cpp => check_init.cpp} (98%) rename src/callbacks/{callback_check_metric.cpp => check_metric.cpp} (98%) rename src/callbacks/{callback_checknan.cpp => check_nan.cpp} (99%) rename src/callbacks/{callback_checksmall.cpp => check_small.cpp} (98%) rename src/callbacks/{callback_checkpoint.cpp => checkpoint.cpp} (99%) rename src/callbacks/{callback_confusion_matrix.cpp => confusion_matrix.cpp} (99%) rename src/callbacks/{callback_debug.cpp => debug.cpp} (99%) rename src/callbacks/{callback_debug_io.cpp => debug_io.cpp} (99%) rename src/callbacks/{callback_dump_error_signals.cpp => dump_error_signals.cpp} (97%) rename src/callbacks/{callback_dump_gradients.cpp => dump_gradients.cpp} (97%) rename src/callbacks/{callback_dump_minibatch_sample_indices.cpp => dump_minibatch_sample_indices.cpp} (97%) rename src/callbacks/{callback_dump_outputs.cpp => dump_outputs.cpp} (99%) rename src/callbacks/{callback_dump_weights.cpp => dump_weights.cpp} (97%) rename src/callbacks/{callback_early_stopping.cpp => early_stopping.cpp} (97%) rename src/callbacks/{callback_gpu_memory_usage.cpp => gpu_memory_usage.cpp} (98%) rename src/callbacks/{callback_hang.cpp => hang.cpp} (97%) create mode 100644 src/callbacks/helpers.hpp rename src/callbacks/{callback_imcomm.cpp => imcomm.cpp} (99%) rename src/callbacks/{callback_io.cpp => io.cpp} (98%) rename src/callbacks/{callback_learning_rate.cpp => learning_rate.cpp} (99%) rename src/callbacks/{callback_ltfb.cpp => ltfb.cpp} (99%) rename src/callbacks/{callback_mixup.cpp => mixup.cpp} (98%) rename src/callbacks/{callback_perturb_adam.cpp => perturb_adam.cpp} (99%) rename 
src/callbacks/{callback_perturb_dropout.cpp => perturb_dropout.cpp} (98%) rename src/callbacks/{callback_print.cpp => print.cpp} (99%) rename src/callbacks/{callback_replace_weights.cpp => replace_weights.cpp} (97%) rename src/callbacks/{callback_save_images.cpp => save_images.cpp} (99%) rename src/callbacks/{callback_save_model.cpp => save_model.cpp} (97%) rename src/callbacks/{callback_save_topk_models.cpp => save_topk_models.cpp} (98%) rename src/callbacks/{callback_summary.cpp => summary.cpp} (99%) rename src/callbacks/{callback_sync_layers.cpp => sync_layers.cpp} (97%) rename src/callbacks/{callback_sync_selected.cpp => sync_selected.cpp} (99%) rename src/callbacks/{callback_timeline.cpp => timeline.cpp} (98%) rename src/callbacks/{callback_timer.cpp => timer.cpp} (99%) rename src/callbacks/{callback_variable_minibatch.cpp => variable_minibatch.cpp} (99%) diff --git a/include/lbann/callbacks/CMakeLists.txt b/include/lbann/callbacks/CMakeLists.txt index d9043c2e48a..44debb7811f 100644 --- a/include/lbann/callbacks/CMakeLists.txt +++ b/include/lbann/callbacks/CMakeLists.txt @@ -1,36 +1,43 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS callback.hpp - callback_check_dataset.hpp - callback_check_gradients.hpp - callback_check_init.hpp - callback_check_metric.hpp - callback_checknan.hpp - callback_checksmall.hpp - callback_confusion_matrix.hpp - callback_debug.hpp - callback_debug_io.hpp - callback_dump_outputs.hpp - callback_dump_error_signals.hpp - callback_dump_gradients.hpp - callback_dump_minibatch_sample_indices.hpp - callback_dump_weights.hpp - callback_early_stopping.hpp - callback_hang.hpp - callback_imcomm.hpp - callback_io.hpp - callback_learning_rate.hpp - callback_ltfb.hpp - callback_mixup.hpp - callback_perturb_adam.hpp - callback_print.hpp - callback_save_images.hpp - callback_save_model.hpp - callback_summary.hpp - callback_timer.hpp - callback_variable_minibatch.hpp + check_dataset.hpp + check_gradients.hpp + check_init.hpp + check_metric.hpp + check_nan.hpp + check_small.hpp + checkpoint.hpp + confusion_matrix.hpp + debug.hpp + debug_io.hpp + dump_error_signals.hpp + dump_gradients.hpp + dump_minibatch_sample_indices.hpp + dump_outputs.hpp + dump_weights.hpp + early_stopping.hpp + gpu_memory_usage.hpp + hang.hpp + imcomm.hpp + io.hpp + learning_rate.hpp + ltfb.hpp + mixup.hpp + perturb_adam.hpp + perturb_dropout.hpp + print.hpp profiler.hpp - callback_gpu_memory_usage.hpp + replace_weights.hpp + save_images.hpp + save_model.hpp + save_topk_models.hpp + summary.hpp + sync_layers.hpp + sync_selected.hpp + timeline.hpp + timer.hpp + variable_minibatch.hpp ) # Propagate the files up the tree diff --git a/include/lbann/callbacks/callback_check_dataset.hpp b/include/lbann/callbacks/check_dataset.hpp similarity index 100% rename from include/lbann/callbacks/callback_check_dataset.hpp rename to include/lbann/callbacks/check_dataset.hpp diff --git a/include/lbann/callbacks/callback_check_gradients.hpp b/include/lbann/callbacks/check_gradients.hpp similarity index 100% rename from include/lbann/callbacks/callback_check_gradients.hpp rename to include/lbann/callbacks/check_gradients.hpp diff --git a/include/lbann/callbacks/callback_check_init.hpp b/include/lbann/callbacks/check_init.hpp similarity index 100% rename from include/lbann/callbacks/callback_check_init.hpp rename to include/lbann/callbacks/check_init.hpp diff --git a/include/lbann/callbacks/callback_check_metric.hpp b/include/lbann/callbacks/check_metric.hpp similarity index 100% rename from 
include/lbann/callbacks/callback_check_metric.hpp rename to include/lbann/callbacks/check_metric.hpp diff --git a/include/lbann/callbacks/callback_checknan.hpp b/include/lbann/callbacks/check_nan.hpp similarity index 100% rename from include/lbann/callbacks/callback_checknan.hpp rename to include/lbann/callbacks/check_nan.hpp diff --git a/include/lbann/callbacks/callback_checksmall.hpp b/include/lbann/callbacks/check_small.hpp similarity index 100% rename from include/lbann/callbacks/callback_checksmall.hpp rename to include/lbann/callbacks/check_small.hpp diff --git a/include/lbann/callbacks/callback_checkpoint.hpp b/include/lbann/callbacks/checkpoint.hpp similarity index 100% rename from include/lbann/callbacks/callback_checkpoint.hpp rename to include/lbann/callbacks/checkpoint.hpp diff --git a/include/lbann/callbacks/callback_confusion_matrix.hpp b/include/lbann/callbacks/confusion_matrix.hpp similarity index 100% rename from include/lbann/callbacks/callback_confusion_matrix.hpp rename to include/lbann/callbacks/confusion_matrix.hpp diff --git a/include/lbann/callbacks/callback_debug.hpp b/include/lbann/callbacks/debug.hpp similarity index 100% rename from include/lbann/callbacks/callback_debug.hpp rename to include/lbann/callbacks/debug.hpp diff --git a/include/lbann/callbacks/callback_debug_io.hpp b/include/lbann/callbacks/debug_io.hpp similarity index 100% rename from include/lbann/callbacks/callback_debug_io.hpp rename to include/lbann/callbacks/debug_io.hpp diff --git a/include/lbann/callbacks/callback_dump_error_signals.hpp b/include/lbann/callbacks/dump_error_signals.hpp similarity index 100% rename from include/lbann/callbacks/callback_dump_error_signals.hpp rename to include/lbann/callbacks/dump_error_signals.hpp diff --git a/include/lbann/callbacks/callback_dump_gradients.hpp b/include/lbann/callbacks/dump_gradients.hpp similarity index 100% rename from include/lbann/callbacks/callback_dump_gradients.hpp rename to include/lbann/callbacks/dump_gradients.hpp diff --git a/include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp b/include/lbann/callbacks/dump_minibatch_sample_indices.hpp similarity index 100% rename from include/lbann/callbacks/callback_dump_minibatch_sample_indices.hpp rename to include/lbann/callbacks/dump_minibatch_sample_indices.hpp diff --git a/include/lbann/callbacks/callback_dump_outputs.hpp b/include/lbann/callbacks/dump_outputs.hpp similarity index 100% rename from include/lbann/callbacks/callback_dump_outputs.hpp rename to include/lbann/callbacks/dump_outputs.hpp diff --git a/include/lbann/callbacks/callback_dump_weights.hpp b/include/lbann/callbacks/dump_weights.hpp similarity index 100% rename from include/lbann/callbacks/callback_dump_weights.hpp rename to include/lbann/callbacks/dump_weights.hpp diff --git a/include/lbann/callbacks/callback_early_stopping.hpp b/include/lbann/callbacks/early_stopping.hpp similarity index 100% rename from include/lbann/callbacks/callback_early_stopping.hpp rename to include/lbann/callbacks/early_stopping.hpp diff --git a/include/lbann/callbacks/callback_gpu_memory_usage.hpp b/include/lbann/callbacks/gpu_memory_usage.hpp similarity index 100% rename from include/lbann/callbacks/callback_gpu_memory_usage.hpp rename to include/lbann/callbacks/gpu_memory_usage.hpp diff --git a/include/lbann/callbacks/callback_hang.hpp b/include/lbann/callbacks/hang.hpp similarity index 100% rename from include/lbann/callbacks/callback_hang.hpp rename to include/lbann/callbacks/hang.hpp diff --git 
a/include/lbann/callbacks/callback_imcomm.hpp b/include/lbann/callbacks/imcomm.hpp similarity index 100% rename from include/lbann/callbacks/callback_imcomm.hpp rename to include/lbann/callbacks/imcomm.hpp diff --git a/include/lbann/callbacks/callback_io.hpp b/include/lbann/callbacks/io.hpp similarity index 100% rename from include/lbann/callbacks/callback_io.hpp rename to include/lbann/callbacks/io.hpp diff --git a/include/lbann/callbacks/callback_learning_rate.hpp b/include/lbann/callbacks/learning_rate.hpp similarity index 100% rename from include/lbann/callbacks/callback_learning_rate.hpp rename to include/lbann/callbacks/learning_rate.hpp diff --git a/include/lbann/callbacks/callback_ltfb.hpp b/include/lbann/callbacks/ltfb.hpp similarity index 100% rename from include/lbann/callbacks/callback_ltfb.hpp rename to include/lbann/callbacks/ltfb.hpp diff --git a/include/lbann/callbacks/callback_mixup.hpp b/include/lbann/callbacks/mixup.hpp similarity index 100% rename from include/lbann/callbacks/callback_mixup.hpp rename to include/lbann/callbacks/mixup.hpp diff --git a/include/lbann/callbacks/callback_perturb_adam.hpp b/include/lbann/callbacks/perturb_adam.hpp similarity index 100% rename from include/lbann/callbacks/callback_perturb_adam.hpp rename to include/lbann/callbacks/perturb_adam.hpp diff --git a/include/lbann/callbacks/callback_perturb_dropout.hpp b/include/lbann/callbacks/perturb_dropout.hpp similarity index 100% rename from include/lbann/callbacks/callback_perturb_dropout.hpp rename to include/lbann/callbacks/perturb_dropout.hpp diff --git a/include/lbann/callbacks/callback_print.hpp b/include/lbann/callbacks/print.hpp similarity index 100% rename from include/lbann/callbacks/callback_print.hpp rename to include/lbann/callbacks/print.hpp diff --git a/include/lbann/callbacks/callback_replace_weights.hpp b/include/lbann/callbacks/replace_weights.hpp similarity index 100% rename from include/lbann/callbacks/callback_replace_weights.hpp rename to include/lbann/callbacks/replace_weights.hpp diff --git a/include/lbann/callbacks/callback_save_images.hpp b/include/lbann/callbacks/save_images.hpp similarity index 100% rename from include/lbann/callbacks/callback_save_images.hpp rename to include/lbann/callbacks/save_images.hpp diff --git a/include/lbann/callbacks/callback_save_model.hpp b/include/lbann/callbacks/save_model.hpp similarity index 100% rename from include/lbann/callbacks/callback_save_model.hpp rename to include/lbann/callbacks/save_model.hpp diff --git a/include/lbann/callbacks/callback_save_topk_models.hpp b/include/lbann/callbacks/save_topk_models.hpp similarity index 98% rename from include/lbann/callbacks/callback_save_topk_models.hpp rename to include/lbann/callbacks/save_topk_models.hpp index a21801ffcea..42ef8baa42a 100644 --- a/include/lbann/callbacks/callback_save_topk_models.hpp +++ b/include/lbann/callbacks/save_topk_models.hpp @@ -29,7 +29,7 @@ #ifndef LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED #define LBANN_CALLBACKS_CALLBACK_SAVE_TOPK_MODELS_HPP_INCLUDED -#include "lbann/callbacks/callback_save_model.hpp" +#include "lbann/callbacks/save_model.hpp" namespace lbann { namespace callback { diff --git a/include/lbann/callbacks/callback_summary.hpp b/include/lbann/callbacks/summary.hpp similarity index 100% rename from include/lbann/callbacks/callback_summary.hpp rename to include/lbann/callbacks/summary.hpp diff --git a/include/lbann/callbacks/callback_sync_layers.hpp b/include/lbann/callbacks/sync_layers.hpp similarity index 100% rename from 
include/lbann/callbacks/callback_sync_layers.hpp rename to include/lbann/callbacks/sync_layers.hpp diff --git a/include/lbann/callbacks/callback_sync_selected.hpp b/include/lbann/callbacks/sync_selected.hpp similarity index 99% rename from include/lbann/callbacks/callback_sync_selected.hpp rename to include/lbann/callbacks/sync_selected.hpp index efffc3d35ca..b9c101f294d 100644 --- a/include/lbann/callbacks/callback_sync_selected.hpp +++ b/include/lbann/callbacks/sync_selected.hpp @@ -29,7 +29,7 @@ #ifndef LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED #define LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED -#include "lbann/callbacks/callback_sync_layers.hpp" +#include "lbann/callbacks/sync_layers.hpp" #include #include diff --git a/include/lbann/callbacks/callback_timeline.hpp b/include/lbann/callbacks/timeline.hpp similarity index 100% rename from include/lbann/callbacks/callback_timeline.hpp rename to include/lbann/callbacks/timeline.hpp diff --git a/include/lbann/callbacks/callback_timer.hpp b/include/lbann/callbacks/timer.hpp similarity index 100% rename from include/lbann/callbacks/callback_timer.hpp rename to include/lbann/callbacks/timer.hpp diff --git a/include/lbann/callbacks/callback_variable_minibatch.hpp b/include/lbann/callbacks/variable_minibatch.hpp similarity index 100% rename from include/lbann/callbacks/callback_variable_minibatch.hpp rename to include/lbann/callbacks/variable_minibatch.hpp diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index da8c53c0243..7c001602f27 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -32,7 +32,7 @@ #include "lbann/io/data_buffers/generic_io_buffer.hpp" #include "lbann/io/data_buffers/partitioned_io_buffer.hpp" #include "lbann/models/model.hpp" -#include "lbann/callbacks/callback_imcomm.hpp" +#include "lbann/callbacks/imcomm.hpp" #include "lbann/utils/omp_diagnostics.hpp" #include diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 870580b7690..6b7bf345e4b 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -130,44 +130,44 @@ #include "lbann/data_store/data_store_conduit.hpp" /// Callbacks -#include "lbann/callbacks/callback_check_init.hpp" -#include "lbann/callbacks/callback_checknan.hpp" -#include "lbann/callbacks/callback_checksmall.hpp" -#include "lbann/callbacks/callback_check_dataset.hpp" -#include "lbann/callbacks/callback_print.hpp" -#include "lbann/callbacks/callback_timer.hpp" -#include "lbann/callbacks/callback_io.hpp" -#include "lbann/callbacks/callback_summary.hpp" -#include "lbann/callbacks/callback_learning_rate.hpp" -#include "lbann/callbacks/callback_debug.hpp" -#include "lbann/callbacks/callback_debug_io.hpp" -#include "lbann/callbacks/callback_imcomm.hpp" -#include "lbann/callbacks/callback_dump_weights.hpp" -#include "lbann/callbacks/callback_dump_outputs.hpp" -#include "lbann/callbacks/callback_dump_error_signals.hpp" -#include "lbann/callbacks/callback_dump_gradients.hpp" -#include "lbann/callbacks/callback_dump_minibatch_sample_indices.hpp" -#include "lbann/callbacks/callback_early_stopping.hpp" -#include "lbann/callbacks/callback_ltfb.hpp" -#include "lbann/callbacks/callback_mixup.hpp" -#include "lbann/callbacks/callback_save_images.hpp" -#include "lbann/callbacks/callback_save_model.hpp" -#include "lbann/callbacks/callback_save_topk_models.hpp" +#include "lbann/callbacks/check_init.hpp" +#include 
"lbann/callbacks/check_nan.hpp" +#include "lbann/callbacks/check_small.hpp" +#include "lbann/callbacks/check_dataset.hpp" +#include "lbann/callbacks/print.hpp" +#include "lbann/callbacks/timer.hpp" +#include "lbann/callbacks/io.hpp" +#include "lbann/callbacks/summary.hpp" +#include "lbann/callbacks/learning_rate.hpp" +#include "lbann/callbacks/debug.hpp" +#include "lbann/callbacks/debug_io.hpp" +#include "lbann/callbacks/imcomm.hpp" +#include "lbann/callbacks/dump_weights.hpp" +#include "lbann/callbacks/dump_outputs.hpp" +#include "lbann/callbacks/dump_error_signals.hpp" +#include "lbann/callbacks/dump_gradients.hpp" +#include "lbann/callbacks/dump_minibatch_sample_indices.hpp" +#include "lbann/callbacks/early_stopping.hpp" +#include "lbann/callbacks/ltfb.hpp" +#include "lbann/callbacks/mixup.hpp" +#include "lbann/callbacks/save_images.hpp" +#include "lbann/callbacks/save_model.hpp" +#include "lbann/callbacks/save_topk_models.hpp" #include "lbann/callbacks/profiler.hpp" -#include "lbann/callbacks/callback_hang.hpp" -#include "lbann/callbacks/callback_variable_minibatch.hpp" -#include "lbann/callbacks/callback_timeline.hpp" -#include "lbann/callbacks/callback_checkpoint.hpp" -#include "lbann/callbacks/callback_save_model.hpp" -#include "lbann/callbacks/callback_replace_weights.hpp" -#include "lbann/callbacks/callback_gpu_memory_usage.hpp" -#include "lbann/callbacks/callback_sync_layers.hpp" -#include "lbann/callbacks/callback_sync_selected.hpp" -#include "lbann/callbacks/callback_confusion_matrix.hpp" -#include "lbann/callbacks/callback_check_gradients.hpp" -#include "lbann/callbacks/callback_check_metric.hpp" -#include "lbann/callbacks/callback_perturb_adam.hpp" -#include "lbann/callbacks/callback_perturb_dropout.hpp" +#include "lbann/callbacks/hang.hpp" +#include "lbann/callbacks/variable_minibatch.hpp" +#include "lbann/callbacks/timeline.hpp" +#include "lbann/callbacks/checkpoint.hpp" +#include "lbann/callbacks/save_model.hpp" +#include "lbann/callbacks/replace_weights.hpp" +#include "lbann/callbacks/gpu_memory_usage.hpp" +#include "lbann/callbacks/sync_layers.hpp" +#include "lbann/callbacks/sync_selected.hpp" +#include "lbann/callbacks/confusion_matrix.hpp" +#include "lbann/callbacks/check_gradients.hpp" +#include "lbann/callbacks/check_metric.hpp" +#include "lbann/callbacks/perturb_adam.hpp" +#include "lbann/callbacks/perturb_dropout.hpp" /// Weights and weight initializers #include "lbann/weights/weights.hpp" diff --git a/src/callbacks/CMakeLists.txt b/src/callbacks/CMakeLists.txt index a10b751ed8c..2d71c561bfb 100644 --- a/src/callbacks/CMakeLists.txt +++ b/src/callbacks/CMakeLists.txt @@ -1,42 +1,42 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES - callback_check_dataset.cpp - callback_check_gradients.cpp - callback_check_init.cpp - callback_check_metric.cpp - callback_checknan.cpp - callback_checkpoint.cpp - callback_checksmall.cpp - callback_confusion_matrix.cpp - callback_debug.cpp - callback_debug_io.cpp - callback_dump_outputs.cpp - callback_dump_error_signals.cpp - callback_dump_gradients.cpp - callback_dump_minibatch_sample_indices.cpp - callback_dump_weights.cpp - callback_early_stopping.cpp - callback_hang.cpp - callback_imcomm.cpp - callback_io.cpp - callback_learning_rate.cpp - callback_ltfb.cpp - callback_mixup.cpp - callback_perturb_adam.cpp - callback_print.cpp - callback_save_images.cpp - callback_save_model.cpp - callback_summary.cpp - callback_sync_layers.cpp - callback_sync_selected.cpp - callback_timeline.cpp - callback_timer.cpp - 
callback_variable_minibatch.cpp + check_dataset.cpp + check_gradients.cpp + check_init.cpp + check_metric.cpp + check_nan.cpp + check_small.cpp + checkpoint.cpp + confusion_matrix.cpp + debug.cpp + debug_io.cpp + dump_error_signals.cpp + dump_gradients.cpp + dump_minibatch_sample_indices.cpp + dump_outputs.cpp + dump_weights.cpp + early_stopping.cpp + gpu_memory_usage.cpp + hang.cpp + imcomm.cpp + io.cpp + learning_rate.cpp + ltfb.cpp + mixup.cpp + perturb_adam.cpp + perturb_dropout.cpp + print.cpp profiler.cpp - callback_replace_weights.cpp - callback_gpu_memory_usage.cpp - callback_perturb_dropout.cpp - callback_save_topk_models.cpp + replace_weights.cpp + save_images.cpp + save_model.cpp + save_topk_models.cpp + summary.cpp + sync_layers.cpp + sync_selected.cpp + timeline.cpp + timer.cpp + variable_minibatch.cpp ) # Propagate the files up the tree diff --git a/src/callbacks/callback_check_dataset.cpp b/src/callbacks/check_dataset.cpp similarity index 99% rename from src/callbacks/callback_check_dataset.cpp rename to src/callbacks/check_dataset.cpp index 51c0254a30b..69a41b5df50 100644 --- a/src/callbacks/callback_check_dataset.cpp +++ b/src/callbacks/check_dataset.cpp @@ -25,7 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include -#include "lbann/callbacks/callback_check_dataset.hpp" +#include "lbann/callbacks/check_dataset.hpp" #include "lbann/layers/io/io_layer.hpp" #include "lbann/layers/io/input/input_layer.hpp" #include diff --git a/src/callbacks/callback_check_gradients.cpp b/src/callbacks/check_gradients.cpp similarity index 99% rename from src/callbacks/callback_check_gradients.cpp rename to src/callbacks/check_gradients.cpp index 6ae9000804a..be640229327 100644 --- a/src/callbacks/callback_check_gradients.cpp +++ b/src/callbacks/check_gradients.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_check_gradients.hpp" +#include "lbann/callbacks/check_gradients.hpp" #include "lbann/layers/io/input/generic_input_layer.hpp" #include "lbann/data_readers/data_reader.hpp" diff --git a/src/callbacks/callback_check_init.cpp b/src/callbacks/check_init.cpp similarity index 98% rename from src/callbacks/callback_check_init.cpp rename to src/callbacks/check_init.cpp index 73c4784c88c..c38bc95c6f2 100644 --- a/src/callbacks/callback_check_init.cpp +++ b/src/callbacks/check_init.cpp @@ -26,7 +26,7 @@ // check_init .hpp .cpp - Check multi-model init //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_check_init.hpp" +#include "lbann/callbacks/check_init.hpp" #include "lbann/utils/exception.hpp" namespace lbann { diff --git a/src/callbacks/callback_check_metric.cpp b/src/callbacks/check_metric.cpp similarity index 98% rename from src/callbacks/callback_check_metric.cpp rename to src/callbacks/check_metric.cpp index c9724899378..abb9455327c 100644 --- a/src/callbacks/callback_check_metric.cpp +++ b/src/callbacks/check_metric.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_check_metric.hpp" +#include "lbann/callbacks/check_metric.hpp" #include "lbann/proto/factories.hpp" diff --git a/src/callbacks/callback_checknan.cpp b/src/callbacks/check_nan.cpp similarity index 99% rename from src/callbacks/callback_checknan.cpp rename to src/callbacks/check_nan.cpp index b52f8db0a96..e970c23e790 100644 --- a/src/callbacks/callback_checknan.cpp +++ b/src/callbacks/check_nan.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_checknan.hpp" +#include "lbann/callbacks/check_nan.hpp" #include "lbann/utils/exception.hpp" namespace lbann { diff --git a/src/callbacks/callback_checksmall.cpp b/src/callbacks/check_small.cpp similarity index 98% rename from src/callbacks/callback_checksmall.cpp rename to src/callbacks/check_small.cpp index 2daf2d00aa8..a91227bf143 100644 --- a/src/callbacks/callback_checksmall.cpp +++ b/src/callbacks/check_small.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_checksmall.hpp" +#include "lbann/callbacks/check_small.hpp" #include "lbann/utils/exception.hpp" namespace lbann { diff --git a/src/callbacks/callback_checkpoint.cpp b/src/callbacks/checkpoint.cpp similarity index 99% rename from src/callbacks/callback_checkpoint.cpp rename to src/callbacks/checkpoint.cpp index 385f7337e95..fe3235a3f9d 100644 --- a/src/callbacks/callback_checkpoint.cpp +++ b/src/callbacks/checkpoint.cpp @@ -27,7 +27,7 @@ //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_checkpoint.hpp" +#include "lbann/callbacks/checkpoint.hpp" namespace lbann { namespace callback { diff --git a/src/callbacks/callback_confusion_matrix.cpp b/src/callbacks/confusion_matrix.cpp similarity index 99% rename from src/callbacks/callback_confusion_matrix.cpp rename to src/callbacks/confusion_matrix.cpp index 9b4eddd0067..7f8d24dc86e 100644 --- a/src/callbacks/callback_confusion_matrix.cpp +++ b/src/callbacks/confusion_matrix.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. /////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_confusion_matrix.hpp" +#include "lbann/callbacks/confusion_matrix.hpp" namespace lbann { namespace callback { diff --git a/src/callbacks/callback_debug.cpp b/src/callbacks/debug.cpp similarity index 99% rename from src/callbacks/callback_debug.cpp rename to src/callbacks/debug.cpp index bee2fa454a8..af06f33b069 100644 --- a/src/callbacks/callback_debug.cpp +++ b/src/callbacks/debug.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. 
/////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_debug.hpp" +#include "lbann/callbacks/debug.hpp" #include "lbann/comm.hpp" #include "lbann/proto/factories.hpp" #include "lbann/utils/memory.hpp" diff --git a/src/callbacks/callback_debug_io.cpp b/src/callbacks/debug_io.cpp similarity index 99% rename from src/callbacks/callback_debug_io.cpp rename to src/callbacks/debug_io.cpp index 400460bb5a0..2171586c86a 100644 --- a/src/callbacks/callback_debug_io.cpp +++ b/src/callbacks/debug_io.cpp @@ -26,7 +26,7 @@ // debug .hpp .cpp - Callback hooks to debug LBANN /////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_debug_io.hpp" +#include "lbann/callbacks/debug_io.hpp" namespace lbann { namespace callback { diff --git a/src/callbacks/callback_dump_error_signals.cpp b/src/callbacks/dump_error_signals.cpp similarity index 97% rename from src/callbacks/callback_dump_error_signals.cpp rename to src/callbacks/dump_error_signals.cpp index f65836bd707..ab2e782df41 100644 --- a/src/callbacks/callback_dump_error_signals.cpp +++ b/src/callbacks/dump_error_signals.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_dump_error_signals.hpp" +#include "lbann/callbacks/dump_error_signals.hpp" #include diff --git a/src/callbacks/callback_dump_gradients.cpp b/src/callbacks/dump_gradients.cpp similarity index 97% rename from src/callbacks/callback_dump_gradients.cpp rename to src/callbacks/dump_gradients.cpp index e78a3358be2..27d459aa800 100644 --- a/src/callbacks/callback_dump_gradients.cpp +++ b/src/callbacks/dump_gradients.cpp @@ -26,7 +26,7 @@ // dump_gradients .hpp .cpp - Callbacks to dump gradients //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_dump_gradients.hpp" +#include "lbann/callbacks/dump_gradients.hpp" #include diff --git a/src/callbacks/callback_dump_minibatch_sample_indices.cpp b/src/callbacks/dump_minibatch_sample_indices.cpp similarity index 97% rename from src/callbacks/callback_dump_minibatch_sample_indices.cpp rename to src/callbacks/dump_minibatch_sample_indices.cpp index 03c95b43162..f2b3e037512 100644 --- a/src/callbacks/callback_dump_minibatch_sample_indices.cpp +++ b/src/callbacks/dump_minibatch_sample_indices.cpp @@ -28,7 +28,7 @@ //////////////////////////////////////////////////////////////////////////////// #include -#include "lbann/callbacks/callback_dump_minibatch_sample_indices.hpp" +#include "lbann/callbacks/dump_minibatch_sample_indices.hpp" #include "lbann/layers/io/input/input_layer.hpp" #include diff --git a/src/callbacks/callback_dump_outputs.cpp b/src/callbacks/dump_outputs.cpp similarity index 99% rename from src/callbacks/callback_dump_outputs.cpp rename to src/callbacks/dump_outputs.cpp index 1cba1b67a3e..268d9681e6c 100644 --- a/src/callbacks/callback_dump_outputs.cpp +++ b/src/callbacks/dump_outputs.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_dump_outputs.hpp" +#include "lbann/callbacks/dump_outputs.hpp" #include "lbann/proto/proto_common.hpp" #include "lbann/utils/file_utils.hpp" diff --git a/src/callbacks/callback_dump_weights.cpp b/src/callbacks/dump_weights.cpp similarity index 97% rename from src/callbacks/callback_dump_weights.cpp rename to src/callbacks/dump_weights.cpp index 8f8ff886cd2..e4fba8fff00 100644 --- a/src/callbacks/callback_dump_weights.cpp +++ b/src/callbacks/dump_weights.cpp @@ -27,7 +27,7 @@ //////////////////////////////////////////////////////////////////////////////// #include -#include "lbann/callbacks/callback_dump_weights.hpp" +#include "lbann/callbacks/dump_weights.hpp" namespace lbann { namespace callback { diff --git a/src/callbacks/callback_early_stopping.cpp b/src/callbacks/early_stopping.cpp similarity index 97% rename from src/callbacks/callback_early_stopping.cpp rename to src/callbacks/early_stopping.cpp index 0424f279623..051419ebdf7 100644 --- a/src/callbacks/callback_early_stopping.cpp +++ b/src/callbacks/early_stopping.cpp @@ -26,7 +26,7 @@ // lbann_early_stopping .hpp .cpp - Callback hooks for early stopping //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_early_stopping.hpp" +#include "lbann/callbacks/early_stopping.hpp" namespace lbann { namespace callback { diff --git a/src/callbacks/callback_gpu_memory_usage.cpp b/src/callbacks/gpu_memory_usage.cpp similarity index 98% rename from src/callbacks/callback_gpu_memory_usage.cpp rename to src/callbacks/gpu_memory_usage.cpp index 3a394e563a0..4100f2f677d 100644 --- a/src/callbacks/callback_gpu_memory_usage.cpp +++ b/src/callbacks/gpu_memory_usage.cpp @@ -25,7 +25,7 @@ // //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_gpu_memory_usage.hpp" +#include "lbann/callbacks/gpu_memory_usage.hpp" #include #include diff --git a/src/callbacks/callback_hang.cpp b/src/callbacks/hang.cpp similarity index 97% rename from src/callbacks/callback_hang.cpp rename to src/callbacks/hang.cpp index 254441d6c96..0445e815ad0 100644 --- a/src/callbacks/callback_hang.cpp +++ b/src/callbacks/hang.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_hang.hpp" +#include "lbann/callbacks/hang.hpp" #include diff --git a/src/callbacks/helpers.hpp b/src/callbacks/helpers.hpp new file mode 100644 index 00000000000..e612ef21850 --- /dev/null +++ b/src/callbacks/helpers.hpp @@ -0,0 +1,53 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/utils/exception.hpp"
+
+#include <algorithm>
+#include <vector>
+
+namespace lbann {
+namespace {
+template <typename T>
+std::vector<T*> select_things_by_name(
+  std::vector<T*> const& things,
+  std::vector<std::string> const& thing_names) {
+
+  std::vector<T*> out_things;
+  for (auto const& name : thing_names) {
+    auto it = std::find_if(
+      things.begin(), things.end(),
+      [&name](const T* t) { return t->get_name() == name; });
+    if (it != things.end())
+      out_things.push_back(*it);
+    else
+      LBANN_ERROR(std::string("Requested thing \"") + name +
+                  "\" does not exist in the list of things.");
+  }
+  return out_things;
+}
+} // namespace
+} // namespace lbann
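
The helper added above is generic over any pointer type exposing a get_name() accessor. A hypothetical, self-contained usage (the thing type, the names, and the plain std::runtime_error standing in for LBANN_ERROR are all invented for illustration):

#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

struct thing {
  std::string name;
  const std::string& get_name() const { return name; }
};

// Same shape as the helper in the patch: look up each requested name in the
// list, collecting matches and failing loudly on a miss.
template <typename T>
std::vector<T*> select_things_by_name(std::vector<T*> const& things,
                                      std::vector<std::string> const& names) {
  std::vector<T*> out;
  for (auto const& n : names) {
    auto it = std::find_if(things.begin(), things.end(),
                           [&n](const T* t) { return t->get_name() == n; });
    if (it == things.end()) throw std::runtime_error("no thing named " + n);
    out.push_back(*it);
  }
  return out;
}

int main() {
  thing a{"conv1"}, b{"fc1"};
  std::vector<thing*> all{&a, &b};
  for (auto* t : select_things_by_name(all, {"fc1"})) {
    std::cout << t->get_name() << "\n";  // prints "fc1"
  }
}
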
diff --git a/src/callbacks/callback_imcomm.cpp b/src/callbacks/imcomm.cpp similarity index 99% rename from src/callbacks/callback_imcomm.cpp rename to src/callbacks/imcomm.cpp index 70c519bacc3..282c9284017 100644 --- a/src/callbacks/callback_imcomm.cpp +++ b/src/callbacks/imcomm.cpp @@ -28,7 +28,7 @@ #include #include -#include "lbann/callbacks/callback_imcomm.hpp" +#include "lbann/callbacks/imcomm.hpp" #include "lbann/utils/timer.hpp" #include "lbann/utils/exception.hpp"
diff --git a/src/callbacks/callback_io.cpp b/src/callbacks/io.cpp similarity index 98% rename from src/callbacks/callback_io.cpp rename to src/callbacks/io.cpp index 57b6898e745..2b7b79a0587 100644 --- a/src/callbacks/callback_io.cpp +++ b/src/callbacks/io.cpp @@ -28,7 +28,7 @@ #include -#include "lbann/callbacks/callback_io.hpp" +#include "lbann/callbacks/io.hpp" #include "lbann/layers/io/input/generic_input_layer.hpp" #include "lbann/proto/proto_common.hpp"
diff --git a/src/callbacks/callback_learning_rate.cpp b/src/callbacks/learning_rate.cpp similarity index 99% rename from src/callbacks/callback_learning_rate.cpp rename to src/callbacks/learning_rate.cpp index fca64ca1401..0c49ca8d6d8 100644 --- a/src/callbacks/callback_learning_rate.cpp +++ b/src/callbacks/learning_rate.cpp @@ -26,7 +26,7 @@ // lbann_learning_rate .hpp .cpp - Callback hooks for learning rate schedules //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_learning_rate.hpp" +#include "lbann/callbacks/learning_rate.hpp" #include "lbann/proto/proto_common.hpp" #include "callback_helpers.hpp"
diff --git a/src/callbacks/callback_ltfb.cpp b/src/callbacks/ltfb.cpp similarity index 99% rename from src/callbacks/callback_ltfb.cpp rename to src/callbacks/ltfb.cpp index 568a5880cd7..843f8c46f88 100644 --- a/src/callbacks/callback_ltfb.cpp +++ b/src/callbacks/ltfb.cpp @@ -25,8 +25,8 @@ //////////////////////////////////////////////////////////////////////////////// #include -#include "lbann/callbacks/callback_ltfb.hpp" -#include "lbann/callbacks/callback_imcomm.hpp" +#include "lbann/callbacks/ltfb.hpp" +#include "lbann/callbacks/imcomm.hpp" #include "lbann/utils/random.hpp" #include "lbann/optimizers/sgd.hpp" #include "lbann/optimizers/adam.hpp"
diff --git a/src/callbacks/callback_mixup.cpp b/src/callbacks/mixup.cpp similarity index 98% rename from src/callbacks/callback_mixup.cpp rename to src/callbacks/mixup.cpp index 2a65e721e7b..691be1c4355 100644 --- a/src/callbacks/callback_mixup.cpp +++ b/src/callbacks/mixup.cpp @@ -25,7 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include -#include "lbann/callbacks/callback_mixup.hpp" +#include "lbann/callbacks/mixup.hpp" #include "lbann/proto/factories.hpp" #include "lbann/utils/beta.hpp" #include "lbann/utils/exception.hpp"
diff --git a/src/callbacks/callback_perturb_adam.cpp b/src/callbacks/perturb_adam.cpp similarity index 99% rename from src/callbacks/callback_perturb_adam.cpp rename to src/callbacks/perturb_adam.cpp index 5af3e9b69bc..36ea3fc06e8 100644 --- a/src/callbacks/callback_perturb_adam.cpp +++ b/src/callbacks/perturb_adam.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_perturb_adam.hpp" +#include "lbann/callbacks/perturb_adam.hpp" #include "lbann/proto/factories.hpp" #include "lbann/utils/random.hpp"
diff --git a/src/callbacks/callback_perturb_dropout.cpp b/src/callbacks/perturb_dropout.cpp similarity index 98% rename from src/callbacks/callback_perturb_dropout.cpp rename to src/callbacks/perturb_dropout.cpp index 5f3489391d1..13267bc0d17 100644 --- a/src/callbacks/callback_perturb_dropout.cpp +++ b/src/callbacks/perturb_dropout.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_perturb_dropout.hpp" +#include "lbann/callbacks/perturb_dropout.hpp" #include "lbann/proto/factories.hpp" #include "lbann/utils/random.hpp"
diff --git a/src/callbacks/callback_print.cpp b/src/callbacks/print.cpp similarity index 99% rename from src/callbacks/callback_print.cpp rename to src/callbacks/print.cpp index 0f089db6b59..0e2fee28a96 100644 --- a/src/callbacks/callback_print.cpp +++ b/src/callbacks/print.cpp @@ -27,7 +27,7 @@ //////////////////////////////////////////////////////////////////////////////// #include -#include "lbann/callbacks/callback_print.hpp" +#include "lbann/callbacks/print.hpp" #include "lbann/layers/io/input/input_layer.hpp" #include
diff --git a/src/callbacks/callback_replace_weights.cpp b/src/callbacks/replace_weights.cpp similarity index 97% rename from src/callbacks/callback_replace_weights.cpp rename to src/callbacks/replace_weights.cpp index cfe79ed5862..4ce852d7ee4 100644 --- a/src/callbacks/callback_replace_weights.cpp +++ b/src/callbacks/replace_weights.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_replace_weights.hpp" +#include "lbann/callbacks/replace_weights.hpp" #include "lbann/proto/proto_common.hpp" #include "callback_helpers.hpp"
diff --git a/src/callbacks/callback_save_images.cpp b/src/callbacks/save_images.cpp similarity index 99% rename from src/callbacks/callback_save_images.cpp rename to src/callbacks/save_images.cpp index 0aad9ec7889..c601f099e03 100644 --- a/src/callbacks/callback_save_images.cpp +++ b/src/callbacks/save_images.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license.
//////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_save_images.hpp" +#include "lbann/callbacks/save_images.hpp" #include "lbann/proto/factories.hpp" #include diff --git a/src/callbacks/callback_save_model.cpp b/src/callbacks/save_model.cpp similarity index 97% rename from src/callbacks/callback_save_model.cpp rename to src/callbacks/save_model.cpp index 045ae33da78..94a939f6df4 100644 --- a/src/callbacks/callback_save_model.cpp +++ b/src/callbacks/save_model.cpp @@ -27,8 +27,8 @@ //////////////////////////////////////////////////////////////////////////////// #include -#include "lbann/callbacks/callback_save_model.hpp" -#include "lbann/callbacks/callback_checkpoint.hpp" // Reuse the checkpoint naming scheme +#include "lbann/callbacks/save_model.hpp" +#include "lbann/callbacks/checkpoint.hpp" // Reuse the checkpoint naming scheme #include #include #include diff --git a/src/callbacks/callback_save_topk_models.cpp b/src/callbacks/save_topk_models.cpp similarity index 98% rename from src/callbacks/callback_save_topk_models.cpp rename to src/callbacks/save_topk_models.cpp index ba76cad1c27..f43b161e27e 100644 --- a/src/callbacks/callback_save_topk_models.cpp +++ b/src/callbacks/save_topk_models.cpp @@ -27,7 +27,7 @@ //////////////////////////////////////////////////////////////////////////////// #include -#include "lbann/callbacks/callback_save_topk_models.hpp" +#include "lbann/callbacks/save_topk_models.hpp" namespace lbann { namespace callback { diff --git a/src/callbacks/callback_summary.cpp b/src/callbacks/summary.cpp similarity index 99% rename from src/callbacks/callback_summary.cpp rename to src/callbacks/summary.cpp index 4552b0180c4..1dfcef5097f 100644 --- a/src/callbacks/callback_summary.cpp +++ b/src/callbacks/summary.cpp @@ -26,7 +26,7 @@ // summary .hpp .cpp - Callback hooks to summarize to Tensorboard //////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_summary.hpp" +#include "lbann/callbacks/summary.hpp" #include "lbann/utils/profiling.hpp" namespace lbann { diff --git a/src/callbacks/callback_sync_layers.cpp b/src/callbacks/sync_layers.cpp similarity index 97% rename from src/callbacks/callback_sync_layers.cpp rename to src/callbacks/sync_layers.cpp index 230afd6d790..72e30a93dd7 100644 --- a/src/callbacks/callback_sync_layers.cpp +++ b/src/callbacks/sync_layers.cpp @@ -26,7 +26,7 @@ // callback_sync_layers.cpp - Callback to synchronize layers /////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_sync_layers.hpp" +#include "lbann/callbacks/sync_layers.hpp" #include "lbann/layers/io/input/generic_input_layer.hpp" #include "lbann/utils/timer.hpp" diff --git a/src/callbacks/callback_sync_selected.cpp b/src/callbacks/sync_selected.cpp similarity index 99% rename from src/callbacks/callback_sync_selected.cpp rename to src/callbacks/sync_selected.cpp index 771a0cb165d..c123039e313 100644 --- a/src/callbacks/callback_sync_selected.cpp +++ b/src/callbacks/sync_selected.cpp @@ -26,7 +26,7 @@ // callback_sync_selected.cpp - Callback to synchronize selected layers /////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_sync_selected.hpp" +#include "lbann/callbacks/sync_selected.hpp" #include "lbann/utils/timer.hpp" #ifdef LBANN_NVPROF #include diff --git a/src/callbacks/callback_timeline.cpp b/src/callbacks/timeline.cpp similarity index 98% 
rename from src/callbacks/callback_timeline.cpp rename to src/callbacks/timeline.cpp index 12550b84923..63c5082ed64 100644 --- a/src/callbacks/callback_timeline.cpp +++ b/src/callbacks/timeline.cpp @@ -27,7 +27,7 @@ //////////////////////////////////////////////////////////////////////////////// #include -#include "lbann/callbacks/callback_timeline.hpp" +#include "lbann/callbacks/timeline.hpp" #include "lbann/utils/timer.hpp" namespace lbann { diff --git a/src/callbacks/callback_timer.cpp b/src/callbacks/timer.cpp similarity index 99% rename from src/callbacks/callback_timer.cpp rename to src/callbacks/timer.cpp index 500e171e34d..9449782c718 100644 --- a/src/callbacks/callback_timer.cpp +++ b/src/callbacks/timer.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the license. /////////////////////////////////////////////////////////////////////////////// -#include "lbann/callbacks/callback_timer.hpp" +#include "lbann/callbacks/timer.hpp" #include "lbann/utils/timer.hpp" #include diff --git a/src/callbacks/callback_variable_minibatch.cpp b/src/callbacks/variable_minibatch.cpp similarity index 99% rename from src/callbacks/callback_variable_minibatch.cpp rename to src/callbacks/variable_minibatch.cpp index 81e3f816bcc..6798f8dd2f6 100644 --- a/src/callbacks/callback_variable_minibatch.cpp +++ b/src/callbacks/variable_minibatch.cpp @@ -28,7 +28,7 @@ #include -#include "lbann/callbacks/callback_variable_minibatch.hpp" +#include "lbann/callbacks/variable_minibatch.hpp" #include "lbann/layers/io/input/input_layer.hpp" namespace lbann { diff --git a/src/models/model.cpp b/src/models/model.cpp index 75610c7057f..e4dccc7bd68 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -26,7 +26,7 @@ #include "lbann/models/model.hpp" #include "lbann/callbacks/callback.hpp" -#include "lbann/callbacks/callback_save_model.hpp" +#include "lbann/callbacks/save_model.hpp" #include "lbann/io/persist.hpp" #include "lbann/layers/io/input/generic_input_layer.hpp" #include "lbann/layers/transform/dummy.hpp" diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index e42f76de2b5..e243b1231f6 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -110,7 +110,7 @@ message CallbackSaveImages { } message CallbackPrint { - int64 interval = 1; //default in lbann_callback_print.hpp is 1 + int64 interval = 1; //default in lbann/callbacks/print.hpp is 1 bool print_global_stat_only = 2; //useful in large scale multi-trainer, default is false } @@ -124,8 +124,8 @@ message CallbackTimer { message CallbackSummary { string dir = 1; //directory for the lbann_summary - int64 batch_interval = 2; //default in lbann_callback_summary.hpp is 1 - int64 mat_interval = 3; //default in lbann_callback_summary.hpp is 25 + int64 batch_interval = 2; //default in lbann/callbacks/summary.hpp is 1 + int64 mat_interval = 3; //default in lbann/callbacks/summary.hpp is 25 } message CallbackDumpWeights { diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 493f2304ea2..6cc41e87aa6 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -26,42 +26,42 @@ // Get the declarations of all the builders for registration #include "lbann/callbacks/callback.hpp" -#include "lbann/callbacks/callback_check_dataset.hpp" -#include "lbann/callbacks/callback_check_gradients.hpp" -#include "lbann/callbacks/callback_check_init.hpp" -#include "lbann/callbacks/callback_check_metric.hpp" -#include 
"lbann/callbacks/callback_checknan.hpp" -#include "lbann/callbacks/callback_checkpoint.hpp" -#include "lbann/callbacks/callback_checksmall.hpp" -#include "lbann/callbacks/callback_confusion_matrix.hpp" -#include "lbann/callbacks/callback_debug.hpp" -#include "lbann/callbacks/callback_debug_io.hpp" -#include "lbann/callbacks/callback_dump_error_signals.hpp" -#include "lbann/callbacks/callback_dump_gradients.hpp" -#include "lbann/callbacks/callback_dump_minibatch_sample_indices.hpp" -#include "lbann/callbacks/callback_dump_outputs.hpp" -#include "lbann/callbacks/callback_dump_weights.hpp" -#include "lbann/callbacks/callback_early_stopping.hpp" -#include "lbann/callbacks/callback_gpu_memory_usage.hpp" -#include "lbann/callbacks/callback_hang.hpp" -#include "lbann/callbacks/callback_imcomm.hpp" -#include "lbann/callbacks/callback_io.hpp" -#include "lbann/callbacks/callback_learning_rate.hpp" -#include "lbann/callbacks/callback_ltfb.hpp" -#include "lbann/callbacks/callback_mixup.hpp" -#include "lbann/callbacks/callback_perturb_adam.hpp" -#include "lbann/callbacks/callback_perturb_dropout.hpp" -#include "lbann/callbacks/callback_print.hpp" -#include "lbann/callbacks/callback_replace_weights.hpp" -#include "lbann/callbacks/callback_save_images.hpp" -#include "lbann/callbacks/callback_save_model.hpp" -#include "lbann/callbacks/callback_save_topk_models.hpp" -#include "lbann/callbacks/callback_summary.hpp" -#include "lbann/callbacks/callback_sync_layers.hpp" -#include "lbann/callbacks/callback_sync_selected.hpp" -#include "lbann/callbacks/callback_timeline.hpp" -#include "lbann/callbacks/callback_timer.hpp" -#include "lbann/callbacks/callback_variable_minibatch.hpp" +#include "lbann/callbacks/check_dataset.hpp" +#include "lbann/callbacks/check_gradients.hpp" +#include "lbann/callbacks/check_init.hpp" +#include "lbann/callbacks/check_metric.hpp" +#include "lbann/callbacks/check_nan.hpp" +#include "lbann/callbacks/check_small.hpp" +#include "lbann/callbacks/checkpoint.hpp" +#include "lbann/callbacks/confusion_matrix.hpp" +#include "lbann/callbacks/debug.hpp" +#include "lbann/callbacks/debug_io.hpp" +#include "lbann/callbacks/dump_error_signals.hpp" +#include "lbann/callbacks/dump_gradients.hpp" +#include "lbann/callbacks/dump_minibatch_sample_indices.hpp" +#include "lbann/callbacks/dump_outputs.hpp" +#include "lbann/callbacks/dump_weights.hpp" +#include "lbann/callbacks/early_stopping.hpp" +#include "lbann/callbacks/gpu_memory_usage.hpp" +#include "lbann/callbacks/hang.hpp" +#include "lbann/callbacks/imcomm.hpp" +#include "lbann/callbacks/io.hpp" +#include "lbann/callbacks/learning_rate.hpp" +#include "lbann/callbacks/ltfb.hpp" +#include "lbann/callbacks/mixup.hpp" +#include "lbann/callbacks/perturb_adam.hpp" +#include "lbann/callbacks/perturb_dropout.hpp" +#include "lbann/callbacks/print.hpp" +#include "lbann/callbacks/replace_weights.hpp" +#include "lbann/callbacks/save_images.hpp" +#include "lbann/callbacks/save_model.hpp" +#include "lbann/callbacks/save_topk_models.hpp" +#include "lbann/callbacks/summary.hpp" +#include "lbann/callbacks/sync_layers.hpp" +#include "lbann/callbacks/sync_selected.hpp" +#include "lbann/callbacks/timeline.hpp" +#include "lbann/callbacks/timer.hpp" +#include "lbann/callbacks/variable_minibatch.hpp" #include "lbann/proto/factories.hpp" #include "lbann/proto/proto_helpers.hpp" diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp index 3fde7557063..c8e7ab33f25 100644 --- a/src/utils/lbann_library.cpp +++ b/src/utils/lbann_library.cpp @@ -25,7 +25,7 @@ 
//////////////////////////////////////////////////////////////////////////////// #include "lbann/utils/lbann_library.hpp" -#include "lbann/callbacks/callback_checkpoint.hpp" +#include "lbann/callbacks/checkpoint.hpp" namespace lbann { From 248effbdc0a050bf35a25b08bcabd9019af63854 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 22 Jul 2019 13:35:15 -0700 Subject: [PATCH 149/634] remove accidentally-added file --- src/proto/callback.proto | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/proto/callback.proto diff --git a/src/proto/callback.proto b/src/proto/callback.proto deleted file mode 100644 index e69de29bb2d..00000000000 From c55ce4961ffcbcb19262c5629306240fa9fc02ab Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 22 Jul 2019 14:13:17 -0700 Subject: [PATCH 150/634] move function implementations to base.cpp --- include/lbann/base.hpp | 55 ++++++------------------------------------ src/base.cpp | 50 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 47 deletions(-) diff --git a/include/lbann/base.hpp b/include/lbann/base.hpp index bfe52cec140..38b156fa3ac 100644 --- a/include/lbann/base.hpp +++ b/include/lbann/base.hpp @@ -119,40 +119,12 @@ enum class matrix_format {MC_MR, CIRC_CIRC, STAR_STAR, STAR_VC, MC_STAR, invalid /// Data layout that is optimized for different modes of parallelism enum class data_layout {MODEL_PARALLEL, DATA_PARALLEL, invalid}; -inline matrix_format data_layout_to_matrix_format(data_layout layout) { - matrix_format format; - switch(layout) { - case data_layout::MODEL_PARALLEL: - format = matrix_format::MC_MR; - break; - case data_layout::DATA_PARALLEL: - /// Weights are stored in STAR_STAR and data in STAR_VC - format = matrix_format::STAR_STAR; - break; - default: - throw std::runtime_error("Invalid data layout selected"); - } - return format; -} +matrix_format data_layout_to_matrix_format(data_layout layout); /// Neural network execution mode enum class execution_mode {training, validation, testing, prediction, invalid}; -inline std::string to_string(execution_mode m) { - switch(m) { - case execution_mode::training: - return "training"; - case execution_mode::validation: - return "validation"; - case execution_mode::testing: - return "testing"; - case execution_mode::prediction: - return "prediction"; - case execution_mode::invalid: - return "invalid"; - default: - throw std::runtime_error("Invalid execution mode specified"); - } -} +std::string to_string(execution_mode m); + /** @brief Convert a string to an execution_mode. */ execution_mode exe_mode_from_string(std::string const& str); /** @brief Extract an execution_mode from a stream. 
*/ @@ -173,26 +145,15 @@ enum class data_reader_target_mode {CLASSIFICATION, REGRESSION, RECONSTRUCTION, * It checks if the string 'mainStr' ends with given string * 'toMatch' */ -inline bool endsWith(const std::string mainStr, const std::string &toMatch) -{ - if(mainStr.size() >= toMatch.size() && - mainStr.compare(mainStr.size() - toMatch.size(), toMatch.size(), toMatch) == 0) - return true; - else - return false; -} +bool endsWith(const std::string mainStr, const std::string &toMatch); /// Print the dimensions and name of a Elemental matrix -inline void print_matrix_dims(AbsDistMat *m, const char *name) { - std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; -} -#define PRINT_MATRIX_DIMS(x) print_matrix_dims(x, #x); +void print_matrix_dims(AbsDistMat *m, const char *name); +#define LBANN_PRINT_MATRIX_DIMS(x) print_matrix_dims(x, #x); /// Print the dimensions and name of a Elemental matrix -inline void print_local_matrix_dims(AbsMat *m, const char *name) { - std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; -} -#define PRINT_LOCAL_MATRIX_DIMS(x) print_local_matrix_dims(x, #x); +void print_local_matrix_dims(AbsMat *m, const char *name); +#define LBANN_PRINT_LOCAL_MATRIX_DIMS(x) print_local_matrix_dims(x, #x); #define LBANN_MAKE_STR_(x) #x #define LBANN_MAKE_STR(x) LBANN_MAKE_STR_(x) diff --git a/src/base.cpp b/src/base.cpp index f0d4a4bdfc9..ab2fd2e106f 100644 --- a/src/base.cpp +++ b/src/base.cpp @@ -108,6 +108,39 @@ std::string get_pool_mode_name(pool_mode m) { return pool_mode_names[(int)m]; } +matrix_format data_layout_to_matrix_format(data_layout layout) { + matrix_format format; + switch(layout) { + case data_layout::MODEL_PARALLEL: + format = matrix_format::MC_MR; + break; + case data_layout::DATA_PARALLEL: + /// Weights are stored in STAR_STAR and data in STAR_VC + format = matrix_format::STAR_STAR; + break; + default: + throw std::runtime_error("Invalid data layout selected"); + } + return format; +} + +std::string to_string(execution_mode m) { + switch(m) { + case execution_mode::training: + return "training"; + case execution_mode::validation: + return "validation"; + case execution_mode::testing: + return "testing"; + case execution_mode::prediction: + return "prediction"; + case execution_mode::invalid: + return "invalid"; + default: + throw std::runtime_error("Invalid execution mode specified"); + } +} + execution_mode exe_mode_from_string(std::string const& str) { if (str == "training" || str == "train") return execution_mode::training; @@ -130,4 +163,21 @@ std::istream& operator>>(std::istream& is, execution_mode& m) { return is; } +bool endsWith(const std::string mainStr, const std::string &toMatch) +{ + if(mainStr.size() >= toMatch.size() && + mainStr.compare(mainStr.size() - toMatch.size(), toMatch.size(), toMatch) == 0) + return true; + else + return false; +} + +void print_matrix_dims(AbsDistMat *m, const char *name) { + std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; +} + +void print_local_matrix_dims(AbsMat *m, const char *name) { + std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; +} + } // namespace lbann From eb217f7212d753a4d7d10e07dcbb1aadec2db58e Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Mon, 22 Jul 2019 14:16:46 -0700 Subject: [PATCH 151/634] clean up error reporting/handling in base.cpp --- include/lbann/base.hpp | 4 +++- src/base.cpp | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/lbann/base.hpp b/include/lbann/base.hpp index 38b156fa3ac..97f33e93583 100644 --- a/include/lbann/base.hpp +++ b/include/lbann/base.hpp @@ -39,7 +39,9 @@ #include "lbann/utils/omp_pragma.hpp" #include -#include +#include +#include +#include namespace lbann { diff --git a/src/base.cpp b/src/base.cpp index ab2fd2e106f..643a42fe1be 100644 --- a/src/base.cpp +++ b/src/base.cpp @@ -37,6 +37,7 @@ #endif #include "lbann/comm.hpp" +#include "lbann/utils/exception.hpp" #include "lbann/utils/random.hpp" #include "lbann/utils/omp_diagnostics.hpp" #include "lbann/utils/stack_trace.hpp" @@ -45,6 +46,10 @@ #include "lbann/utils/cudnn.hpp" #endif +#include +#include +#include + namespace lbann { world_comm_ptr initialize(int& argc, char**& argv, int seed) { @@ -102,8 +107,7 @@ static std::vector pool_mode_names = { "invalid", "max", "average", /** returns a string representation of the pool_mode */ std::string get_pool_mode_name(pool_mode m) { if ((int)m < 1 or (int)m >= (int)pool_mode_names.size()) { - throw(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: " - + " Invalid pool_mode"); + LBANN_ERROR("Invalid pool_mode"); } return pool_mode_names[(int)m]; } @@ -119,7 +123,7 @@ matrix_format data_layout_to_matrix_format(data_layout layout) { format = matrix_format::STAR_STAR; break; default: - throw std::runtime_error("Invalid data layout selected"); + LBANN_ERROR("Invalid data layout selected"); } return format; } @@ -137,7 +141,7 @@ std::string to_string(execution_mode m) { case execution_mode::invalid: return "invalid"; default: - throw std::runtime_error("Invalid execution mode specified"); + LBANN_ERROR("Invalid execution mode specified"); } } @@ -173,11 +177,13 @@ bool endsWith(const std::string mainStr, const std::string &toMatch) } void print_matrix_dims(AbsDistMat *m, const char *name) { - std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; + std::cout << "DISPLAY MATRIX: " << name << " = " + << m->Height() << " x " << m->Width() << std::endl; } void print_local_matrix_dims(AbsMat *m, const char *name) { - std::cout << "DISPLAY MATRIX: " << name << " = " << m->Height() << " x " << m->Width() << std::endl; + std::cout << "DISPLAY MATRIX: " << name << " = " + << m->Height() << " x " << m->Width() << std::endl; } } // namespace lbann From fcf19a5b160254e48753346e6001503c0bb7323a Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 22 Jul 2019 14:17:52 -0700 Subject: [PATCH 152/634] improve macro naming and implementation to assuage Tim's fears of collisions and/or misuse --- include/lbann/callbacks/callback.hpp | 8 ++++---- include/lbann/callbacks/callback_check_dataset.hpp | 2 +- include/lbann/callbacks/callback_check_init.hpp | 2 +- include/lbann/callbacks/callback_checknan.hpp | 2 +- include/lbann/callbacks/callback_checksmall.hpp | 2 +- include/lbann/callbacks/callback_gpu_memory_usage.hpp | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/lbann/callbacks/callback.hpp b/include/lbann/callbacks/callback.hpp index 3392d352915..b920998ca85 100644 --- a/include/lbann/callbacks/callback.hpp +++ b/include/lbann/callbacks/callback.hpp @@ -36,10 +36,10 @@ // A utility macro for easily adding default-constructed sub-class // builders. 
-#define ADD_DEFAULT_CALLBACK_BUILDER(Class, FunctionName) \ - inline std::unique_ptr FunctionName( \ - const google::protobuf::Message&, lbann_summary*) { \ - return make_unique(); \ +#define LBANN_ADD_DEFAULT_CALLBACK_BUILDER(Class, FunctionName) \ + inline std::unique_ptr FunctionName( \ + const google::protobuf::Message&, lbann_summary*) { \ + return lbann::make_unique(); \ } namespace lbann { diff --git a/include/lbann/callbacks/callback_check_dataset.hpp b/include/lbann/callbacks/callback_check_dataset.hpp index d8b513eb9bf..4635acacd4a 100644 --- a/include/lbann/callbacks/callback_check_dataset.hpp +++ b/include/lbann/callbacks/callback_check_dataset.hpp @@ -69,7 +69,7 @@ class lbann_callback_check_dataset : public lbann_callback { }; // Builder function -ADD_DEFAULT_CALLBACK_BUILDER( +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( lbann_callback_check_dataset, build_callback_check_dataset_from_pbuf); } // namespace lbann diff --git a/include/lbann/callbacks/callback_check_init.hpp b/include/lbann/callbacks/callback_check_init.hpp index 6acafc01cc2..12a853845f4 100644 --- a/include/lbann/callbacks/callback_check_init.hpp +++ b/include/lbann/callbacks/callback_check_init.hpp @@ -54,7 +54,7 @@ class lbann_callback_check_init : public lbann_callback { }; // Builder function -ADD_DEFAULT_CALLBACK_BUILDER( +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( lbann_callback_check_init, build_callback_check_init_from_pbuf) } // namespace lbann diff --git a/include/lbann/callbacks/callback_checknan.hpp b/include/lbann/callbacks/callback_checknan.hpp index b76c4e9d382..8be09d90403 100644 --- a/include/lbann/callbacks/callback_checknan.hpp +++ b/include/lbann/callbacks/callback_checknan.hpp @@ -62,7 +62,7 @@ class lbann_callback_checknan : public lbann_callback { }; // Builder function -ADD_DEFAULT_CALLBACK_BUILDER( +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( lbann_callback_checknan, build_callback_check_nan_from_pbuf) } // namespace lbann diff --git a/include/lbann/callbacks/callback_checksmall.hpp b/include/lbann/callbacks/callback_checksmall.hpp index c0ea4e3203c..c32bfa3ea90 100644 --- a/include/lbann/callbacks/callback_checksmall.hpp +++ b/include/lbann/callbacks/callback_checksmall.hpp @@ -68,7 +68,7 @@ class lbann_callback_checksmall : public lbann_callback { }; // Builder function -ADD_DEFAULT_CALLBACK_BUILDER( +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( lbann_callback_checksmall, build_callback_check_small_from_pbuf) } // namespace lbann diff --git a/include/lbann/callbacks/callback_gpu_memory_usage.hpp b/include/lbann/callbacks/callback_gpu_memory_usage.hpp index 610d7d0bc66..93d459abe7b 100644 --- a/include/lbann/callbacks/callback_gpu_memory_usage.hpp +++ b/include/lbann/callbacks/callback_gpu_memory_usage.hpp @@ -47,7 +47,7 @@ class lbann_callback_gpu_memory_usage : public lbann_callback { }; // Builder function -ADD_DEFAULT_CALLBACK_BUILDER( +LBANN_ADD_DEFAULT_CALLBACK_BUILDER( lbann_callback_gpu_memory_usage, build_callback_gpu_memory_usage_from_pbuf); } // namespace lbann From 48fe69e93550556b2affba8b519a89ae5ededfe4 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Mon, 22 Jul 2019 14:24:37 -0700 Subject: [PATCH 153/634] rename proto_helpers to just helpers --- include/lbann/proto/CMakeLists.txt | 1 + include/lbann/proto/{proto_helpers.hpp => helpers.hpp} | 10 +++++----- src/proto/CMakeLists.txt | 2 +- src/proto/factories/callback_factory.cpp | 4 ++-- src/proto/{proto_helpers.cpp => helpers.cpp} | 6 +++--- 5 files changed, 12 insertions(+), 11 deletions(-) rename include/lbann/proto/{proto_helpers.hpp => helpers.hpp} (90%) rename src/proto/{proto_helpers.cpp => helpers.cpp} (96%) diff --git a/include/lbann/proto/CMakeLists.txt b/include/lbann/proto/CMakeLists.txt index 59dbee3097d..c69357ce404 100644 --- a/include/lbann/proto/CMakeLists.txt +++ b/include/lbann/proto/CMakeLists.txt @@ -2,6 +2,7 @@ set_full_path(THIS_DIR_HEADERS init_image_data_readers.hpp proto_common.hpp + helpers.hpp ) # Propagate the files up the tree diff --git a/include/lbann/proto/proto_helpers.hpp b/include/lbann/proto/helpers.hpp similarity index 90% rename from include/lbann/proto/proto_helpers.hpp rename to include/lbann/proto/helpers.hpp index 0c57cb5f278..c28861f7bfd 100644 --- a/include/lbann/proto/proto_helpers.hpp +++ b/include/lbann/proto/helpers.hpp @@ -24,8 +24,8 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#ifndef LBANN_PROTO_PROTO_HELPERS_HPP_INCLUDED -#define LBANN_PROTO_PROTO_HELPERS_HPP_INCLUDED +#ifndef LBANN_PROTO_HELPERS_HPP_INCLUDED +#define LBANN_PROTO_HELPERS_HPP_INCLUDED #include @@ -48,7 +48,7 @@ template using generate_builder_type = typename GenerateBuilderType_struct::type; -namespace proto_helpers +namespace helpers { /** @brief Get a "derived type" message from the given message. */ @@ -56,7 +56,7 @@ google::protobuf::Message const& get_oneof_message( google::protobuf::Message const& msg_in, std::string const& oneof_name); -}// namespace proto_helpers +}// namespace helpers }// namespace proto }// namespace lbann -#endif /* LBANN_PROTO_PROTO_HELPERS_HPP_INCLUDED */ +#endif /* LBANN_PROTO_HELPERS_HPP_INCLUDED */ diff --git a/src/proto/CMakeLists.txt b/src/proto/CMakeLists.txt index 0f8cc593494..fa9473100e5 100644 --- a/src/proto/CMakeLists.txt +++ b/src/proto/CMakeLists.txt @@ -60,7 +60,7 @@ endif (LBANN_HAS_PROTOBUF) set_full_path(THIS_DIR_SOURCES init_image_data_readers.cpp proto_common.cpp - proto_helpers.cpp + helpers.cpp ) # Add the subdirectories diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 9d6b24d5f72..9a36bc62852 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -64,7 +64,7 @@ #include "lbann/callbacks/callback_variable_minibatch.hpp" #include "lbann/proto/factories.hpp" -#include "lbann/proto/proto_helpers.hpp" +#include "lbann/proto/helpers.hpp" #include "lbann/utils/factory.hpp" #include "lbann/utils/memory.hpp" @@ -204,7 +204,7 @@ construct_callback( auto const& factory = get_callback_factory(); auto const& msg = - proto_helpers::get_oneof_message(proto_msg, "callback_type"); + helpers::get_oneof_message(proto_msg, "callback_type"); return factory.create_object(msg.GetDescriptor()->name(), msg, summarizer); } diff --git a/src/proto/proto_helpers.cpp b/src/proto/helpers.cpp similarity index 96% rename from src/proto/proto_helpers.cpp rename to src/proto/helpers.cpp index a9ea573c136..a9494a1f412 100644 --- a/src/proto/proto_helpers.cpp +++ b/src/proto/helpers.cpp @@ -24,7 +24,7 @@ // permissions and limitations under the 
license. //////////////////////////////////////////////////////////////////////////////// -#include "lbann/proto/proto_helpers.hpp" +#include "lbann/proto/helpers.hpp" #include "lbann/utils/exception.hpp" #include @@ -34,7 +34,7 @@ namespace lbann { namespace proto { -namespace proto_helpers { +namespace helpers { google::protobuf::Message const& get_oneof_message( @@ -64,6 +64,6 @@ get_oneof_message( return reflex->GetMessage(msg_in, oneof_field); } -}// namespace proto_helpers +}// namespace helpers }// namespace proto }// namespace lbann From 86c04af486e2b2808bd3848f4f359aa156a4c141 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 22 Jul 2019 15:54:33 -0700 Subject: [PATCH 154/634] move the concrete callback messages inside the main callback message --- src/callbacks/callback_check_gradients.cpp | 2 +- src/callbacks/callback_check_metric.cpp | 2 +- src/callbacks/callback_checkpoint.cpp | 2 +- src/callbacks/callback_confusion_matrix.cpp | 2 +- src/callbacks/callback_debug.cpp | 2 +- src/callbacks/callback_debug_io.cpp | 2 +- src/callbacks/callback_dump_error_signals.cpp | 2 +- src/callbacks/callback_dump_gradients.cpp | 2 +- ...callback_dump_minibatch_sample_indices.cpp | 2 +- src/callbacks/callback_dump_outputs.cpp | 2 +- src/callbacks/callback_dump_weights.cpp | 2 +- src/callbacks/callback_early_stopping.cpp | 2 +- src/callbacks/callback_hang.cpp | 2 +- src/callbacks/callback_imcomm.cpp | 2 +- src/callbacks/callback_io.cpp | 2 +- src/callbacks/callback_learning_rate.cpp | 12 +- src/callbacks/callback_ltfb.cpp | 2 +- src/callbacks/callback_mixup.cpp | 2 +- src/callbacks/callback_perturb_adam.cpp | 2 +- src/callbacks/callback_perturb_dropout.cpp | 2 +- src/callbacks/callback_print.cpp | 2 +- src/callbacks/callback_replace_weights.cpp | 2 +- src/callbacks/callback_save_images.cpp | 2 +- src/callbacks/callback_save_model.cpp | 2 +- src/callbacks/callback_save_topk_models.cpp | 2 +- src/callbacks/callback_summary.cpp | 2 +- src/callbacks/callback_sync_layers.cpp | 2 +- src/callbacks/callback_sync_selected.cpp | 2 +- src/callbacks/callback_timeline.cpp | 2 +- src/callbacks/callback_variable_minibatch.cpp | 4 +- src/callbacks/profiler.cpp | 2 +- src/proto/callbacks.proto | 532 +++++++++--------- src/proto/factories/callback_factory.cpp | 2 +- 33 files changed, 304 insertions(+), 304 deletions(-) diff --git a/src/callbacks/callback_check_gradients.cpp b/src/callbacks/callback_check_gradients.cpp index 1e9cd248696..cd46d14c6c9 100644 --- a/src/callbacks/callback_check_gradients.cpp +++ b/src/callbacks/callback_check_gradients.cpp @@ -233,7 +233,7 @@ std::unique_ptr build_callback_check_gradients_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.step_size(), params.verbose(), params.error_on_failure()); diff --git a/src/callbacks/callback_check_metric.cpp b/src/callbacks/callback_check_metric.cpp index a0919854e68..caed2fca818 100644 --- a/src/callbacks/callback_check_metric.cpp +++ b/src/callbacks/callback_check_metric.cpp @@ -92,7 +92,7 @@ std::unique_ptr build_callback_check_metric_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); const auto& modes = parse_set(params.execution_modes()); return make_unique(params.metric(), diff --git a/src/callbacks/callback_checkpoint.cpp b/src/callbacks/callback_checkpoint.cpp index 0aac71d4633..7bf82b50642 100644 --- 
a/src/callbacks/callback_checkpoint.cpp +++ b/src/callbacks/callback_checkpoint.cpp @@ -326,7 +326,7 @@ std::unique_ptr build_callback_checkpoint_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.checkpoint_dir(), params.checkpoint_epochs(), params.checkpoint_steps(), diff --git a/src/callbacks/callback_confusion_matrix.cpp b/src/callbacks/callback_confusion_matrix.cpp index 42d37825b36..6bdf3ff9787 100644 --- a/src/callbacks/callback_confusion_matrix.cpp +++ b/src/callbacks/callback_confusion_matrix.cpp @@ -236,7 +236,7 @@ std::unique_ptr build_callback_confusion_matrix_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.prediction(), params.label(), params.prefix()); diff --git a/src/callbacks/callback_debug.cpp b/src/callbacks/callback_debug.cpp index 25382a361d2..cc15aaa6fa0 100644 --- a/src/callbacks/callback_debug.cpp +++ b/src/callbacks/callback_debug.cpp @@ -161,7 +161,7 @@ std::unique_ptr build_callback_debug_from_pbuf(const google::protobuf::Message& proto_msg, lbann_summary* summarizer) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); const auto& modes = parse_set(params.phase()); return make_unique(modes, summarizer); diff --git a/src/callbacks/callback_debug_io.cpp b/src/callbacks/callback_debug_io.cpp index 9f8dd5e2530..b8e38f06488 100644 --- a/src/callbacks/callback_debug_io.cpp +++ b/src/callbacks/callback_debug_io.cpp @@ -156,7 +156,7 @@ std::unique_ptr build_callback_debug_io_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); const auto& phase = exe_mode_from_string(params.phase()); const auto& lvl = params.lvl(); switch (phase) { diff --git a/src/callbacks/callback_dump_error_signals.cpp b/src/callbacks/callback_dump_error_signals.cpp index c4e1b5a4b2c..b9a4eb26871 100644 --- a/src/callbacks/callback_dump_error_signals.cpp +++ b/src/callbacks/callback_dump_error_signals.cpp @@ -56,7 +56,7 @@ std::unique_ptr build_callback_dump_error_signals_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.basename()); } diff --git a/src/callbacks/callback_dump_gradients.cpp b/src/callbacks/callback_dump_gradients.cpp index c8c0e0d34d1..dfceae93c6a 100644 --- a/src/callbacks/callback_dump_gradients.cpp +++ b/src/callbacks/callback_dump_gradients.cpp @@ -54,7 +54,7 @@ std::unique_ptr build_callback_dump_gradients_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.basename(), params.interval()); } diff --git a/src/callbacks/callback_dump_minibatch_sample_indices.cpp b/src/callbacks/callback_dump_minibatch_sample_indices.cpp index 13b390e8d90..4a5ae90b78e 100644 --- a/src/callbacks/callback_dump_minibatch_sample_indices.cpp +++ b/src/callbacks/callback_dump_minibatch_sample_indices.cpp @@ -81,7 +81,7 @@ std::unique_ptr build_callback_dump_mb_indices_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique( params.basename(), params.interval()); diff --git 
a/src/callbacks/callback_dump_outputs.cpp b/src/callbacks/callback_dump_outputs.cpp index f11ac2fbb44..5419738cb53 100644 --- a/src/callbacks/callback_dump_outputs.cpp +++ b/src/callbacks/callback_dump_outputs.cpp @@ -182,7 +182,7 @@ std::unique_ptr build_callback_dump_outputs_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); const auto& layer_names = parse_set(params.layers()); const auto& modes = parse_set(params.execution_modes()); diff --git a/src/callbacks/callback_dump_weights.cpp b/src/callbacks/callback_dump_weights.cpp index db0dae0db95..42a120b4b55 100644 --- a/src/callbacks/callback_dump_weights.cpp +++ b/src/callbacks/callback_dump_weights.cpp @@ -59,7 +59,7 @@ std::unique_ptr build_callback_dump_weights_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.basename()); } diff --git a/src/callbacks/callback_early_stopping.cpp b/src/callbacks/callback_early_stopping.cpp index 9b78d7904c6..4ac30a5ea45 100644 --- a/src/callbacks/callback_early_stopping.cpp +++ b/src/callbacks/callback_early_stopping.cpp @@ -64,7 +64,7 @@ std::unique_ptr build_callback_early_stopping_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.patience()); } diff --git a/src/callbacks/callback_hang.cpp b/src/callbacks/callback_hang.cpp index 891310de751..26572b9af87 100644 --- a/src/callbacks/callback_hang.cpp +++ b/src/callbacks/callback_hang.cpp @@ -47,7 +47,7 @@ std::unique_ptr build_callback_hang_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.rank()); } diff --git a/src/callbacks/callback_imcomm.cpp b/src/callbacks/callback_imcomm.cpp index d07385ab0f7..d6a6e269563 100644 --- a/src/callbacks/callback_imcomm.cpp +++ b/src/callbacks/callback_imcomm.cpp @@ -162,7 +162,7 @@ std::unique_ptr build_callback_imcomm_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary* summarizer) { - const auto& params = dynamic_cast(proto_msg); + const auto& params = dynamic_cast(proto_msg); const auto& type_str = params.intertrainer_comm_method(); lbann_callback_imcomm::comm_type type = lbann_callback_imcomm::comm_type::NONE; if (type_str == "none") { diff --git a/src/callbacks/callback_io.cpp b/src/callbacks/callback_io.cpp index 57143c64bb7..11d459e61bd 100644 --- a/src/callbacks/callback_io.cpp +++ b/src/callbacks/callback_io.cpp @@ -70,7 +70,7 @@ std::unique_ptr build_callback_disp_io_stats_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique( parse_list(params.layers())); } diff --git a/src/callbacks/callback_learning_rate.cpp b/src/callbacks/callback_learning_rate.cpp index b5da28a4168..bb68513b961 100644 --- a/src/callbacks/callback_learning_rate.cpp +++ b/src/callbacks/callback_learning_rate.cpp @@ -308,7 +308,7 @@ std::unique_ptr build_callback_step_learning_rate_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique( params.step(), params.amt(), @@ -319,7 +319,7 @@ std::unique_ptr 
build_callback_adaptive_learning_rate_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique( params.patience(), params.amt(), @@ -330,7 +330,7 @@ std::unique_ptr build_callback_drop_fixed_learning_rate_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); std::vector drop_epochs; for (int i = 0; i < params.drop_epoch_size(); ++i) { drop_epochs.push_back(params.drop_epoch(i)); @@ -344,7 +344,7 @@ build_callback_drop_fixed_learning_rate_from_pbuf( std::unique_ptr build_callback_linear_growth_learning_rate_from_pbuf( const google::protobuf::Message& proto_msg,lbann_summary*) { - using MsgType = lbann_data::CallbackLinearGrowthLearningRate; + using MsgType = lbann_data::Callback::CallbackLinearGrowthLearningRate; using CallbackType = lbann_callback_linear_growth_learning_rate; const auto& params = dynamic_cast(proto_msg); @@ -357,7 +357,7 @@ build_callback_linear_growth_learning_rate_from_pbuf( std::unique_ptr build_callback_optimizerwise_adaptive_learning_rate_from_pbuf( const google::protobuf::Message& proto_msg,lbann_summary*) { - using MsgType = lbann_data::CallbackOptimizerwiseAdaptiveLearningRate; + using MsgType = lbann_data::Callback::CallbackOptimizerwiseAdaptiveLearningRate; using CallbackType = lbann_callback_optimizerwise_adaptive_learning_rate; const auto& params = dynamic_cast(proto_msg); return make_unique(params.scale(), @@ -368,7 +368,7 @@ std::unique_ptr build_callback_poly_learning_rate_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique( params.power(), params.num_epochs(), diff --git a/src/callbacks/callback_ltfb.cpp b/src/callbacks/callback_ltfb.cpp index 2fe90186d1d..8f0d947d357 100644 --- a/src/callbacks/callback_ltfb.cpp +++ b/src/callbacks/callback_ltfb.cpp @@ -529,7 +529,7 @@ build_callback_ltfb_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary* summarizer) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique( params.batch_interval(), params.metric(), diff --git a/src/callbacks/callback_mixup.cpp b/src/callbacks/callback_mixup.cpp index f38b07a74c4..a5e5bf98f1b 100644 --- a/src/callbacks/callback_mixup.cpp +++ b/src/callbacks/callback_mixup.cpp @@ -101,7 +101,7 @@ std::unique_ptr build_callback_mixup_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); const auto& layers_list = parse_list(params.layers()); std::unordered_set layers(layers_list.begin(), layers_list.end()); diff --git a/src/callbacks/callback_perturb_adam.cpp b/src/callbacks/callback_perturb_adam.cpp index 65a32f8f0bf..3ccd61a656c 100644 --- a/src/callbacks/callback_perturb_adam.cpp +++ b/src/callbacks/callback_perturb_adam.cpp @@ -165,7 +165,7 @@ std::unique_ptr build_callback_perturb_adam_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique( params.learning_rate_factor(), params.beta1_factor(), diff --git a/src/callbacks/callback_perturb_dropout.cpp b/src/callbacks/callback_perturb_dropout.cpp index 9df485628bf..de306b0bdf4 100644 --- a/src/callbacks/callback_perturb_dropout.cpp +++ 
b/src/callbacks/callback_perturb_dropout.cpp @@ -121,7 +121,7 @@ std::unique_ptr build_callback_perturb_dropout_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique( params.keep_dropout_factor(), parse_set(params.layers())); diff --git a/src/callbacks/callback_print.cpp b/src/callbacks/callback_print.cpp index 672cb6ec223..f936223ddfd 100644 --- a/src/callbacks/callback_print.cpp +++ b/src/callbacks/callback_print.cpp @@ -250,7 +250,7 @@ std::unique_ptr build_callback_print_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.interval(), params.print_global_stat_only()); } diff --git a/src/callbacks/callback_replace_weights.cpp b/src/callbacks/callback_replace_weights.cpp index 5aa820aae1b..faf8ef34bc4 100644 --- a/src/callbacks/callback_replace_weights.cpp +++ b/src/callbacks/callback_replace_weights.cpp @@ -54,7 +54,7 @@ std::unique_ptr build_callback_replace_weights_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique( parse_list(params.source_layers()), parse_list(params.destination_layers()), diff --git a/src/callbacks/callback_save_images.cpp b/src/callbacks/callback_save_images.cpp index 4d320f5c5b7..0234151022c 100644 --- a/src/callbacks/callback_save_images.cpp +++ b/src/callbacks/callback_save_images.cpp @@ -160,7 +160,7 @@ std::unique_ptr build_callback_save_images_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique( parse_list<>(params.layers()), params.image_format(), diff --git a/src/callbacks/callback_save_model.cpp b/src/callbacks/callback_save_model.cpp index 5f2d917e883..5162fe8ca65 100644 --- a/src/callbacks/callback_save_model.cpp +++ b/src/callbacks/callback_save_model.cpp @@ -178,7 +178,7 @@ std::unique_ptr build_callback_save_model_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); if(params.extension().size() != 0) { return make_unique( params.dir(), diff --git a/src/callbacks/callback_save_topk_models.cpp b/src/callbacks/callback_save_topk_models.cpp index 9f3f0ed21a0..f986590ae9a 100644 --- a/src/callbacks/callback_save_topk_models.cpp +++ b/src/callbacks/callback_save_topk_models.cpp @@ -92,7 +92,7 @@ std::unique_ptr build_callback_save_topk_models_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique( params.dir(), params.k(), diff --git a/src/callbacks/callback_summary.cpp b/src/callbacks/callback_summary.cpp index 6e75c93b471..959214f2101 100644 --- a/src/callbacks/callback_summary.cpp +++ b/src/callbacks/callback_summary.cpp @@ -137,7 +137,7 @@ build_callback_summary_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary* summarizer) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(summarizer, params.batch_interval(), params.mat_interval()); diff --git a/src/callbacks/callback_sync_layers.cpp b/src/callbacks/callback_sync_layers.cpp index 6a7c674602d..61075337b57 100644 --- a/src/callbacks/callback_sync_layers.cpp +++ 
b/src/callbacks/callback_sync_layers.cpp @@ -65,7 +65,7 @@ std::unique_ptr build_callback_sync_layers_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.sync_gpus(), params.sync_mpi(), params.only_input()); diff --git a/src/callbacks/callback_sync_selected.cpp b/src/callbacks/callback_sync_selected.cpp index d16530d62ef..cec3f6b4a11 100644 --- a/src/callbacks/callback_sync_selected.cpp +++ b/src/callbacks/callback_sync_selected.cpp @@ -281,7 +281,7 @@ std::unique_ptr build_callback_sync_selected_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); const int num_layers = params.layer_to_sync_size(); if (num_layers == 0) { throw lbann_exception("sync_selected requires at least a layer " diff --git a/src/callbacks/callback_timeline.cpp b/src/callbacks/callback_timeline.cpp index b1eb8919a59..4d6f9077753 100644 --- a/src/callbacks/callback_timeline.cpp +++ b/src/callbacks/callback_timeline.cpp @@ -102,7 +102,7 @@ std::unique_ptr build_callback_timeline_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.directory()); } diff --git a/src/callbacks/callback_variable_minibatch.cpp b/src/callbacks/callback_variable_minibatch.cpp index 875aa4a071f..10a0d19d836 100644 --- a/src/callbacks/callback_variable_minibatch.cpp +++ b/src/callbacks/callback_variable_minibatch.cpp @@ -187,7 +187,7 @@ std::unique_ptr build_callback_step_minibatch_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.starting_mbsize(), params.step(), params.ramp_time()); @@ -197,7 +197,7 @@ std::unique_ptr build_callback_minibatch_schedule_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); std::vector steps; for (int i = 0; i < params.step_size(); ++i) { const auto& proto_step = params.step(i); diff --git a/src/callbacks/profiler.cpp b/src/callbacks/profiler.cpp index 76e2b93d1c9..fdf8771f4f2 100644 --- a/src/callbacks/profiler.cpp +++ b/src/callbacks/profiler.cpp @@ -197,7 +197,7 @@ std::unique_ptr build_callback_profiler_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = - dynamic_cast(proto_msg); + dynamic_cast(proto_msg); return make_unique(params.sync(), params.skip_init()); } diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index e42f76de2b5..e72773e864f 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -76,272 +76,272 @@ message Callback { CallbackEarlyStopping early_stopping = 43; CallbackTimeline timeline = 44; } -} - -message CallbackLTFB { - int64 batch_interval = 1; - string metric = 2; - string weights = 3; // default: all weights - bool low_score_wins = 4; - string communication_algorithm = 5; // default: "sendrecv_weights" - bool exchange_hyperparameters = 6; -} - -message CallbackStepLearningRate { - string weights = 1; //default: all weights - int64 step = 2; - double amt = 3; -} - -message CallbackCustomLearningRate { - //don't know how to support this, since it takes an std::function as an argument -} - -message CallbackAdaptiveLearningRate { - string weights = 1; //default: all 
weights - int64 patience = 2; - double amt = 3; -} - -message CallbackSaveImages { - string layers = 1; // Layer outputs to save as images - string image_format = 2; // Image format (e.g. jpg, png, pgm) - string image_prefix = 3; // Prefix for saved image files -} - -message CallbackPrint { - int64 interval = 1; //default in lbann_callback_print.hpp is 1 - bool print_global_stat_only = 2; //useful in large scale multi-trainer, default is false -} - -message CallbackProfiler { - bool sync = 1; - bool skip_init = 2; -} - -message CallbackTimer { -} - -message CallbackSummary { - string dir = 1; //directory for the lbann_summary - int64 batch_interval = 2; //default in lbann_callback_summary.hpp is 1 - int64 mat_interval = 3; //default in lbann_callback_summary.hpp is 25 -} - -message CallbackDumpWeights { - string basename = 1; -} - -message CallbackDumpOutputs { - string layers = 1; // Default: all layers - string execution_modes = 2; // Default: all modes - int64 batch_interval = 3; // Frequency for output dumping (default: all steps) - string directory = 4; // Directory for output files - string format = 5; // Options: csv, tsv, npy, npz (default: csv) -} - -message CallbackDumpErrorSignals { - string basename = 1; -} - -message CallbackDumpGradients { - string basename = 1; - int64 interval = 2; -} - -message CallbackDumpMBIndices { - string basename = 1; - int64 interval = 2; -} - -message CallbackDispIOStats { - string layers = 1; //e.g: "2 4 5"; use "10000" to apply to all layers -} - -message CallbackImComm { - string intertrainer_comm_method = 1; - bool all_optimizers = 2; -} - -message CallbackDebug { - string phase = 1; //should be called "modes" -} - -message CallbackDebugIO { - string phase = 1; - int32 lvl = 2; -} - -message CallbackCheckSmall { -} - -message CallbackCheckNaN { -} - -message CallbackCheckDataset { -} - -message CallbackHang { - int64 rank = 1; -} - -message CallbackDropFixedLearningRate { - string weights = 1; - repeated int64 drop_epoch = 2; - double amt = 3; -} - -message CallbackLinearGrowthLearningRate { - string weights = 1; - double target = 2; - int64 num_epochs = 3; - int64 delay = 4; -} - -message CallbackPolyLearningRate { - string weights = 1; - double power = 2; - uint64 num_epochs = 3; - uint64 max_iter = 4; - double end_lr = 5; -} - -message CallbackStepMinibatch { - int64 starting_mbsize = 1; - int64 step = 2; - int64 ramp_time = 3; -} - -message MinibatchScheduleStep { - int64 epoch = 1; - int64 mbsize = 2; - double lr = 3; - int64 ramp_time = 4; -} - -message CallbackOptimizerwiseAdaptiveLearningRate { - string weights = 1; - double scale = 2; -} - -message CallbackMinibatchSchedule { - int64 starting_mbsize = 1; - repeated MinibatchScheduleStep step = 2; -} - -message CallbackCheckGradients { - double step_size = 1; - bool verbose = 2; - bool error_on_failure = 3; // Throw error if gradient check fails -} - -message CallbackCheckMetric { - string metric = 1; - double lower_bound = 2; - double upper_bound = 3; - bool error_on_failure = 4; // Throw error if metric check fails - string execution_modes = 5; // Default: all modes -} - -message CallbackCheckpoint { - string checkpoint_dir = 1; - int64 checkpoint_epochs = 2; - int64 checkpoint_steps = 3; - double checkpoint_secs = 4; - string per_rank_dir = 5; - int64 ckpt_dist_epochs = 6; - int64 ckpt_dist_steps = 7; -} - - -message CallbackSaveModel { - string dir = 1; - string extension = 2; - bool disable_save_after_training = 3; -} - -message CallbackReplaceWeights { - string source_layers = 1; 
//set of layers to copy weights from - string destination_layers = 2; //set of layers to copy weights to - int64 batch_interval = 3; -} -message CallbackGPUMemoryUsage { -} - -message CallbackSyncLayers { - bool sync_gpus = 1; - bool sync_mpi = 2; - bool only_input = 3; -} - -message CallbackSyncSelected { - message LayerToSync { - enum PropDirection { - Both = 0; - Forward = 1; - Backward = 2; - } - string name = 1; // name of the layer to synchronize - PropDirection prop = 2; // propagation setep to synchronize + + message CallbackLTFB { + int64 batch_interval = 1; + string metric = 2; + string weights = 3; // default: all weights + bool low_score_wins = 4; + string communication_algorithm = 5; // default: "sendrecv_weights" + bool exchange_hyperparameters = 6; + } + + message CallbackStepLearningRate { + string weights = 1; //default: all weights + int64 step = 2; + double amt = 3; + } + + message CallbackCustomLearningRate { + //don't know how to support this, since it takes an std::function as an argument + } + + message CallbackAdaptiveLearningRate { + string weights = 1; //default: all weights + int64 patience = 2; + double amt = 3; + } + + message CallbackSaveImages { + string layers = 1; // Layer outputs to save as images + string image_format = 2; // Image format (e.g. jpg, png, pgm) + string image_prefix = 3; // Prefix for saved image files + } + + message CallbackPrint { + int64 interval = 1; //default in lbann_callback_print.hpp is 1 + bool print_global_stat_only = 2; //useful in large scale multi-trainer, default is false + } + + message CallbackProfiler { + bool sync = 1; + bool skip_init = 2; + } + + message CallbackTimer { + } + + message CallbackSummary { + string dir = 1; //directory for the lbann_summary + int64 batch_interval = 2; //default in lbann_callback_summary.hpp is 1 + int64 mat_interval = 3; //default in lbann_callback_summary.hpp is 25 + } + + message CallbackDumpWeights { + string basename = 1; + } + + message CallbackDumpOutputs { + string layers = 1; // Default: all layers + string execution_modes = 2; // Default: all modes + int64 batch_interval = 3; // Frequency for output dumping (default: all steps) + string directory = 4; // Directory for output files + string format = 5; // Options: csv, tsv, npy, npz (default: csv) + } + + message CallbackDumpErrorSignals { + string basename = 1; + } + + message CallbackDumpGradients { + string basename = 1; + int64 interval = 2; + } + + message CallbackDumpMBIndices { + string basename = 1; + int64 interval = 2; + } + + message CallbackDispIOStats { + string layers = 1; //e.g: "2 4 5"; use "10000" to apply to all layers + } + + message CallbackImComm { + string intertrainer_comm_method = 1; + bool all_optimizers = 2; + } + + message CallbackDebug { + string phase = 1; //should be called "modes" + } + + message CallbackDebugIO { + string phase = 1; + int32 lvl = 2; + } + + message CallbackCheckSmall { + } + + message CallbackCheckNaN { + } + + message CallbackCheckDataset { + } + + message CallbackHang { + int64 rank = 1; + } + + message CallbackDropFixedLearningRate { + string weights = 1; + repeated int64 drop_epoch = 2; + double amt = 3; } - message CudaProfilerSetup { - enum OutputMode { - KeyValuePair = 0; - CSV = 1; + message CallbackLinearGrowthLearningRate { + string weights = 1; + double target = 2; + int64 num_epochs = 3; + int64 delay = 4; + } + + message CallbackPolyLearningRate { + string weights = 1; + double power = 2; + uint64 num_epochs = 3; + uint64 max_iter = 4; + double end_lr = 5; + } + + 
message CallbackStepMinibatch { + int64 starting_mbsize = 1; + int64 step = 2; + int64 ramp_time = 3; + } + + message MinibatchScheduleStep { + int64 epoch = 1; + int64 mbsize = 2; + double lr = 3; + int64 ramp_time = 4; + } + + message CallbackOptimizerwiseAdaptiveLearningRate { + string weights = 1; + double scale = 2; + } + + message CallbackMinibatchSchedule { + int64 starting_mbsize = 1; + repeated MinibatchScheduleStep step = 2; + } + + message CallbackCheckGradients { + double step_size = 1; + bool verbose = 2; + bool error_on_failure = 3; // Throw error if gradient check fails + } + + message CallbackCheckMetric { + string metric = 1; + double lower_bound = 2; + double upper_bound = 3; + bool error_on_failure = 4; // Throw error if metric check fails + string execution_modes = 5; // Default: all modes + } + + message CallbackCheckpoint { + string checkpoint_dir = 1; + int64 checkpoint_epochs = 2; + int64 checkpoint_steps = 3; + double checkpoint_secs = 4; + string per_rank_dir = 5; + int64 ckpt_dist_epochs = 6; + int64 ckpt_dist_steps = 7; + } + + + message CallbackSaveModel { + string dir = 1; + string extension = 2; + bool disable_save_after_training = 3; + } + + message CallbackReplaceWeights { + string source_layers = 1; //set of layers to copy weights from + string destination_layers = 2; //set of layers to copy weights to + int64 batch_interval = 3; + } + message CallbackGPUMemoryUsage { + } + + message CallbackSyncLayers { + bool sync_gpus = 1; + bool sync_mpi = 2; + bool only_input = 3; + } + + message CallbackSyncSelected { + message LayerToSync { + enum PropDirection { + Both = 0; + Forward = 1; + Backward = 2; + } + string name = 1; // name of the layer to synchronize + PropDirection prop = 2; // propagation setep to synchronize } - bool no_init = 1; - string config_file = 2; - string output_dir = 3; - OutputMode output_mode = 4; - } - - bool async_gpus = 1; - bool async_mpi = 2; - repeated LayerToSync layer_to_sync = 3; - CudaProfilerSetup cuda_profiler_setup = 4; -} - -message CallbackConfusionMatrix { - string prediction = 1; // Prediction layer - string label = 2; // Label layer - string prefix = 3; // Prefix for output files -} - -message CallbackPerturbAdam { - float learning_rate_factor = 1; // Learning rate perturbation (in log space) - float beta1_factor = 2; // beta1 perturbation (in log space) - float beta2_factor = 3; // beta2 perturbation (in log space) - float eps_factor = 4; // eps perturbation (in log space) - bool perturb_during_training = 5; // Whether to periodically perturb during training - int64 batch_interval = 6; // Frequency of perturbation if perturb_during_training is true - string weights = 7; // Weights with Adam optimizer -} - -message CallbackPerturbDropout { - float keep_dropout_factor = 1; //Keep dropout prob perturbation (in log space) - string layers = 2; // dropout layers to perturb keep prob, all dropout layers by default -} - -message CallbackSaveTopKModels { - string dir = 1; //directory to save model - int32 k = 2; //number of (top) models to save - string metric = 3; //metrics to use in evaluating models - bool ascending_ordering = 4; //whether to sort metrics per model in ascending order, descending order is default -} - -message CallbackMixup { - string layers = 1; - float alpha = 2; -} - -message CallbackCheckInit { -} - -message CallbackEarlyStopping { - int64 patience = 1; -} - -message CallbackTimeline { - string directory = 1; -} + + message CudaProfilerSetup { + enum OutputMode { + KeyValuePair = 0; + CSV = 1; + } + bool 
no_init = 1; + string config_file = 2; + string output_dir = 3; + OutputMode output_mode = 4; + } + + bool async_gpus = 1; + bool async_mpi = 2; + repeated LayerToSync layer_to_sync = 3; + CudaProfilerSetup cuda_profiler_setup = 4; + } + + message CallbackConfusionMatrix { + string prediction = 1; // Prediction layer + string label = 2; // Label layer + string prefix = 3; // Prefix for output files + } + + message CallbackPerturbAdam { + float learning_rate_factor = 1; // Learning rate perturbation (in log space) + float beta1_factor = 2; // beta1 perturbation (in log space) + float beta2_factor = 3; // beta2 perturbation (in log space) + float eps_factor = 4; // eps perturbation (in log space) + bool perturb_during_training = 5; // Whether to periodically perturb during training + int64 batch_interval = 6; // Frequency of perturbation if perturb_during_training is true + string weights = 7; // Weights with Adam optimizer + } + + message CallbackPerturbDropout { + float keep_dropout_factor = 1; //Keep dropout prob perturbation (in log space) + string layers = 2; // dropout layers to perturb keep prob, all dropout layers by default + } + + message CallbackSaveTopKModels { + string dir = 1; //directory to save model + int32 k = 2; //number of (top) models to save + string metric = 3; //metrics to use in evaluating models + bool ascending_ordering = 4; //whether to sort metrics per model in ascending order, descending order is default + } + + message CallbackMixup { + string layers = 1; + float alpha = 2; + } + + message CallbackCheckInit { + } + + message CallbackEarlyStopping { + int64 patience = 1; + } + + message CallbackTimeline { + string directory = 1; + } +} \ No newline at end of file diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 9a36bc62852..8f7d4e287d2 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -216,7 +216,7 @@ lbann_summary* construct_summarizer(lbann_comm* comm, for (int j=0; j Date: Mon, 22 Jul 2019 16:41:29 -0700 Subject: [PATCH 155/634] Minor changes --- bamboo/allocate_and_run.sh | 14 +++++++------- bamboo/common_python/tools.py | 2 +- bamboo/full_alexnet_clang6/README.md | 1 + bamboo/full_alexnet_gcc7/README.md | 1 + bamboo/full_alexnet_intel19/README.md | 1 + bamboo/integration_tests/common_code.py | 10 ++++++---- bamboo/unit_tests/test_unit_lbann2_reload.py | 2 +- docs/continuous_integration.rst | 2 +- 8 files changed, 19 insertions(+), 14 deletions(-) create mode 100644 bamboo/full_alexnet_clang6/README.md create mode 100644 bamboo/full_alexnet_gcc7/README.md create mode 100644 bamboo/full_alexnet_intel19/README.md diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index b7f14295461..bc364071f67 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -35,19 +35,19 @@ fi if [ "${CLUSTER}" = 'lassen' ]; then ALLOCATION_TIME_LIMIT=600 if [ ${WEEKLY} -ne 0 ]; then - timeout 24h bsub -G guests -Is -q pbatch -nnodes 16 -W $ALLOCATION_TIME_LIMIT ./run.sh --weekly + timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 16 -W $ALLOCATION_TIME_LIMIT ./run.sh --weekly else - timeout 24h bsub -G guests -Is -q pbatch -nnodes 16 -W $ALLOCATION_TIME_LIMIT ./run.sh + timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 16 -W $ALLOCATION_TIME_LIMIT ./run.sh fi elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then if [ ${WEEKLY} -ne 0 ]; then ALLOCATION_TIME_LIMIT=720 - timeout 24h 
salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh --weekly + timeout -k 5 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh --weekly if [ "${CLUSTER}" = 'catalyst' ]; then cd integration_tests - python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=alexnet_clang6_results.xml - python -m pytest -s test_integration_performance_full_alexnet_gcc7 --weekly --run --junitxml=alexnet_gcc7_results.xml - # python -m pytest -s test_integration_performance_full_alexnet_intel19 --weekly --run --junitxml=alexnet_intel19_results.xml + python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=../full_alexnet_clang6/results.xml + python -m pytest -s test_integration_performance_full_alexnet_gcc7 --weekly --run --junitxml=../full_alexnet_gcc7/results.xml + # python -m pytest -s test_integration_performance_full_alexnet_intel19 --weekly --run --junitxml=../full_alexnet_intel19/results.xml cd .. fi else @@ -56,6 +56,6 @@ elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTE elif [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then ALLOCATION_TIME_LIMIT=660 fi - timeout 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh + timeout -k 5 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh fi fi diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 5fe16d4be4e..7f48f57461a 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -72,7 +72,7 @@ def get_command(cluster, else: raise Exception('Unsupported Cluster: %s' % cluster) - MAX_TIME = 60 + MAX_TIME = 600 # Description of command line options are from the appropriate command's # man pages if scheduler == 'slurm': diff --git a/bamboo/full_alexnet_clang6/README.md b/bamboo/full_alexnet_clang6/README.md new file mode 100644 index 00000000000..6672d2ab7b0 --- /dev/null +++ b/bamboo/full_alexnet_clang6/README.md @@ -0,0 +1 @@ +Directory for results.xml for full_alexnet_clang6. \ No newline at end of file diff --git a/bamboo/full_alexnet_gcc7/README.md b/bamboo/full_alexnet_gcc7/README.md new file mode 100644 index 00000000000..a518e84799e --- /dev/null +++ b/bamboo/full_alexnet_gcc7/README.md @@ -0,0 +1 @@ +Directory for results.xml for full_alexnet_gcc7. \ No newline at end of file diff --git a/bamboo/full_alexnet_intel19/README.md b/bamboo/full_alexnet_intel19/README.md new file mode 100644 index 00000000000..0fe9ebc203b --- /dev/null +++ b/bamboo/full_alexnet_intel19/README.md @@ -0,0 +1 @@ +Directory for results.xml for full_alexnet_intel19. 
\ No newline at end of file diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index a939f0effc4..37da5fa3850 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -8,14 +8,16 @@ def get_command(cluster, dir_name, model_folder, model_name, executable, output_file_name, error_file_name, compiler_name, weekly=False): if model_name in ['alexnet', 'conv_autoencoder_imagenet']: - data_reader_percent = 0.01 - # If doing weekly testing, increase data_reader_percent if weekly: data_reader_percent = 0.10 + time_limit = 600 + else: + data_reader_percent = 0.01 + time_limit = 60 if cluster == 'lassen': command = tools.get_command( cluster=cluster, executable=executable, num_nodes=16, - partition='pbatch', time_limit=600, num_processes=32, + partition='pbatch', time_limit=time_limit, num_processes=32, dir_name=dir_name, data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/', data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt', @@ -28,7 +30,7 @@ def get_command(cluster, dir_name, model_folder, model_name, executable, else: command = tools.get_command( cluster=cluster, executable=executable, num_nodes=16, - partition='pbatch', time_limit=600, num_processes=32, + partition='pbatch', time_limit=time_limit, num_processes=32, dir_name=dir_name, data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/', data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt', diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index 1ff1ef76635..7a2307fa92b 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -125,7 +125,7 @@ def test_unit_lbann2_reload_clang6(cluster, exes, dirname): def test_unit_lbann2_reload_gcc7(cluster, exes, dirname): - if cluster in ['catalyst', 'lassen', 'pascal']: # STILL ERRORS + if cluster in ['catalyst', 'corona', 'lassen', 'pascal']: # STILL ERRORS pytest.skip('FIXME') skeleton_lbann2_reload(cluster, exes, dirname, 'gcc7') diff --git a/docs/continuous_integration.rst b/docs/continuous_integration.rst index 396798bc532..8735fde0b3f 100644 --- a/docs/continuous_integration.rst +++ b/docs/continuous_integration.rst @@ -165,7 +165,7 @@ Bamboo agent properties are used to specify requirements for each job. +--------------------------------+-------------+--------------+----------+------------------+------------------------+ | Pascal Agents (x86_gpu_pascal) | lbannusr | x86_64 | pascal | pascal | chaos_6_x86_64_ib | +--------------------------------+-------------+--------------+----------+------------------+------------------------+ -| Ray Agents (ppc64le_gpu) | lbannusr | ppc64_le | ray | pascal | blueos_3_ppc64le_ib | +| Ray Agents (ppc64le_gpu) | lbannusr | ppc64le | ray | pascal | blueos_3_ppc64le_ib | +--------------------------------+-------------+--------------+----------+------------------+------------------------+ Currently, "agent_owner", "architecture", and "gpu_architecture" are used to From 58ba6480b9a3dda1e7167f30e0fe604eb2d199c2 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Tue, 23 Jul 2019 21:00:15 -0700 Subject: [PATCH 156/634] Address review comments. 
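
Note: with these changes the proto field statistics_group_size fully encodes the
aggregation mode for batch normalization: a negative value requests globally
aggregated statistics, zero keeps the default (local) aggregation, and a positive
value aggregates over that many processes. As a minimal sketch of the mapping,
assuming a hypothetical helper name that is not part of this patch (the branches
mirror the layer_factory.cpp hunk below):

    // Illustrative helper only (not in LBANN); returns the layer's internal
    // group size, where 0 denotes global aggregation and 1 local statistics.
    int resolve_statistics_group_size(int proto_value) {
      if (proto_value < 0)  { return 0; } // negative -> global statistics
      if (proto_value == 0) { return 1; } // unset -> default to local
      return proto_value;                 // aggregate over this many processes
    }
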
--- include/lbann/layers/regularizers/batch_normalization.hpp | 3 ++- python/lbann/models/resnet.py | 4 ++-- src/proto/factories/layer_factory.cpp | 4 ++-- src/proto/lbann.proto | 4 ++-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index 06e150a2cfc..efb2cfa686f 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -224,7 +224,8 @@ class batch_normalization_layer : public regularizer_layer { << "may be too small to get good statistics"; std::cerr << err.str() << std::endl; } - } else if (m_statistics_group_size*local_mini_batch_size <= 4) { + } else if (m_statistics_group_size != 0 && + m_statistics_group_size*local_mini_batch_size <= 4) { // This possibly underestimates the aggregation size for processors with // smaller local mini-batch sizes. if (output.DistRank() == 0) { diff --git a/python/lbann/models/resnet.py b/python/lbann/models/resnet.py index 997a5a91dd7..0caeb150473 100644 --- a/python/lbann/models/resnet.py +++ b/python/lbann/models/resnet.py @@ -61,8 +61,8 @@ def forward(self, x): conv = self.conv(x) bn = lbann.BatchNormalization( conv, weights=self.bn_weights, - statistics_group_size=self.bn_statistics_group_size, - global_statistics=True if self.bn_statistics_group_size == 0 else None, + statistics_group_size=(-1 if self.bn_statistics_group_size == 0 + else self.bn_statistics_group_size), name='{0}_bn_instance{1}'.format(self.name,self.instance)) if self.relu: return lbann.Relu( diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index f02fef5b1a8..2bb63f7ab10 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -422,8 +422,8 @@ std::unique_ptr construct_layer( const auto& params = proto_layer.batch_normalization(); if (Layout == data_layout::DATA_PARALLEL) { int statistics_group_size = params.statistics_group_size(); - if (params.global_statistics()) { - statistics_group_size = 0; + if (statistics_group_size < 0) { + statistics_group_size = 0; // Global statistics. } else if (statistics_group_size == 0) { statistics_group_size = 1; // Default to local. } diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 78559cd7d25..ed698ec2016 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -1061,8 +1061,8 @@ message BatchNormalization { double bias_init = 3; //default: 0.0 double epsilon = 4; //default: 1e-5 string stats_aggregation = 5; // default: local; deprecated - int64 statistics_group_size = 6; // default: 1 (local) - bool global_statistics = 7; + // default: 1 (local aggregation); set to a negative value for global stats. + int64 statistics_group_size = 6; } message SeluDropout { From ccf30bff926953fd1df68d679a58ba2a5c7b309c Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Wed, 24 Jul 2019 08:55:55 -0700 Subject: [PATCH 157/634] fix a few names per GH feedback --- include/lbann/callbacks/CMakeLists.txt | 4 +- include/lbann/callbacks/mixup.hpp | 6 +-- .../callbacks/{io.hpp => monitor_io.hpp} | 20 ++++----- .../{print.hpp => print_statistics.hpp} | 25 ++++++----- include/lbann/lbann.hpp | 44 +++++++++---------- src/callbacks/CMakeLists.txt | 4 +- src/callbacks/mixup.cpp | 4 +- src/callbacks/{io.cpp => monitor_io.cpp} | 25 ++++++----- .../{print.cpp => print_statistics.cpp} | 20 ++++----- src/proto/factories/callback_factory.cpp | 8 ++-- 10 files changed, 82 insertions(+), 78 deletions(-) rename include/lbann/callbacks/{io.hpp => monitor_io.hpp} (81%) rename include/lbann/callbacks/{print.hpp => print_statistics.hpp} (70%) rename src/callbacks/{io.cpp => monitor_io.cpp} (78%) rename src/callbacks/{print.cpp => print_statistics.cpp} (95%) diff --git a/include/lbann/callbacks/CMakeLists.txt b/include/lbann/callbacks/CMakeLists.txt index 44debb7811f..bbd67d8dda5 100644 --- a/include/lbann/callbacks/CMakeLists.txt +++ b/include/lbann/callbacks/CMakeLists.txt @@ -20,13 +20,13 @@ set_full_path(THIS_DIR_HEADERS gpu_memory_usage.hpp hang.hpp imcomm.hpp - io.hpp learning_rate.hpp ltfb.hpp mixup.hpp + monitor_io.hpp perturb_adam.hpp perturb_dropout.hpp - print.hpp + print_statistics.hpp profiler.hpp replace_weights.hpp save_images.hpp diff --git a/include/lbann/callbacks/mixup.hpp b/include/lbann/callbacks/mixup.hpp index 9b67c08ef5e..f31d129bb44 100644 --- a/include/lbann/callbacks/mixup.hpp +++ b/include/lbann/callbacks/mixup.hpp @@ -55,17 +55,17 @@ namespace callback { * * The recommended default alpha (from the paper) is 0.4. */ -class callback_mixup : public callback_base { +class mixup : public callback_base { public: /** Apply mixup to layers named in layers with mixup parameter alpha. */ - callback_mixup(std::unordered_set layers, float alpha) : + mixup(std::unordered_set layers, float alpha) : callback_base(), m_layers(layers), m_alpha(alpha) { if (alpha < 0.0f) { LBANN_ERROR("Mixup alpha must be non-negative."); } } - callback_mixup* copy() const override { return new callback_mixup(*this); } + mixup* copy() const override { return new mixup(*this); } std::string name() const override { return "mixup"; } void on_forward_prop_end(model *m, Layer *l) override; diff --git a/include/lbann/callbacks/io.hpp b/include/lbann/callbacks/monitor_io.hpp similarity index 81% rename from include/lbann/callbacks/io.hpp rename to include/lbann/callbacks/monitor_io.hpp index 58c9a2e668d..fa5f2832b2d 100644 --- a/include/lbann/callbacks/io.hpp +++ b/include/lbann/callbacks/monitor_io.hpp @@ -23,7 +23,7 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// io .hpp .cpp - Callback hooks for I/O monitoring +// monitor_io .hpp .cpp - Callback hooks for I/O monitoring //////////////////////////////////////////////////////////////////////////////// #ifndef LBANN_CALLBACKS_IO_HPP_INCLUDED @@ -42,22 +42,22 @@ namespace callback { /** * Print information on the amount of IO that layers do. */ -class io : public callback_base { +class monitor_io : public callback_base { public: - io() = default; + monitor_io() = default; /** Only apply to specific layers. 
*/ - io(std::vector const& layers) + monitor_io(std::vector const& layers) : m_layers(layers.begin(), layers.end()) {} - io(const io&) = default; - io& operator=(const io&) = default; - io* copy() const override { - return new io(*this); + monitor_io(const monitor_io&) = default; + monitor_io& operator=(const monitor_io&) = default; + monitor_io* copy() const override { + return new monitor_io(*this); } /** Report how much I/O has occured per data reader */ void on_epoch_end(model *m) override; void on_test_end(model *m) override; - std::string name() const override { return "io"; } + std::string name() const override { return "monitor_io"; } private: /** Indicies of layers to monitor. */ std::unordered_set m_layers; @@ -65,7 +65,7 @@ class io : public callback_base { // Builder function std::unique_ptr -build_disp_io_stats_callback_from_pbuf( +build_monitor_io_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); } // namespace callback diff --git a/include/lbann/callbacks/print.hpp b/include/lbann/callbacks/print_statistics.hpp similarity index 70% rename from include/lbann/callbacks/print.hpp rename to include/lbann/callbacks/print_statistics.hpp index 15f96c66a23..75943c661fa 100644 --- a/include/lbann/callbacks/print.hpp +++ b/include/lbann/callbacks/print_statistics.hpp @@ -23,11 +23,11 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// print .hpp .cpp - Callback hooks to print information +// print_statistics .hpp .cpp - Callback hooks to print information //////////////////////////////////////////////////////////////////////////////// -#ifndef LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED +#ifndef LBANN_CALLBACKS_CALLBACK_PRINT_STATISTICS_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_PRINT_STATISTICS_HPP_INCLUDED #include "lbann/callbacks/callback.hpp" @@ -38,19 +38,20 @@ namespace callback { * Prints average objective function value and metric scores after * each training epoch and evaluation. */ -class print : public callback_base { +class print_statistics : public callback_base { public: - print(int batch_interval = 1, bool print_global_stat_only=false) : - callback_base(batch_interval), m_print_global_stat_only(print_global_stat_only) {} - print(const print&) = default; - print& operator=(const print&) = default; - print* copy() const override { return new print(*this); } + print_statistics(int batch_interval = 1, bool print_global_stat_only=false) : + callback_base(batch_interval), + m_print_global_stat_only(print_global_stat_only) {} + print_statistics(const print_statistics&) = default; + print_statistics& operator=(const print_statistics&) = default; + print_statistics* copy() const override { return new print_statistics(*this); } void setup(model *m) override; void on_epoch_begin(model *m) override; void on_epoch_end(model *m) override; void on_validation_end(model *m) override; void on_test_end(model *m) override; - std::string name() const override { return "print"; } + std::string name() const override { return "print_statistics"; } private: /** Print objective function and metrics to standard output. 
*/ @@ -61,10 +62,10 @@ class print : public callback_base { // Builder function std::unique_ptr -build_print_callback_from_pbuf( +build_print_statistics_callback_from_pbuf( const google::protobuf::Message&, lbann_summary*); } // namespace callback } // namespace lbann -#endif // LBANN_CALLBACKS_CALLBACK_PRINT_HPP_INCLUDED +#endif // LBANN_CALLBACKS_CALLBACK_PRINT_STATISTICS_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 6b7bf345e4b..70e0bc2da99 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -130,44 +130,44 @@ #include "lbann/data_store/data_store_conduit.hpp" /// Callbacks +#include "lbann/callbacks/check_dataset.hpp" +#include "lbann/callbacks/check_gradients.hpp" #include "lbann/callbacks/check_init.hpp" +#include "lbann/callbacks/check_metric.hpp" #include "lbann/callbacks/check_nan.hpp" #include "lbann/callbacks/check_small.hpp" -#include "lbann/callbacks/check_dataset.hpp" -#include "lbann/callbacks/print.hpp" -#include "lbann/callbacks/timer.hpp" -#include "lbann/callbacks/io.hpp" -#include "lbann/callbacks/summary.hpp" -#include "lbann/callbacks/learning_rate.hpp" +#include "lbann/callbacks/checkpoint.hpp" +#include "lbann/callbacks/confusion_matrix.hpp" #include "lbann/callbacks/debug.hpp" #include "lbann/callbacks/debug_io.hpp" -#include "lbann/callbacks/imcomm.hpp" -#include "lbann/callbacks/dump_weights.hpp" -#include "lbann/callbacks/dump_outputs.hpp" #include "lbann/callbacks/dump_error_signals.hpp" #include "lbann/callbacks/dump_gradients.hpp" #include "lbann/callbacks/dump_minibatch_sample_indices.hpp" +#include "lbann/callbacks/dump_outputs.hpp" +#include "lbann/callbacks/dump_weights.hpp" #include "lbann/callbacks/early_stopping.hpp" +#include "lbann/callbacks/gpu_memory_usage.hpp" +#include "lbann/callbacks/hang.hpp" +#include "lbann/callbacks/imcomm.hpp" +#include "lbann/callbacks/learning_rate.hpp" #include "lbann/callbacks/ltfb.hpp" #include "lbann/callbacks/mixup.hpp" +#include "lbann/callbacks/monitor_io.hpp" +#include "lbann/callbacks/perturb_adam.hpp" +#include "lbann/callbacks/perturb_dropout.hpp" +#include "lbann/callbacks/print_statistics.hpp" +#include "lbann/callbacks/profiler.hpp" +#include "lbann/callbacks/replace_weights.hpp" #include "lbann/callbacks/save_images.hpp" #include "lbann/callbacks/save_model.hpp" -#include "lbann/callbacks/save_topk_models.hpp" -#include "lbann/callbacks/profiler.hpp" -#include "lbann/callbacks/hang.hpp" -#include "lbann/callbacks/variable_minibatch.hpp" -#include "lbann/callbacks/timeline.hpp" -#include "lbann/callbacks/checkpoint.hpp" #include "lbann/callbacks/save_model.hpp" -#include "lbann/callbacks/replace_weights.hpp" -#include "lbann/callbacks/gpu_memory_usage.hpp" +#include "lbann/callbacks/save_topk_models.hpp" +#include "lbann/callbacks/summary.hpp" #include "lbann/callbacks/sync_layers.hpp" #include "lbann/callbacks/sync_selected.hpp" -#include "lbann/callbacks/confusion_matrix.hpp" -#include "lbann/callbacks/check_gradients.hpp" -#include "lbann/callbacks/check_metric.hpp" -#include "lbann/callbacks/perturb_adam.hpp" -#include "lbann/callbacks/perturb_dropout.hpp" +#include "lbann/callbacks/timeline.hpp" +#include "lbann/callbacks/timer.hpp" +#include "lbann/callbacks/variable_minibatch.hpp" /// Weights and weight initializers #include "lbann/weights/weights.hpp" diff --git a/src/callbacks/CMakeLists.txt b/src/callbacks/CMakeLists.txt index 2d71c561bfb..1068a4955ff 100644 --- a/src/callbacks/CMakeLists.txt +++ b/src/callbacks/CMakeLists.txt @@ -19,13 +19,13 
@@ set_full_path(THIS_DIR_SOURCES gpu_memory_usage.cpp hang.cpp imcomm.cpp - io.cpp learning_rate.cpp ltfb.cpp mixup.cpp + monitor_io.cpp perturb_adam.cpp perturb_dropout.cpp - print.cpp + print_statistics.cpp profiler.cpp replace_weights.cpp save_images.cpp diff --git a/src/callbacks/mixup.cpp b/src/callbacks/mixup.cpp index 786605d5b29..5e49fae4085 100644 --- a/src/callbacks/mixup.cpp +++ b/src/callbacks/mixup.cpp @@ -38,7 +38,7 @@ namespace lbann { namespace callback { -void callback_mixup::on_forward_prop_end(model *m, Layer *l) { +void mixup::on_forward_prop_end(model *m, Layer *l) { if (!m_layers.count(l->get_name())) { return; } @@ -106,7 +106,7 @@ build_mixup_callback_from_pbuf( const auto& layers_list = parse_list(params.layers()); std::unordered_set layers(layers_list.begin(), layers_list.end()); - return make_unique(layers, params.alpha()); + return make_unique(layers, params.alpha()); } } // namespace callback diff --git a/src/callbacks/io.cpp b/src/callbacks/monitor_io.cpp similarity index 78% rename from src/callbacks/io.cpp rename to src/callbacks/monitor_io.cpp index 4b23222126d..19be961ade1 100644 --- a/src/callbacks/io.cpp +++ b/src/callbacks/monitor_io.cpp @@ -23,26 +23,27 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// io .hpp .cpp - Callback hooks for I/O monitoring +// monitor_io .hpp .cpp - Callback hooks for I/O monitoring //////////////////////////////////////////////////////////////////////////////// #include -#include "lbann/callbacks/io.hpp" +#include "lbann/callbacks/monitor_io.hpp" #include "lbann/layers/io/input/generic_input_layer.hpp" #include "lbann/proto/proto_common.hpp" namespace lbann { namespace callback { -void io::on_epoch_end(model *m) { +void monitor_io::on_epoch_end(model *m) { lbann_comm *comm = m->get_comm(); for (Layer *layer : m->get_layers()) { if(m_layers.size() == 0 || m_layers.find(layer->get_name()) != m_layers.end()) { - auto *input = (generic_input_layer *) dynamic_cast (layer); + auto *input = dynamic_cast (layer); if(input != nullptr) { - std::cout << "Rank " << comm->get_trainer_rank() << "." << comm->get_rank_in_trainer() << " processed " + std::cout << "Rank " << comm->get_trainer_rank() << "." + << comm->get_rank_in_trainer() << " processed " << input->get_num_samples_trained() << " training samples of " << input->get_total_num_training_samples() << " (" << input->get_num_samples_trained() / m->get_epoch() << " per epoch)" << std::endl; @@ -51,28 +52,30 @@ void io::on_epoch_end(model *m) { } } -void io::on_test_end(model *m) { +void monitor_io::on_test_end(model *m) { lbann_comm *comm = m->get_comm(); for (Layer *layer : m->get_layers()) { if(m_layers.size() == 0 || m_layers.find(layer->get_name()) != m_layers.end()) { - auto *input = (generic_input_layer *) dynamic_cast (layer); + auto *input = dynamic_cast (layer); if(input != nullptr) { - std::cout << "Rank " << comm->get_trainer_rank() << "." << comm->get_rank_in_trainer() << " processed " + std::cout << "Rank " << comm->get_trainer_rank() << "." 
+ << comm->get_rank_in_trainer() << " processed " << input->get_num_samples_tested() << " test samples of " << input->get_total_num_testing_samples() << " (" - << input->get_num_samples_tested() / m->get_epoch() << " per epoch)" << std::endl; + << input->get_num_samples_tested() / m->get_epoch() + << " per epoch)" << std::endl; } } } } std::unique_ptr -build_disp_io_stats_callback_from_pbuf( +build_monitor_io_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique( + return make_unique( parse_list(params.layers())); } diff --git a/src/callbacks/print.cpp b/src/callbacks/print_statistics.cpp similarity index 95% rename from src/callbacks/print.cpp rename to src/callbacks/print_statistics.cpp index fa2813ebd9d..058241fcf6a 100644 --- a/src/callbacks/print.cpp +++ b/src/callbacks/print_statistics.cpp @@ -23,18 +23,18 @@ // implied. See the License for the specific language governing // permissions and limitations under the license. // -// print .hpp .cpp - Callback hooks to print information +// print_statistics .hpp .cpp - Callback hooks to print information //////////////////////////////////////////////////////////////////////////////// #include -#include "lbann/callbacks/print.hpp" +#include "lbann/callbacks/print_statistics.hpp" #include "lbann/layers/io/input/input_layer.hpp" #include namespace lbann { namespace callback { -void print::setup(model *m) { +void print_statistics::setup(model *m) { #ifdef LBANN_VERSION lbann_comm *comm = m->get_comm(); if (comm->am_world_master()) { @@ -44,7 +44,7 @@ void print::setup(model *m) { #endif } -void print::on_epoch_begin(model *m) { +void print_statistics::on_epoch_begin(model *m) { lbann_comm *comm = m->get_comm(); if (comm->am_world_master()) { @@ -116,19 +116,19 @@ void print::on_epoch_begin(model *m) { } } -void print::on_epoch_end(model *m) { +void print_statistics::on_epoch_end(model *m) { report_results(m); } -void print::on_validation_end(model *m) { +void print_statistics::on_validation_end(model *m) { report_results(m); } -void print::on_test_end(model *m) { +void print_statistics::on_test_end(model *m) { report_results(m); } -void print::report_results(model *m) { +void print_statistics::report_results(model *m) { lbann_comm *comm = m->get_comm(); // Get string for execution mode @@ -248,11 +248,11 @@ void print::report_results(model *m) { } std::unique_ptr -build_print_callback_from_pbuf( +build_print_statistics_callback_from_pbuf( const google::protobuf::Message& proto_msg, lbann_summary*) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.interval(), + return make_unique(params.interval(), params.print_global_stat_only()); } diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 67951e9cec5..d8b24f6e183 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -45,13 +45,13 @@ #include "lbann/callbacks/gpu_memory_usage.hpp" #include "lbann/callbacks/hang.hpp" #include "lbann/callbacks/imcomm.hpp" -#include "lbann/callbacks/io.hpp" #include "lbann/callbacks/learning_rate.hpp" #include "lbann/callbacks/ltfb.hpp" #include "lbann/callbacks/mixup.hpp" +#include "lbann/callbacks/monitor_io.hpp" #include "lbann/callbacks/perturb_adam.hpp" #include "lbann/callbacks/perturb_dropout.hpp" -#include "lbann/callbacks/print.hpp" +#include "lbann/callbacks/print_statistics.hpp" #include "lbann/callbacks/replace_weights.hpp" #include 
"lbann/callbacks/save_images.hpp" #include "lbann/callbacks/save_model.hpp" @@ -114,7 +114,7 @@ void register_default_builders(factory_type& factory) factory.register_builder("CallbackDebugIO", build_debug_io_callback_from_pbuf); factory.register_builder("CallbackDispIOStats", - build_disp_io_stats_callback_from_pbuf); + build_monitor_io_callback_from_pbuf); factory.register_builder("CallbackDropFixedLearningRate", build_drop_fixed_learning_rate_callback_from_pbuf); factory.register_builder("CallbackDumpErrorSignals", @@ -154,7 +154,7 @@ void register_default_builders(factory_type& factory) factory.register_builder("CallbackPolyLearningRate", build_poly_learning_rate_callback_from_pbuf); factory.register_builder("CallbackPrint", - build_print_callback_from_pbuf); + build_print_statistics_callback_from_pbuf); factory.register_builder("CallbackProfiler", build_profiler_callback_from_pbuf); factory.register_builder("CallbackReplaceWeights", From 524b15f0ad06adec766d40eeef05a77676bba24b Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Wed, 24 Jul 2019 09:14:21 -0700 Subject: [PATCH 158/634] changed 'int' to 'size_t' in several places, since the sum of the image file sizes was over-running INT_MAX. --- .../lbann/data_store/data_store_conduit.hpp | 4 ++-- src/data_store/data_store_conduit.cpp | 21 ++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 6f651dfc793..d35b65087e5 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -288,8 +288,8 @@ protected : void get_image_sizes(std::unordered_map &sizes, std::vector> &indices); /// offset at which the raw image will be stored in a shared memory segment; - /// for use in local cache mode - std::unordered_map m_image_offsets; + /// for use in local cache mode; maps data_id to offset + std::unordered_map m_image_offsets; /// fills in m_image_offsets for use in local cache mode void compute_image_offsets(std::unordered_map &sizes, std::vector> &indices); diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 37bca740d4a..5ed0c1da4f8 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -1117,7 +1117,7 @@ void data_store_conduit::get_image_sizes(std::unordered_map &file_sizes } void data_store_conduit::compute_image_offsets(std::unordered_map &sizes, std::vector> &indices) { - int offset = 0; + size_t offset = 0; for (size_t p=0; p &si //in case a previous run was aborted, attempt to remove the file, which //may or may not exist + shm_unlink(m_seg_name.c_str()); int node_id = m_comm->get_rank_in_node(); if (node_id == 0) { - std::stringstream s; - s << "rm -rf /dev/shm/" << m_seg_name; - system(s.str().c_str()); + std::remove(m_seg_name.c_str()); } + m_comm->trainer_barrier(); #if 0 debug block; may go away @@ -1279,15 +1279,22 @@ void data_store_conduit::preload_local_cache() { } void data_store_conduit::read_files(std::vector &work, std::unordered_map &sizes, std::vector &indices) { - int n = 0; + if (m_world_master) { + std::cout << "data_store_conduit: reading files for local_cache\n"; + } + size_t n = 0; for (auto t : indices) { n += sizes[t]; } work.resize(n); + if (m_output) { + m_output << "data_store_conduit::read_files; requested work size: " << n << std::endl; + } + image_data_reader *image_reader = dynamic_cast(m_reader); const std::vector &image_list = 
image_reader->get_image_list(); - int offset = 0; + size_t offset = 0; for (auto h : indices) { int s = sizes[h]; const std::string fn = m_reader->get_file_dir() + '/' + image_list[h].first; @@ -1303,7 +1310,7 @@ void data_store_conduit::build_conduit_nodes(std::unordered_map &sizes) const std::vector &image_list = image_reader->get_image_list(); for (size_t idx=0; idx Date: Wed, 24 Jul 2019 14:47:20 -0700 Subject: [PATCH 159/634] Channel-wise scale/bias layer (#1122) * CPU implementation os channel-wise scale/bias layer. Metric checking and gradient checking both pass. * GPU implementation of channel-wise scale/bias layer. Metric checking and gradient checking both pass. --- .../learning/channelwise_scale_bias.hpp | 144 ++++++++++ include/lbann/lbann.hpp | 1 + src/layers/learning/CMakeLists.txt | 9 + .../learning/channelwise_scale_bias.cpp | 136 ++++++++++ src/layers/learning/channelwise_scale_bias.cu | 251 ++++++++++++++++++ src/proto/factories/layer_factory.cpp | 10 + src/proto/lbann.proto | 3 + 7 files changed, 554 insertions(+) create mode 100644 include/lbann/layers/learning/channelwise_scale_bias.hpp create mode 100644 src/layers/learning/channelwise_scale_bias.cpp create mode 100644 src/layers/learning/channelwise_scale_bias.cu diff --git a/include/lbann/layers/learning/channelwise_scale_bias.hpp b/include/lbann/layers/learning/channelwise_scale_bias.hpp new file mode 100644 index 00000000000..6af5ce54909 --- /dev/null +++ b/include/lbann/layers/learning/channelwise_scale_bias.hpp @@ -0,0 +1,144 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED +#define LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/exception.hpp" + +namespace lbann { + +/** @brief Apply scale and bias to tensor channels. + * + * The input tensor is sliced along the first tensor dimension (the + * "channel" dimension, assuming image data in CHW format) and scale + * and bias terms are applied independently to each slice. 
More + * precisely, given input and output tensors + * @f$ X,Y\in\mathbb{R}^{d_1\times\cdots\times d_n} @f$ + * and scale and bias vectors @f$ a,b\in\mathbb{R}^{d_1} @f$: + * @f[ + * Y_{i,j,\cdots} = a_i X_{i,j,\cdots} + b_i + * @f] + * + * The scale and bias vectors are fused into a single weights tensor + * to reduce the number of gradient allreduces during backprop. In + * particular, the weights tensor is a + * @f$ \text{num_channels} \times 2 @f$ matrix, where the first + * column correspond to scale terms and the second column to bias + * terms. + */ +template +class channelwise_scale_bias_layer : public Layer { +public: + + channelwise_scale_bias_layer(lbann_comm *comm) + : Layer(comm) { + static_assert(Layout == data_layout::DATA_PARALLEL, + "channelwise_mean_layer only supports " + "data-parallel data layout"); + } + + channelwise_scale_bias_layer(const channelwise_scale_bias_layer& other) + : Layer(other), + m_weights_gradient(other.m_weights_gradient ? + other.m_weights_gradient->Copy() : nullptr) {} + channelwise_scale_bias_layer& operator=(const channelwise_scale_bias_layer& other) { + Layer::operator=(other); + m_weights_gradient.reset(other.m_weights_gradient ? + other.m_weights_gradient->Copy() : + nullptr); + return *this; + } + + channelwise_scale_bias_layer* copy() const override { + return new channelwise_scale_bias_layer(*this); + } + std::string get_type() const override { return "channel-wise scale/bias"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + void setup_matrices(const El::Grid& grid) override { + Layer::setup_matrices(grid); + m_weights_gradient.reset(new StarMat(grid)); + } + + void setup_data() override { + Layer::setup_data(); + const El::Int num_channels = get_output_dims()[0]; + + // Construct default weights if needed + if (this->m_weights.size() < 1) { + this->m_weights.push_back(new weights(get_comm())); + std::vector vals(2*num_channels, DataType{0}); + std::fill(vals.begin(), vals.begin()+num_channels, DataType{1}); + std::unique_ptr init(new value_initializer(vals)); + std::unique_ptr opt(m_model->create_optimizer()); + this->m_weights[0]->set_name(get_name() + "_weights"); + this->m_weights[0]->set_initializer(init); + this->m_weights[0]->set_optimizer(opt); + this->m_model->add_weights(this->m_weights[0]); + } + if (this->m_weights.size() != 1) { + std::ostringstream err; + err << "attempted to setup " + << this->get_type() << " layer \"" << this->get_name() << "\" " + << "with an invalid number of weights " + << "(expected 1, " + << "found " << this->m_weights.size() << ")"; + LBANN_ERROR(err.str()); + } + + // Setup weights + auto matrix_dist = get_prev_activations().DistData(); + matrix_dist.colDist = El::STAR; + matrix_dist.rowDist = El::STAR; + m_weights[0]->set_dims({static_cast(num_channels)}, + {static_cast(2)}); + m_weights[0]->set_matrix_distribution(matrix_dist); + + // Setup gradient w.r.t. weights + m_weights_gradient->AlignWith(matrix_dist); + m_weights_gradient->Resize(num_channels, 2); + + } + +protected: + void fp_compute() override; + void bp_compute() override; + +private: + + /** Objective function gradient w.r.t. weights. 
*/ + std::unique_ptr m_weights_gradient; + +}; + +} // namespace lbann + +#endif // LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 870580b7690..9336ebc65dc 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -46,6 +46,7 @@ #include "lbann/layers/learning/convolution.hpp" #include "lbann/layers/learning/deconvolution.hpp" #include "lbann/layers/learning/embedding.hpp" +#include "lbann/layers/learning/channelwise_scale_bias.hpp" /// Loss layers #include "lbann/layers/loss/categorical_accuracy.hpp" diff --git a/src/layers/learning/CMakeLists.txt b/src/layers/learning/CMakeLists.txt index 317f968d510..7fb7aa137c6 100644 --- a/src/layers/learning/CMakeLists.txt +++ b/src/layers/learning/CMakeLists.txt @@ -1,8 +1,17 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES + channelwise_scale_bias.cpp embedding.cpp fully_connected.cpp ) +if (LBANN_HAS_CUDA) + # Add the CUDA source files for this directory + set_full_path(THIS_DIR_CU_SOURCES + channelwise_scale_bias.cu + ) +endif () + # Propagate the files up the tree set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) +set(CUDA_SOURCES "${CUDA_SOURCES}" "${THIS_DIR_CU_SOURCES}" PARENT_SCOPE) diff --git a/src/layers/learning/channelwise_scale_bias.cpp b/src/layers/learning/channelwise_scale_bias.cpp new file mode 100644 index 00000000000..baab65edff2 --- /dev/null +++ b/src/layers/learning/channelwise_scale_bias.cpp @@ -0,0 +1,136 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/learning/channelwise_scale_bias.hpp" + +namespace lbann { + +template <> +void channelwise_scale_bias_layer + ::fp_compute() { + + // Local matrices + const auto& local_input = dynamic_cast(get_local_prev_activations()); + auto& local_output = dynamic_cast(get_local_activations()); + const auto& local_weights = dynamic_cast(m_weights[0]->get_values().LockedMatrix()); + const auto local_scale = El::LockedView(local_weights, + El::ALL, El::IR(0)); + const auto local_bias = El::LockedView(local_weights, + El::ALL, El::IR(1)); + + // Dimensions + // Note: channel_size is the number of input entries per channel and + // local_width is the number of local mini-batch samples. 
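+  // (Channel c thus owns the contiguous row block
+  //  [c*channel_size, (c+1)*channel_size), which the loop below exploits.)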
+ const auto dims = get_output_dims(); + const El::Int num_channels = dims[0]; + const El::Int channel_size = std::accumulate(dims.begin() + 1, + dims.end(), + 1, std::multiplies()); + const El::Int local_width = local_input.Width(); + + // Apply channel-wise scale and bias + LBANN_OMP_PARALLEL_FOR + for (El::Int channel = 0; channel < num_channels; ++channel) { + const auto a = local_scale(channel, 0); + const auto b = local_bias(channel, 0); + const El::Int row_start = channel * channel_size; + const El::Int row_end = (channel + 1) * channel_size; + const El::Int col_start = 0; + const El::Int col_end = local_width; + for (El::Int col = col_start; col < col_end; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& x = local_input(row, col); + auto& y = local_output(row, col); + y = a * x + b; + } + } + } + +} + +template <> +void channelwise_scale_bias_layer + ::bp_compute() { + + // Local matrices + const auto& local_input = dynamic_cast(get_local_prev_activations()); + const auto& local_gradient_wrt_output = dynamic_cast(get_local_prev_error_signals()); + auto& local_gradient_wrt_input = dynamic_cast(get_local_error_signals()); + const auto& local_weights = dynamic_cast(m_weights[0]->get_values().LockedMatrix()); + auto& local_gradient_wrt_weights = dynamic_cast(m_weights_gradient->Matrix()); + const auto local_scale = El::LockedView(local_weights, + El::ALL, El::IR(0)); + auto local_gradient_wrt_scale = El::View(local_gradient_wrt_weights, + El::ALL, El::IR(0)); + auto local_gradient_wrt_bias = El::View(local_gradient_wrt_weights, + El::ALL, El::IR(1)); + + + // Dimensions + // Note: channel_size is the number of input entries per channel and + // local_width is the number of local mini-batch samples. + const auto dims = get_output_dims(); + const El::Int num_channels = dims[0]; + const El::Int channel_size = std::accumulate(dims.begin() + 1, + dims.end(), + 1, std::multiplies()); + const El::Int local_width = local_input.Width(); + + // Compute gradients + LBANN_OMP_PARALLEL_FOR + for (El::Int channel = 0; channel < num_channels; ++channel) { + const auto a = local_scale(channel, 0); + DataType da{0}, db{0}; + const El::Int row_start = channel * channel_size; + const El::Int row_end = (channel + 1) * channel_size; + const El::Int col_start = 0; + const El::Int col_end = local_width; + for (El::Int col = col_start; col < col_end; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& x = local_input(row, col); + const auto& dy = local_gradient_wrt_output(row, col); + auto& dx = local_gradient_wrt_input(row, col); + da += x * dy; + db += dy; + dx = a * dy; + } + } + local_gradient_wrt_scale(channel, 0) = da; + local_gradient_wrt_bias(channel, 0) = db; + } + + // Update optimizer with gradient + auto* opt = m_weights[0]->get_optimizer(); + if (opt != nullptr) { + const El::Int mini_batch_size = this->m_model->get_effective_mini_batch_size(); + opt->add_to_gradient(*m_weights_gradient, + DataType{1} / mini_batch_size, + true); + } + +} + +} // namespace lbann diff --git a/src/layers/learning/channelwise_scale_bias.cu b/src/layers/learning/channelwise_scale_bias.cu new file mode 100644 index 00000000000..99df5f3e245 --- /dev/null +++ b/src/layers/learning/channelwise_scale_bias.cu @@ -0,0 +1,251 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. 
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/learning/channelwise_scale_bias.hpp" +#include "cub/block/block_reduce.cuh" + +namespace lbann { + +namespace { + +/** + * Block dimensions: bsizex x bsizey x 1 + * + * Grid dimensions: (channel_size / bsizex) x (width / bsizey) x num_channels + */ +__global__ void fp_kernel(size_t num_channels, + size_t channel_size, + size_t width, + const DataType* __restrict__ input, + size_t input_ldim, + DataType* __restrict__ output, + size_t output_ldim, + const DataType* __restrict__ scale, + const DataType* __restrict__ bias) { + + // Indices + const size_t bidz = blockIdx.z; + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nthreadsy = blockDim.y * gridDim.y; + const size_t nblocksz = gridDim.z; + + // Apply channel-wise scale/bias + for (size_t channel = bidz; channel < num_channels; channel += nblocksz) { + const auto a = scale[channel]; + const auto b = bias[channel]; + const size_t row_start = channel * channel_size; + const size_t row_end = (channel + 1) * channel_size; + const size_t col_start = 0; + const size_t col_end = width; + for (size_t col = col_start+gidy; col < col_end; col += nthreadsy) { + for (size_t row = row_start+gidx; row < row_end; row += nthreadsx) { + const auto& x = input[row + col*input_ldim]; + auto& y = output[row + col*output_ldim]; + y = a * x + b; + } + } + } + +} + +/** + * Block dimensions: bsizex x bsizey x 1 + * + * Grid dimensions: (channel_size / bsizex) x (width / bsizey) x num_channels + */ +template +__global__ void bp_kernel(size_t num_channels, + size_t channel_size, + size_t width, + const DataType* __restrict__ input, + size_t input_ldim, + const DataType* __restrict__ gradient_wrt_output, + size_t gradient_wrt_output_ldim, + DataType* __restrict__ gradient_wrt_input, + size_t gradient_wrt_input_ldim, + const DataType* __restrict__ scale, + DataType* __restrict__ gradient_wrt_scale, + DataType* __restrict__ gradient_wrt_bias) { + + // Indices + const size_t tid = threadIdx.x + threadIdx.y * blockDim.x; + const size_t bidz = blockIdx.z; + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nthreadsy = blockDim.y * gridDim.y; + const size_t nblocksz = gridDim.z; + + for (size_t channel = bidz; channel < num_channels; channel += nblocksz) { + + // Accumulate gradient contributions for thread in private 
memory + DataType private_da{0}, private_db{0}; + const auto a = scale[channel]; + const size_t row_start = channel * channel_size; + const size_t row_end = (channel + 1) * channel_size; + const size_t col_start = 0; + const size_t col_end = width; + for (size_t col = col_start+gidy; col < col_end; col += nthreadsy) { + for (size_t row = row_start+gidx; row < row_end; row += nthreadsx) { + const auto& x = input[row + col*input_ldim]; + const auto& dy = gradient_wrt_output[row + col*gradient_wrt_output_ldim]; + auto& dx = gradient_wrt_input[row + col*gradient_wrt_input_ldim]; + private_da += x * dy; + private_db += dy; + dx = a * dy; + } + } + + // Accumulate gradient contributions for block and add to result + // Note: Perform block reduction with CUB + constexpr auto reduce_algo = cub::BLOCK_REDUCE_WARP_REDUCTIONS; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage workspace; + __syncthreads(); + const auto da = BlockReduce(workspace).Sum(private_da); + if (tid == 0) { + cuda::atomic_add(&gradient_wrt_scale[channel], da); + } + __syncthreads(); + const auto db = BlockReduce(workspace).Sum(private_db); + if (tid == 0) { + cuda::atomic_add(&gradient_wrt_bias[channel], db); + } + + } + +} + +} // namespace + +template <> +void channelwise_scale_bias_layer + ::fp_compute() { + + // Local matrices + const auto& local_input = dynamic_cast(get_local_prev_activations()); + auto& local_output = dynamic_cast(get_local_activations()); + const auto& local_weights = dynamic_cast(m_weights[0]->get_values().LockedMatrix()); + const auto local_scale = El::LockedView(local_weights, + El::ALL, El::IR(0)); + const auto local_bias = El::LockedView(local_weights, + El::ALL, El::IR(1)); + + // Dimensions + // Note: channel_size is the number of input entries per channel and + // local_width is the number of local mini-batch samples. 
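+  // (Launch geometry note: the fp_kernel launch below tiles 256x1 thread
+  //  blocks over each channel's rows and the local mini-batch columns,
+  //  with one grid z-block per channel.)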
+ const auto dims = get_output_dims(); + const El::Int num_channels = dims[0]; + const El::Int channel_size = std::accumulate(dims.begin() + 1, + dims.end(), + 1, std::multiplies()); + const El::Int local_width = local_input.Width(); + + // Apply channel-wise scale and bias + if (!local_input.IsEmpty()) { + constexpr size_t block_size_x = 256; + constexpr size_t block_size_y = 1; + dim3 block_dims, grid_dims; + block_dims.x = block_size_x; + block_dims.y = block_size_y; + grid_dims.x = (channel_size + block_size_x - 1) / block_size_x; + grid_dims.y = (local_width + block_size_y - 1) / block_size_y; + grid_dims.z = num_channels; + fp_kernel + <<>>( + num_channels, channel_size, local_width, + local_input.LockedBuffer(), local_input.LDim(), + local_output.Buffer(), local_output.LDim(), + local_scale.LockedBuffer(), + local_bias.LockedBuffer()); + } + +} + +template <> +void channelwise_scale_bias_layer + ::bp_compute() { + + // Local matrices + const auto& local_input = dynamic_cast(get_local_prev_activations()); + const auto& local_gradient_wrt_output = dynamic_cast(get_local_prev_error_signals()); + auto& local_gradient_wrt_input = dynamic_cast(get_local_error_signals()); + const auto& local_weights = dynamic_cast(m_weights[0]->get_values().LockedMatrix()); + auto& local_gradient_wrt_weights = dynamic_cast(m_weights_gradient->Matrix()); + const auto local_scale = El::LockedView(local_weights, + El::ALL, El::IR(0)); + auto local_gradient_wrt_scale = El::View(local_gradient_wrt_weights, + El::ALL, El::IR(0)); + auto local_gradient_wrt_bias = El::View(local_gradient_wrt_weights, + El::ALL, El::IR(1)); + + // Dimensions + // Note: channel_size is the number of input entries per channel and + // local_width is the number of local mini-batch samples. + const auto dims = get_output_dims(); + const El::Int num_channels = dims[0]; + const El::Int channel_size = std::accumulate(dims.begin() + 1, + dims.end(), + 1, std::multiplies()); + const El::Int local_width = local_input.Width(); + + // Compute gradients + El::Zero(local_gradient_wrt_weights); + if (!local_input.IsEmpty()) { + constexpr size_t block_size_x = 256; + constexpr size_t block_size_y = 1; + dim3 block_dims, grid_dims; + block_dims.x = block_size_x; + block_dims.y = block_size_y; + grid_dims.x = (channel_size + block_size_x - 1) / block_size_x; + grid_dims.y = (local_width + block_size_y - 1) / block_size_y; + grid_dims.z = num_channels; + bp_kernel + <<>>( + num_channels, channel_size, local_width, + local_input.LockedBuffer(), local_input.LDim(), + local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), + local_gradient_wrt_input.Buffer(), local_gradient_wrt_input.LDim(), + local_scale.LockedBuffer(), + local_gradient_wrt_scale.Buffer(), + local_gradient_wrt_bias.Buffer()); + } + + // Update optimizer with gradient + auto* opt = m_weights[0]->get_optimizer(); + if (opt != nullptr) { + const El::Int mini_batch_size = this->m_model->get_effective_mini_batch_size(); + opt->add_to_gradient(*m_weights_gradient, + DataType{1} / mini_batch_size, + true); + } + + +} + +} // namespace lbann diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 2bb63f7ab10..10d4c517766 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -211,6 +211,16 @@ std::unique_ptr construct_layer( } } + // Channel-wise scale/bias layer + if (proto_layer.has_channelwise_scale_bias()) { + if (Layout == data_layout::DATA_PARALLEL) { + return 
lbann::make_unique<channelwise_scale_bias_layer<data_layout::DATA_PARALLEL, Device>>(comm);
+ } else {
+ LBANN_ERROR("channel-wise scale/bias layer is only supported "
+ "with data-parallel data layout");
+ }
+ }
+
 // Transform layers
 if (proto_layer.has_reshape()) {
 const auto& params = proto_layer.reshape();
diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto
index 7577f1298a9..17a3d92584c 100644
--- a/src/proto/lbann.proto
+++ b/src/proto/lbann.proto
@@ -579,6 +579,7 @@ message Layer {
     Convolution convolution = 13;
     Deconvolution deconvolution = 305;
     Embedding embedding = 328;
+    ChannelwiseScaleBias channelwise_scale_bias = 329;

     // Loss layers
     CrossEntropy cross_entropy = 60;
@@ -1012,6 +1013,8 @@ message Embedding {
   int64 embedding_size = 2;
 }

+message ChannelwiseScaleBias {}
+
 //////////////////
 // Image layers //
 //////////////////

From 31fd4bb7df96fb83d9fa32ad769662acc6e2ef09 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Wed, 24 Jul 2019 15:32:25 -0700
Subject: [PATCH 160/634] Fixing bug when parsing list of execution modes.

---
 src/base.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/base.cpp b/src/base.cpp
index 643a42fe1be..c9c139aa06e 100644
--- a/src/base.cpp
+++ b/src/base.cpp
@@ -163,7 +163,7 @@ execution_mode exe_mode_from_string(std::string const& str) {
 std::istream& operator>>(std::istream& is, execution_mode& m) {
   std::string tmp;
   is >> tmp;
-  m = exe_mode_from_string(tmp);
+  m = is ? exe_mode_from_string(tmp) : execution_mode::invalid;
   return is;
 }

From d8d2febf6372b35333068d113dad67f46d83f761 Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson"
Date: Wed, 24 Jul 2019 21:15:02 -0700
Subject: [PATCH 161/634] Revert "Fixing bug when parsing list of execution modes."

This reverts commit 31fd4bb7df96fb83d9fa32ad769662acc6e2ef09.

---
 src/base.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/base.cpp b/src/base.cpp
index c9c139aa06e..643a42fe1be 100644
--- a/src/base.cpp
+++ b/src/base.cpp
@@ -163,7 +163,7 @@ execution_mode exe_mode_from_string(std::string const& str) {
 std::istream& operator>>(std::istream& is, execution_mode& m) {
   std::string tmp;
   is >> tmp;
-  m = is ? exe_mode_from_string(tmp) : execution_mode::invalid;
+  m = exe_mode_from_string(tmp);
   return is;
 }

From 9031001105a8458a568545cc00edfbee796a1cec Mon Sep 17 00:00:00 2001
From: "Thomas R. 
Benson" Date: Thu, 25 Jul 2019 08:36:22 -0700 Subject: [PATCH 162/634] clean up parse_list and parse_set; add tests for them --- CMakeLists.txt | 1 + include/lbann/proto/proto_common.hpp | 57 +++++++++++++++---- src/proto/proto_common.cpp | 15 +++++ src/proto/unit_test/CMakeLists.txt | 8 +++ src/proto/unit_test/parse_list_test.cpp | 52 +++++++++++++++++ src/proto/unit_test/parse_set_test.cpp | 47 ++++++++++++++++ src/proto/unit_test/trim_test.cpp | 74 +++++++++++++++++++++++++ 7 files changed, 243 insertions(+), 11 deletions(-) create mode 100644 src/proto/unit_test/CMakeLists.txt create mode 100644 src/proto/unit_test/parse_list_test.cpp create mode 100644 src/proto/unit_test/parse_set_test.cpp create mode 100644 src/proto/unit_test/trim_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d596590ba7..45fccfb2403 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -452,6 +452,7 @@ if (LBANN_WITH_UNIT_TESTING) # Now that Catch2 has been found, start adding the unit tests include(CTest) include(Catch) + add_subdirectory(src/proto/unit_test) add_subdirectory(src/utils/unit_test) add_subdirectory(src/transforms/unit_test) add_subdirectory(src/transforms/vision/unit_test) diff --git a/include/lbann/proto/proto_common.hpp b/include/lbann/proto/proto_common.hpp index 1f9acb6e252..6414f8bcb94 100644 --- a/include/lbann/proto/proto_common.hpp +++ b/include/lbann/proto/proto_common.hpp @@ -95,27 +95,62 @@ bool write_prototext_file( const std::string& fn, lbann_data::LbannPB& pb); -/** @brief Parse a space-separated list. */ -template -std::vector parse_list(std::string str) { +/** @brief Trim leading and trailing whitespace from a string. */ +std::string trim(std::string const& str); + +// These functions work on trimmed, nonempty strings +namespace details { + +template +std::vector parse_list_impl(std::string const& str) { + T entry; std::vector list; - std::istringstream ss(str); - for (T entry; ss >> entry;) { - list.push_back(entry); + std::istringstream iss(str); + while (iss.good()) { + iss >> entry; + list.emplace_back(std::move(entry)); } return list; } -/** @brief Parse a space-separated set. */ -template -std::set parse_set(std::string str) { +template +std::set parse_set_impl(std::string const& str) { + T entry; std::set set; std::istringstream iss(str); - for (T entry; iss >> entry;) { - set.insert(entry); + while(iss.good()) { + iss >> entry; + set.emplace(std::move(entry)); } return set; } + +// TODO (trb 07/25/19): we should think about what to do about bad +// input. That is, if a user calls parse_list("one two three"), +// the result is undefined (one test I did gave [0,0,0] and another +// gave [INT_MAX,INT_MAX,INT_MAX]). In most cases in LBANN, I would +// guess that this will result in a logic error further down the +// codepath, but we shouldn't count on it. + +}// namespace details + +/** @brief Parse a space-separated list. */ +template +std::vector parse_list(std::string const& str) { + auto trim_str = trim(str); + if (trim_str.size()) + return details::parse_list_impl(trim_str); + return {}; +} + +/** @brief Parse a space-separated set. 
*/ +template +std::set parse_set(std::string const& str) { + auto trim_str = trim(str); + if (trim_str.size()) + return details::parse_set_impl(trim_str); + return {}; +} } // namespace lbann #endif // LBANN_PROTO_PROTO_COMMON_HPP_INCLUDED diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index be0179ded90..9e978a8d08e 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -1025,4 +1025,19 @@ void save_session(const lbann_comm& comm, const int argc, char * const* argv, lb out.close(); } +std::string trim(std::string const& str) +{ + // Short-circuit on the empty string + if (str.size() == 0) return std::string(); + + const std::string whitespace = "\f\n\r\t\v "; + auto first = str.find_first_not_of(whitespace); + + // All characters are whitespace; short-circuit. + if (first == std::string::npos) return std::string(); + + auto last = str.find_last_not_of(whitespace); + return str.substr(first, (last-first)+1); +} + } // namespace lbann diff --git a/src/proto/unit_test/CMakeLists.txt b/src/proto/unit_test/CMakeLists.txt new file mode 100644 index 00000000000..90324d96bf0 --- /dev/null +++ b/src/proto/unit_test/CMakeLists.txt @@ -0,0 +1,8 @@ +set_full_path(_DIR_LBANN_CATCH2_TEST_FILES + parse_list_test.cpp + parse_set_test.cpp + trim_test.cpp + ) + +set(LBANN_CATCH2_TEST_FILES + "${LBANN_CATCH2_TEST_FILES}" "${_DIR_LBANN_CATCH2_TEST_FILES}" PARENT_SCOPE) diff --git a/src/proto/unit_test/parse_list_test.cpp b/src/proto/unit_test/parse_list_test.cpp new file mode 100644 index 00000000000..e79d42557c2 --- /dev/null +++ b/src/proto/unit_test/parse_list_test.cpp @@ -0,0 +1,52 @@ +#include + +#include +#include + +#include +#include + +TEST_CASE("Testing parse_list", "[proto][utilities]") +{ + SECTION("execution_mode") + { + const std::vector expected = + { lbann::execution_mode::training, + lbann::execution_mode::validation, + lbann::execution_mode::testing }; + + auto const answer = + lbann::parse_list("train validate test"); + CHECK(answer == expected); + CHECK( + lbann::parse_list("") + == std::vector{}); + CHECK( + lbann::parse_list(" ") + == std::vector{}); + + CHECK_THROWS( + lbann::parse_list("banana tuna salad")); + } + + SECTION("std::string") + { + const std::vector expected = { "this", "is", "a", "test" }; + auto const answer = + lbann::parse_list("this is a test"); + CHECK(answer == expected); + CHECK( + lbann::parse_list("") == std::vector{}); + + } + + SECTION("int") + { + const std::vector expected = { 1, 2, 3, 4, 5}; + auto const answer = + lbann::parse_list("1 2 3 4 5"); + CHECK(answer == expected); + CHECK(lbann::parse_list("") == std::vector{}); + CHECK(lbann::parse_list(" ") == std::vector{}); + } +} diff --git a/src/proto/unit_test/parse_set_test.cpp b/src/proto/unit_test/parse_set_test.cpp new file mode 100644 index 00000000000..293731c0eb9 --- /dev/null +++ b/src/proto/unit_test/parse_set_test.cpp @@ -0,0 +1,47 @@ +#include + +#include +#include + +#include +#include + +TEST_CASE("Testing parse_set", "[proto][utilities]") +{ + SECTION("execution_mode") + { + const std::set expected = + { lbann::execution_mode::training, + lbann::execution_mode::validation, + lbann::execution_mode::testing }; + + auto const answer = + lbann::parse_set("train validate train test test"); + CHECK(answer == expected); + CHECK( + lbann::parse_set("") + == std::set{}); + CHECK( + lbann::parse_set(" ") + == std::set{}); + } + + SECTION("std::string") + { + const std::set expected = { "this", "is", "a", "test" }; + auto const answer = + lbann::parse_set("this 
is a test"); + CHECK(answer == expected); + CHECK(lbann::parse_set("") == std::set{}); + } + + SECTION("int") + { + const std::set expected = { 1, 2, 3, 4, 5}; + auto const answer = + lbann::parse_set("1 1 2 1 3 4 3 3 5 2"); + CHECK(answer == expected); + CHECK(lbann::parse_set("") == std::set{}); + CHECK(lbann::parse_set(" ") == std::set{}); + } +} diff --git a/src/proto/unit_test/trim_test.cpp b/src/proto/unit_test/trim_test.cpp new file mode 100644 index 00000000000..1214dd82a59 --- /dev/null +++ b/src/proto/unit_test/trim_test.cpp @@ -0,0 +1,74 @@ +#include + +#include + +#include + +TEST_CASE("Testing string trimming", "[proto][utilities]") +{ + SECTION("Leading spaces") + { + CHECK(lbann::trim(" my string") == "my string"); + CHECK(lbann::trim("\nmy string") == "my string"); + CHECK(lbann::trim("\tmy string") == "my string"); + CHECK(lbann::trim(" \n\tmy string") == "my string"); + CHECK(lbann::trim(" my string") == "my string"); + } + SECTION("Trailing spaces") + { + CHECK(lbann::trim("my string ") == "my string"); + CHECK(lbann::trim("my string\n") == "my string"); + CHECK(lbann::trim("my string\t") == "my string"); + CHECK(lbann::trim("my string \n\t") == "my string"); + CHECK(lbann::trim("my string ") == "my string"); + } + SECTION("Leading and trailing spaces") + { + CHECK(lbann::trim(" my string ") == "my string"); + CHECK(lbann::trim(" my string\n") == "my string"); + CHECK(lbann::trim(" my string\t") == "my string"); + CHECK(lbann::trim(" my string \n\t") == "my string"); + CHECK(lbann::trim(" my string ") == "my string"); + + CHECK(lbann::trim("\nmy string ") == "my string"); + CHECK(lbann::trim("\nmy string\n") == "my string"); + CHECK(lbann::trim("\nmy string\t") == "my string"); + CHECK(lbann::trim("\nmy string \n\t") == "my string"); + CHECK(lbann::trim("\nmy string ") == "my string"); + + CHECK(lbann::trim("\tmy string ") == "my string"); + CHECK(lbann::trim("\tmy string\n") == "my string"); + CHECK(lbann::trim("\tmy string\t") == "my string"); + CHECK(lbann::trim("\tmy string \n\t") == "my string"); + CHECK(lbann::trim("\tmy string ") == "my string"); + + CHECK(lbann::trim(" \n\tmy string ") == "my string"); + CHECK(lbann::trim(" \n\tmy string\n") == "my string"); + CHECK(lbann::trim(" \n\tmy string\t") == "my string"); + CHECK(lbann::trim(" \n\tmy string \n\t") == "my string"); + CHECK(lbann::trim(" \n\tmy string ") == "my string"); + + CHECK(lbann::trim(" my string ") == "my string"); + CHECK(lbann::trim(" my string\n") == "my string"); + CHECK(lbann::trim(" my string\t") == "my string"); + CHECK(lbann::trim(" my string \n\t") == "my string"); + CHECK(lbann::trim(" my string ") == "my string"); + } + SECTION("Neither leading nor trailing spaces") + { + CHECK(lbann::trim("my string") == "my string"); + CHECK(lbann::trim("lbann") == "lbann"); + } + SECTION("Only spaces") + { + CHECK(lbann::trim(" ") == ""); + CHECK(lbann::trim("\n") == ""); + CHECK(lbann::trim("\t") == ""); + CHECK(lbann::trim(" \n\t") == ""); + CHECK(lbann::trim(" \t\n\t") == ""); + } + SECTION("Empty string") + { + CHECK(lbann::trim("") == ""); + } +} From 58a2b33ea9d2d70c5f5ce2e11f670e0d9fd1bef2 Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Thu, 25 Jul 2019 12:25:43 -0700 Subject: [PATCH 163/634] Update src/proto/unit_test/parse_list_test.cpp Co-Authored-By: Tim Moon --- src/proto/unit_test/parse_list_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/proto/unit_test/parse_list_test.cpp 
From 58a2b33ea9d2d70c5f5ce2e11f670e0d9fd1bef2 Mon Sep 17 00:00:00 2001
From: Tom Benson <30674819+benson31@users.noreply.github.com>
Date: Thu, 25 Jul 2019 12:25:43 -0700
Subject: [PATCH 163/634] Update src/proto/unit_test/parse_list_test.cpp

Co-Authored-By: Tim Moon
---
 src/proto/unit_test/parse_list_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/proto/unit_test/parse_list_test.cpp b/src/proto/unit_test/parse_list_test.cpp
index e79d42557c2..40fdc8aed33 100644
--- a/src/proto/unit_test/parse_list_test.cpp
+++ b/src/proto/unit_test/parse_list_test.cpp
@@ -42,7 +42,7 @@ TEST_CASE("Testing parse_list", "[proto][utilities]")

   SECTION("int")
   {
-    const std::vector<int> expected = { 1, 2, 3, 4, 5};
+    const std::vector<int> expected = { 1, 2, 3, 4, 5 };
     auto const answer =
       lbann::parse_list<int>("1 2 3 4 5");
     CHECK(answer == expected);

From 8506372730efe9445a433a6f546ed5fc1b579ad3 Mon Sep 17 00:00:00 2001
From: Tom Benson <30674819+benson31@users.noreply.github.com>
Date: Thu, 25 Jul 2019 12:25:50 -0700
Subject: [PATCH 164/634] Update src/proto/unit_test/parse_set_test.cpp

Co-Authored-By: Tim Moon
---
 src/proto/unit_test/parse_set_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/proto/unit_test/parse_set_test.cpp b/src/proto/unit_test/parse_set_test.cpp
index 293731c0eb9..a79cfc43a9d 100644
--- a/src/proto/unit_test/parse_set_test.cpp
+++ b/src/proto/unit_test/parse_set_test.cpp
@@ -37,7 +37,7 @@ TEST_CASE("Testing parse_set", "[proto][utilities]")

   SECTION("int")
   {
-    const std::set<int> expected = { 1, 2, 3, 4, 5};
+    const std::set<int> expected = { 1, 2, 3, 4, 5 };
     auto const answer =
       lbann::parse_set<int>("1 1 2 1 3 4 3 3 5 2");
     CHECK(answer == expected);

From 42216f31cb89d716688cfd5724c40879283b27bc Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Thu, 25 Jul 2019 16:09:19 -0700
Subject: [PATCH 165/634] Entry-wise scale/bias layer (#1127)

* CPU implementation of channel-wise scale/bias layer. Metric checking and
  gradient checking both pass.

* CPU implementation of element-wise scale/bias layer. Metric checking and
  gradient checking both pass.

* Reverting accidentally changed experiment script.

* GPU implementation of element-wise scale/bias layer. Metric checking and
  gradient checking both pass.

* Renaming "element-wise scale/bias" to "entry-wise scale/bias".
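The arithmetic the new layer implements is worth stating concretely before the diff: for every tensor entry i and mini-batch sample j, forward is y_ij = a_i * x_ij + b_i, and backprop gives dx_ij = a_i * dy_ij with per-entry accumulations da_i = sum_j x_ij * dy_ij and db_i = sum_j dy_ij. A minimal numeric sketch, in plain C++ with no LBANN types and values invented for illustration:

    // Numeric sketch of the entry-wise scale/bias math; illustration only.
    #include <cstdio>

    int main() {
      const int height = 2, width = 3;            // 2 entries, mini-batch of 3
      double x[height][width]  = {{1, 2, 3}, {4, 5, 6}};
      double dy[height][width] = {{.1, .1, .1}, {.2, .2, .2}};
      double a[height] = {2, 3}, b[height] = {0.5, -0.5};

      double y[height][width], dx[height][width], da[height] = {0}, db[height] = {0};
      for (int i = 0; i < height; ++i) {
        for (int j = 0; j < width; ++j) {
          y[i][j]  = a[i] * x[i][j] + b[i];       // forward: Y = A o X + B
          dx[i][j] = a[i] * dy[i][j];             // gradient w.r.t. input
          da[i]   += x[i][j] * dy[i][j];          // accumulate over the mini-batch
          db[i]   += dy[i][j];
        }
      }
      std::printf("y[0][0]=%g da[0]=%g db[0]=%g\n", y[0][0], da[0], db[0]);
      // y[0][0] = 2*1+0.5 = 2.5; da[0] = .1*(1+2+3) = 0.6; db[0] = 0.3
    }

In the layer itself the accumulated (da, db) pair is handed to the optimizer scaled by 1/mini_batch_size, which is what the `add_to_gradient(..., DataType{1} / mini_batch_size, true)` calls in the diff below do.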
--- include/lbann/layers/learning/CMakeLists.txt | 2 + .../learning/channelwise_scale_bias.hpp | 10 +- .../layers/learning/entrywise_scale_bias.hpp | 176 ++++++++++++++ include/lbann/lbann.hpp | 1 + src/layers/learning/CMakeLists.txt | 2 + .../learning/channelwise_scale_bias.cpp | 8 +- src/layers/learning/entrywise_scale_bias.cpp | 160 +++++++++++++ src/layers/learning/entrywise_scale_bias.cu | 216 ++++++++++++++++++ src/proto/factories/layer_factory.cpp | 7 +- src/proto/lbann.proto | 2 + 10 files changed, 572 insertions(+), 12 deletions(-) create mode 100644 include/lbann/layers/learning/entrywise_scale_bias.hpp create mode 100644 src/layers/learning/entrywise_scale_bias.cpp create mode 100644 src/layers/learning/entrywise_scale_bias.cu diff --git a/include/lbann/layers/learning/CMakeLists.txt b/include/lbann/layers/learning/CMakeLists.txt index e5f7e6337f4..089524eab2e 100644 --- a/include/lbann/layers/learning/CMakeLists.txt +++ b/include/lbann/layers/learning/CMakeLists.txt @@ -7,6 +7,8 @@ set_full_path(THIS_DIR_HEADERS fully_connected.hpp fully_connected_cuda.hpp learning.hpp + channelwise_scale_bias.hpp + entrywise_scale_bias.hpp ) # Propagate the files up the tree diff --git a/include/lbann/layers/learning/channelwise_scale_bias.hpp b/include/lbann/layers/learning/channelwise_scale_bias.hpp index 6af5ce54909..570aefa54f4 100644 --- a/include/lbann/layers/learning/channelwise_scale_bias.hpp +++ b/include/lbann/layers/learning/channelwise_scale_bias.hpp @@ -115,15 +115,15 @@ class channelwise_scale_bias_layer : public Layer { } // Setup weights - auto matrix_dist = get_prev_activations().DistData(); - matrix_dist.colDist = El::STAR; - matrix_dist.rowDist = El::STAR; + auto dist = get_prev_activations().DistData(); + dist.colDist = El::STAR; + dist.rowDist = El::STAR; m_weights[0]->set_dims({static_cast(num_channels)}, {static_cast(2)}); - m_weights[0]->set_matrix_distribution(matrix_dist); + m_weights[0]->set_matrix_distribution(dist); // Setup gradient w.r.t. weights - m_weights_gradient->AlignWith(matrix_dist); + m_weights_gradient->AlignWith(dist); m_weights_gradient->Resize(num_channels, 2); } diff --git a/include/lbann/layers/learning/entrywise_scale_bias.hpp b/include/lbann/layers/learning/entrywise_scale_bias.hpp new file mode 100644 index 00000000000..7fa6af7deb0 --- /dev/null +++ b/include/lbann/layers/learning/entrywise_scale_bias.hpp @@ -0,0 +1,176 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED +#define LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/exception.hpp" + +namespace lbann { + +/** @brief Apply scale and bias to tensor entries. + * + * Scale and bias terms are applied independently to each tensor + * entry. More precisely, given input, output, scale, and bias + * tensors @f$ X,Y,A,B\in\mathbb{R}^{d_1\times\cdots\times d_n} @f$: + * @f[ + * Y = A \circ X + B + * @f] + * + * The scale and bias terms are fused into a single weights tensor to + * reduce the number of gradient allreduces during backprop. In + * particular, the weights tensor is a + * @f$ \text{size} \times 2 @f$ matrix, where the first + * column correspond to scale terms and the second column to bias + * terms. + */ +template +class entrywise_scale_bias_layer : public Layer { +public: + + entrywise_scale_bias_layer(lbann_comm *comm) + : Layer(comm) {} + + entrywise_scale_bias_layer(const entrywise_scale_bias_layer& other) + : Layer(other), + m_weights_gradient(other.m_weights_gradient ? + other.m_weights_gradient->Copy() : nullptr) {} + entrywise_scale_bias_layer& operator=(const entrywise_scale_bias_layer& other) { + Layer::operator=(other); + m_weights_gradient.reset(other.m_weights_gradient ? + other.m_weights_gradient->Copy() : + nullptr); + return *this; + } + + entrywise_scale_bias_layer* copy() const override { + return new entrywise_scale_bias_layer(*this); + } + std::string get_type() const override { return "entry-wise scale/bias"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + void setup_matrices(const El::Grid& grid) override { + Layer::setup_matrices(grid); + auto dist = get_prev_activations().DistData(); + dist.rowDist = El::STAR; + m_weights_gradient.reset(AbsDistMat::Instantiate(dist)); + } + + void setup_data() override { + Layer::setup_data(); + const auto dims = get_output_dims(); + const El::Int size = get_output_size(); + + // Construct default weights if needed + if (this->m_weights.size() < 1) { + this->m_weights.push_back(new weights(get_comm())); + std::vector vals(2*size, DataType{0}); + std::fill(vals.begin(), vals.begin()+size, DataType{1}); + std::unique_ptr init(new value_initializer(vals)); + std::unique_ptr opt(m_model->create_optimizer()); + this->m_weights[0]->set_name(get_name() + "_weights"); + this->m_weights[0]->set_initializer(init); + this->m_weights[0]->set_optimizer(opt); + this->m_model->add_weights(this->m_weights[0]); + } + if (this->m_weights.size() != 1) { + std::ostringstream err; + err << "attempted to setup " + << this->get_type() << " layer \"" << this->get_name() << "\" " + << "with an invalid number of weights " + << "(expected 1, " + << "found " << this->m_weights.size() << ")"; + LBANN_ERROR(err.str()); + } + + // Setup weights + auto dist = get_prev_activations().DistData(); + dist.rowDist = El::STAR; + m_weights[0]->set_dims(dims, + {static_cast(2)}); + m_weights[0]->set_matrix_distribution(dist); + + // Setup gradient w.r.t. 
weights + m_weights_gradient->AlignWith(dist); + m_weights_gradient->Resize(size, 2); + + } + + void fp_setup_outputs(El::Int mini_batch_size) override { + Layer::fp_setup_outputs(mini_batch_size); + +#if 0 /// @todo See https://github.com/LLNL/lbann/issues/1123 + + // Check that input and weights tensors are aligned + /// @todo Realign weights tensor if misaligned + bool aligned = true; + try { + const auto& x = get_prev_activations(); + const auto& w = m_weights[0]->get_values(); + aligned = (x.ColAlign() == w.ColAlign() + && x.RowAlign() == w.RowAlign()); + } + catch (const exception& e) { + // An exception is thrown if you try accessing weights values + // before they are initialized. We don't care if this case is + // aligned, so it's safe to ignore. + } + if (!aligned) { + std::ostringstream err; + err << this->get_type() << " layer \"" << this->get_name() << "\" " + << "has misaligned input and weights matrices"; + LBANN_ERROR(err.str()); + } + +#endif // 0 + + } + + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + Layer::bp_setup_gradient_wrt_inputs(mini_batch_size); + m_weights_gradient->Empty(false); + m_weights_gradient->AlignWith(get_prev_activations()); + m_weights_gradient->Resize(get_input_size(), mini_batch_size); + } + +protected: + void fp_compute() override; + void bp_compute() override; + +private: + + /** Objective function gradient w.r.t. weights. */ + std::unique_ptr m_weights_gradient; + +}; + +} // namespace lbann + +#endif // LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 9336ebc65dc..b67a9da0046 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -47,6 +47,7 @@ #include "lbann/layers/learning/deconvolution.hpp" #include "lbann/layers/learning/embedding.hpp" #include "lbann/layers/learning/channelwise_scale_bias.hpp" +#include "lbann/layers/learning/entrywise_scale_bias.hpp" /// Loss layers #include "lbann/layers/loss/categorical_accuracy.hpp" diff --git a/src/layers/learning/CMakeLists.txt b/src/layers/learning/CMakeLists.txt index 7fb7aa137c6..4ff1798faaa 100644 --- a/src/layers/learning/CMakeLists.txt +++ b/src/layers/learning/CMakeLists.txt @@ -1,6 +1,7 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES channelwise_scale_bias.cpp + entrywise_scale_bias.cpp embedding.cpp fully_connected.cpp ) @@ -9,6 +10,7 @@ if (LBANN_HAS_CUDA) # Add the CUDA source files for this directory set_full_path(THIS_DIR_CU_SOURCES channelwise_scale_bias.cu + entrywise_scale_bias.cu ) endif () diff --git a/src/layers/learning/channelwise_scale_bias.cpp b/src/layers/learning/channelwise_scale_bias.cpp index baab65edff2..d837982387a 100644 --- a/src/layers/learning/channelwise_scale_bias.cpp +++ b/src/layers/learning/channelwise_scale_bias.cpp @@ -88,7 +88,6 @@ void channelwise_scale_bias_layer auto local_gradient_wrt_bias = El::View(local_gradient_wrt_weights, El::ALL, El::IR(1)); - // Dimensions // Note: channel_size is the number of input entries per channel and // local_width is the number of local mini-batch samples. 
@@ -103,7 +102,10 @@ void channelwise_scale_bias_layer LBANN_OMP_PARALLEL_FOR for (El::Int channel = 0; channel < num_channels; ++channel) { const auto a = local_scale(channel, 0); - DataType da{0}, db{0}; + auto& da = local_gradient_wrt_scale(channel, 0); + auto& db = local_gradient_wrt_bias(channel, 0); + da = 0; + db = 0; const El::Int row_start = channel * channel_size; const El::Int row_end = (channel + 1) * channel_size; const El::Int col_start = 0; @@ -118,8 +120,6 @@ void channelwise_scale_bias_layer dx = a * dy; } } - local_gradient_wrt_scale(channel, 0) = da; - local_gradient_wrt_bias(channel, 0) = db; } // Update optimizer with gradient diff --git a/src/layers/learning/entrywise_scale_bias.cpp b/src/layers/learning/entrywise_scale_bias.cpp new file mode 100644 index 00000000000..48323475945 --- /dev/null +++ b/src/layers/learning/entrywise_scale_bias.cpp @@ -0,0 +1,160 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/learning/entrywise_scale_bias.hpp" + +namespace lbann { + +namespace { + +void fp_impl(const CPUMat& local_input, + CPUMat& local_output, + const weights& scale_bias) { + + // Local matrices + const auto& local_scale_bias + = dynamic_cast(scale_bias.get_values().LockedMatrix()); + const auto local_scale = El::LockedView(local_scale_bias, + El::ALL, El::IR(0)); + const auto local_bias = El::LockedView(local_scale_bias, + El::ALL, El::IR(1)); + + // Apply entry-wise scale and bias + const El::Int local_height = local_input.Height(); + const El::Int local_width = local_input.Width(); + LBANN_OMP_PARALLEL_FOR_COLLAPSE2 + for (El::Int col = 0; col < local_width; ++col) { + for (El::Int row = 0; row < local_height; ++row) { + const auto& a = local_scale(row, 0); + const auto& b = local_bias(row, 0); + const auto& x = local_input(row, col); + auto& y = local_output(row, col); + y = a * x + b; + } + } + +} + +void bp_impl(const CPUMat& local_input, + const CPUMat& local_gradient_wrt_output, + CPUMat& local_gradient_wrt_input, + weights& scale_bias, + AbsDistMat& gradient_wrt_scale_bias, + El::Int mini_batch_size) { + + // Local matrices + const auto& local_scale_bias + = dynamic_cast(scale_bias.get_values().LockedMatrix()); + auto& local_gradient_wrt_scale_bias + = dynamic_cast(gradient_wrt_scale_bias.Matrix()); + const auto local_scale = El::LockedView(local_scale_bias, + El::ALL, El::IR(0)); + auto local_gradient_wrt_scale = El::View(local_gradient_wrt_scale_bias, + El::ALL, El::IR(0)); + auto local_gradient_wrt_bias = El::View(local_gradient_wrt_scale_bias, + El::ALL, El::IR(1)); + + // Dimensions + const El::Int local_height = local_input.Height(); + const El::Int local_width = local_input.Width(); + + // Iterate through row blocks + // Note: Block size is chosen to match cache line size. + El::Zero(local_gradient_wrt_scale_bias); + constexpr El::Int _bsize = 64 / sizeof(DataType); + constexpr El::Int bsize = _bsize > 1 ? 
_bsize : 1; + LBANN_OMP_PARALLEL_FOR + for (El::Int row_start = 0; row_start < local_height; row_start += bsize) { + const El::Int row_end = std::min(row_start + bsize, local_height); + const El::Int col_start = 0; + const El::Int col_end = local_width; + + // Compute gradient contributions for row block + for (El::Int col = col_start; col < col_end; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& a = local_scale(row, 0); + const auto& x = local_input(row, col); + const auto& dy = local_gradient_wrt_output(row, col); + auto& dx = local_gradient_wrt_input(row, col); + auto& da = local_gradient_wrt_scale(row, 0); + auto& db = local_gradient_wrt_bias(row, 0); + dx = a * dy; + da += x * dy; + db += dy; + } + } + + } + + // Update optimizer with gradient + auto* opt = scale_bias.get_optimizer(); + if (opt != nullptr) { + opt->add_to_gradient(gradient_wrt_scale_bias, + DataType{1} / mini_batch_size, + true); + } + +} + +} // namespace + +// Template instantiation +template <> +void entrywise_scale_bias_layer + ::fp_compute() { + fp_impl(get_local_prev_activations(), + get_local_activations(), + *m_weights[0]); +} +template <> +void entrywise_scale_bias_layer + ::fp_compute() { + fp_impl(get_local_prev_activations(), + get_local_activations(), + *m_weights[0]); +} +template <> +void entrywise_scale_bias_layer + ::bp_compute() { + bp_impl(get_local_prev_activations(), + get_local_prev_error_signals(), + get_local_error_signals(), + *this->m_weights[0], + *m_weights_gradient, + this->m_model->get_effective_mini_batch_size()); +} +template <> +void entrywise_scale_bias_layer + ::bp_compute() { + bp_impl(get_local_prev_activations(), + get_local_prev_error_signals(), + get_local_error_signals(), + *this->m_weights[0], + *m_weights_gradient, + this->m_model->get_effective_mini_batch_size()); +} + +} // namespace lbann diff --git a/src/layers/learning/entrywise_scale_bias.cu b/src/layers/learning/entrywise_scale_bias.cu new file mode 100644 index 00000000000..71b739f0429 --- /dev/null +++ b/src/layers/learning/entrywise_scale_bias.cu @@ -0,0 +1,216 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/learning/entrywise_scale_bias.hpp" + +namespace lbann { + +namespace { + +/** + * Block dimensions: bsizex x bsizey x 1 + * + * Grid dimensions: (height / bsizex) x (width / bsizey) x num_channels + */ +__global__ void fp_kernel(size_t height, + size_t width, + const DataType* __restrict__ input, + size_t input_ldim, + DataType* __restrict__ output, + size_t output_ldim, + const DataType* __restrict__ scale, + const DataType* __restrict__ bias) { + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nthreadsy = blockDim.y * gridDim.y; + for (size_t row = gidx; row < height; row += nthreadsx) { + const auto a = scale[row]; + const auto b = bias[row]; + for (size_t col = gidy; col < width; col += nthreadsy) { + const auto& x = input[row + col*input_ldim]; + auto& y = output[row + col*output_ldim]; + y = a * x + b; + } + } +} + +/** + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (height / bsize) x 1 x 1 + */ +__global__ void bp_kernel(size_t height, + size_t width, + const DataType* __restrict__ input, + size_t input_ldim, + const DataType* __restrict__ gradient_wrt_output, + size_t gradient_wrt_output_ldim, + DataType* __restrict__ gradient_wrt_input, + size_t gradient_wrt_input_ldim, + const DataType* __restrict__ scale, + DataType* __restrict__ gradient_wrt_scale, + DataType* __restrict__ gradient_wrt_bias) { + const size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t nthreads = blockDim.x * gridDim.x; + for (size_t row = gid; row < height; row += nthreads) { + const auto a = scale[row]; + DataType da{0}, db{0}; + for (size_t col = 0; col < width; ++col) { + const auto& x = input[row + col * input_ldim]; + const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim]; + auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_ldim]; + dx = a * dy; + da += x * dy; + db += dy; + } + gradient_wrt_scale[row] = da; + gradient_wrt_bias[row] = db; + } +} + +void fp_impl(const GPUMat& local_input, + GPUMat& local_output, + const weights& scale_bias) { + + // Local matrices + const auto& local_scale_bias + = dynamic_cast(scale_bias.get_values().LockedMatrix()); + const auto local_scale = El::LockedView(local_scale_bias, + El::ALL, El::IR(0)); + const auto local_bias = El::LockedView(local_scale_bias, + El::ALL, El::IR(1)); + + // Apply entry-wise scale and bias + const El::Int local_height = local_input.Height(); + const El::Int local_width = local_input.Width(); + if (!local_input.IsEmpty()) { + constexpr size_t block_size_x = 256; + constexpr size_t block_size_y = 1; + dim3 block_dims, grid_dims; + block_dims.x = block_size_x; + block_dims.y = block_size_y; + grid_dims.x = (local_height + block_size_x - 1) / block_size_x; + grid_dims.y = (local_width + block_size_y - 1) / block_size_y; + fp_kernel + <<>>( + local_height, local_width, + local_input.LockedBuffer(), local_input.LDim(), + local_output.Buffer(), local_output.LDim(), + local_scale.LockedBuffer(), + local_bias.LockedBuffer()); + } + +} + +void bp_impl(const GPUMat& local_input, + const GPUMat& local_gradient_wrt_output, + GPUMat& local_gradient_wrt_input, + weights& scale_bias, + AbsDistMat& gradient_wrt_scale_bias, + El::Int mini_batch_size) { + + // Local matrices + const auto& local_scale_bias + = dynamic_cast(scale_bias.get_values().LockedMatrix()); 
+ auto& local_gradient_wrt_scale_bias + = dynamic_cast(gradient_wrt_scale_bias.Matrix()); + const auto local_scale = El::LockedView(local_scale_bias, + El::ALL, El::IR(0)); + auto local_gradient_wrt_scale = El::View(local_gradient_wrt_scale_bias, + El::ALL, El::IR(0)); + auto local_gradient_wrt_bias = El::View(local_gradient_wrt_scale_bias, + El::ALL, El::IR(1)); + + // Compute gradients + const El::Int local_height = local_input.Height(); + const El::Int local_width = local_input.Width(); + El::Zero(local_gradient_wrt_scale_bias); + if (!local_input.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_height + block_size - 1) / block_size; + bp_kernel + <<>>( + local_height, local_width, + local_input.LockedBuffer(), local_input.LDim(), + local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), + local_gradient_wrt_input.Buffer(), local_gradient_wrt_input.LDim(), + local_scale.LockedBuffer(), + local_gradient_wrt_scale.Buffer(), + local_gradient_wrt_bias.Buffer()); + } + + // Update optimizer with gradient + auto* opt = scale_bias.get_optimizer(); + if (opt != nullptr) { + opt->add_to_gradient(gradient_wrt_scale_bias, + DataType{1} / mini_batch_size, + true); + } + +} + +} // namespace + +// Template instantiation +template <> +void entrywise_scale_bias_layer + ::fp_compute() { + fp_impl(get_local_prev_activations(), + get_local_activations(), + *m_weights[0]); +} +template <> +void entrywise_scale_bias_layer + ::fp_compute() { + fp_impl(get_local_prev_activations(), + get_local_activations(), + *m_weights[0]); +} +template <> +void entrywise_scale_bias_layer + ::bp_compute() { + bp_impl(get_local_prev_activations(), + get_local_prev_error_signals(), + get_local_error_signals(), + *this->m_weights[0], + *m_weights_gradient, + this->m_model->get_effective_mini_batch_size()); +} +template <> +void entrywise_scale_bias_layer + ::bp_compute() { + bp_impl(get_local_prev_activations(), + get_local_prev_error_signals(), + get_local_error_signals(), + *this->m_weights[0], + *m_weights_gradient, + this->m_model->get_effective_mini_batch_size()); +} + +} // namespace lbann diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 10d4c517766..50c8ffd82ec 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -198,7 +198,7 @@ std::unique_ptr construct_layer( } } - // Embedding layer + // Learning layers if (proto_layer.has_embedding()) { const auto& params = proto_layer.embedding(); if (Layout == data_layout::DATA_PARALLEL @@ -210,8 +210,6 @@ std::unique_ptr construct_layer( "data-parallel data layout and on CPU"); } } - - // Channel-wise scale/bias layer if (proto_layer.has_channelwise_scale_bias()) { if (Layout == data_layout::DATA_PARALLEL) { return lbann::make_unique>(comm); @@ -220,6 +218,9 @@ std::unique_ptr construct_layer( "with data-parallel data layout"); } } + if (proto_layer.has_entrywise_scale_bias()) { + return lbann::make_unique>(comm); + } // Transform layers if (proto_layer.has_reshape()) { diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 17a3d92584c..6ef43542276 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -580,6 +580,7 @@ message Layer { Deconvolution deconvolution = 305; Embedding embedding = 328; ChannelwiseScaleBias channelwise_scale_bias = 329; + EntrywiseScaleBias entrywise_scale_bias = 330; // Loss layers CrossEntropy cross_entropy = 60; @@ -1014,6 +1015,7 @@ 
message Embedding { } message ChannelwiseScaleBias {} +message EntrywiseScaleBias {} ////////////////// // Image layers // From 4708fc457a3a4691644be36646459cdb196574f3 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Thu, 25 Jul 2019 18:59:10 -0700 Subject: [PATCH 166/634] added check to ensure there is sufficient memory for the shared memory segment; added and modified print statements for user feedback. --- src/data_store/data_store_conduit.cpp | 30 ++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 5ed0c1da4f8..5d1972acb67 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -38,6 +38,8 @@ #include #include #include +#include +#include namespace lbann { @@ -239,7 +241,9 @@ void data_store_conduit::setup(int mini_batch_size) { if (m_world_master) { std::cerr << "starting data_store_conduit::setup() for role: " << m_reader->get_role() << "\n"; - if (m_super_node) { + if (m_is_local_cache) { + std::cerr << "data store mode: local cache\n"; + } else if (m_super_node) { std::cerr << "data store mode: exchange_data via super nodes\n"; } else { std::cerr << "data store mode: exchange_data via individual samples\n"; @@ -1077,7 +1081,7 @@ void data_store_conduit::get_image_sizes(std::unordered_map &file_sizes const std::string fn = m_reader->get_file_dir() + '/' + image_list[h].first; std::ifstream in(fn.c_str()); if (!in) { - LBANN_ERROR("failed to open " + fn + " for reading"); + LBANN_ERROR("failed to open " + fn + " for reading; file_dir: " + m_reader->get_file_dir() + " fn: " + image_list[h].first + "; role: " + m_reader->get_role()); } in.seekg(0, std::ios::end); my_image_sizes.push_back(h); @@ -1138,8 +1142,24 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map &si size += t.second; } m_mem_seg_length = size; + + struct statvfs stat; + int x = statvfs("/dev/shm", &stat); + if (x != 0) { + LBANN_ERROR("statvfs failed\n"); + } + size_t avail_mem = stat.f_bsize*stat.f_bavail; + double percent = 100.0 * m_mem_seg_length / avail_mem; + std::stringstream msg; + msg << " size of required shared memory segment: " << m_mem_seg_length << "\n" + << " available mem: " << avail_mem << "\n" + << " required size is " << percent << " percent of available\n"; if (m_world_master) { - std::cout << "size of shared memory segment: " << m_mem_seg_length << std::endl; + std::cout << "\nShared memory segment statistics:\n" + << msg.str() << "\n"; + } + if (m_mem_seg_length >= avail_mem) { + LBANN_ERROR("insufficient available memory:\n" + msg.str()); } //need to ensure name is unique across all data readers @@ -1291,10 +1311,14 @@ void data_store_conduit::read_files(std::vector &work, std::unordered_map< if (m_output) { m_output << "data_store_conduit::read_files; requested work size: " << n << std::endl; } + if (m_world_master) { + std::cout << "data_store_conduit::read_files; requested work size: " << n << std::endl; + } image_data_reader *image_reader = dynamic_cast(m_reader); const std::vector &image_list = image_reader->get_image_list(); size_t offset = 0; + for (auto h : indices) { int s = sizes[h]; const std::string fn = m_reader->get_file_dir() + '/' + image_list[h].first; From 65ec9244c3fceef13216a08d8104b8a683d57107 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Fri, 26 Jul 2019 14:14:21 -0700 Subject: [PATCH 167/634] changed 'int' to 'size_t' in a couple of places to avoid over-run; added/modified print statement to cout. --- .../lbann/data_store/data_store_conduit.hpp | 2 +- src/data_store/data_store_conduit.cpp | 128 +++++++----------- 2 files changed, 49 insertions(+), 81 deletions(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index d35b65087e5..a3ba48cb151 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -306,7 +306,7 @@ protected : void exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices); /// for use in local cache mode - void fillin_shared_images(const std::vector &images, int offset); + void fillin_shared_images(const std::vector &images, size_t offset); /// for use in local cache mode char *m_mem_seg = 0; diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 5d1972acb67..585015b2b7c 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -237,8 +237,6 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: } void data_store_conduit::setup(int mini_batch_size) { - double tm1 = get_time(); - if (m_world_master) { std::cerr << "starting data_store_conduit::setup() for role: " << m_reader->get_role() << "\n"; if (m_is_local_cache) { @@ -250,8 +248,11 @@ void data_store_conduit::setup(int mini_batch_size) { } } + double tm1 = get_time(); if (!m_preload) { + if (m_world_master) std::cout << "calling build_owner_map\n"; build_owner_map(mini_batch_size); + if (m_world_master) std::cout << " build_owner_map time: " << (get_time()-tm1) << "\n"; } else { m_owner_map_mb_size = mini_batch_size; } @@ -1174,24 +1175,6 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map &si } m_comm->trainer_barrier(); - #if 0 - debug block; may go away - for (int i=0; iget_role() << "; m_rank_in_trainer: " << m_rank_in_trainer << std::endl; - system("ls -l /dev/shm"); - s << "rm -rf /dev/shm/" << m_seg_name; - system(s.str().c_str()); - std::cerr << "\nls -l /dev/shm; AFTER rm -rf; role: " << m_reader->get_role() << "; m_rank_in_trainer: " << m_rank_in_trainer << std::endl; - system("ls -l /dev/shm"); - } - } - m_comm->trainer_barrier(); - } - #endif - int shm_fd; if (node_id == 0) { @@ -1243,90 +1226,73 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map &si void data_store_conduit::preload_local_cache() { std::unordered_map file_sizes; std::vector> indices; - get_image_sizes(file_sizes, indices); - if (m_world_master) { - //verify that file_sizes map is correct - //verify that indices is correct - } + double tm1 = get_time(); + if (m_world_master) std::cout << "calling get_image_sizes" << std::endl; + get_image_sizes(file_sizes, indices); + if (m_world_master) std::cout << " get_image_sizes time: " << (get_time()-tm1) << std::endl; + tm1 = get_time(); + //indices[j] contains the indices (wrt m_reader->get_image_list()) + //that P_j will read from disk, and subsequently bcast to all others + // + //file_sizes maps an index to its file size + if (m_world_master) std::cout << "calling allocate_shared_segment" << std::endl; + allocate_shared_segment(file_sizes, indices); + if (m_world_master) std::cout << " allocate_shared_segment time: " << (get_time()-tm1) << std::endl; + tm1 = get_time(); + + if (m_world_master) std::cout << "calling 
read_files" << std::endl; std::vector work; read_files(work, file_sizes, indices[m_rank_in_trainer]); - allocate_shared_segment(file_sizes, indices); + if (m_world_master) std::cout << " read_files time: " << (get_time()- tm1) << std::endl; + tm1 = get_time(); + + if (m_world_master) std::cout << "calling compute_image_offsets" << std::endl; compute_image_offsets(file_sizes, indices); - exchange_images(work, file_sizes, indices); + if (m_world_master) std::cout << " compute_image_offsets time: " << (get_time()-tm1) << std::endl; + tm1 = get_time(); -#if 0 - if (m_world_master) { - //verify that images in shared segment are correct - image_data_reader *image_reader = dynamic_cast(m_reader); - const std::vector &image_list = image_reader->get_image_list(); - for (size_t h=0; hget_file_dir() + '/' + image_list[h].first; - std::cerr << "\nXX checking data_id " << h << " file: " << fn << "\n"; - std::ifstream in(fn, std::ios::in | std::ios::binary); - in.seekg(0, std::ios::end); - int n = in.tellg(); - in.seekg(0, std::ios::beg); - std::cerr << " XX file size: " << n << " from sizes map: " << file_sizes[h] << "\n"; - if (n != file_sizes[h]) { - LBANN_ERROR("n != sizes[h]"); - } - char *c = m_mem_seg + m_image_offsets[h]; - std::vector w(n); - in.read(w.data(), n); - in.close(); - for (int i=0; i &work, std::unordered_map &sizes, std::vector &indices) { - if (m_world_master) { - std::cout << "data_store_conduit: reading files for local_cache\n"; - } + + //reserve space for reading this proc's files into a contiguous memory space size_t n = 0; - for (auto t : indices) { - n += sizes[t]; + for (size_t j=0; j(m_reader); const std::vector &image_list = image_reader->get_image_list(); - size_t offset = 0; - for (auto h : indices) { - int s = sizes[h]; - const std::string fn = m_reader->get_file_dir() + '/' + image_list[h].first; + //read the images + size_t offset = 0; + if (m_world_master) std::cerr << " my num files: " << indices.size() << std::endl; + for (size_t j=0; jget_file_dir() + '/' + image_list[idx].first; std::ifstream in(fn, std::ios::in | std::ios::binary); in.read(work.data()+offset, s); in.close(); offset += s; } + if (m_world_master) std::cout << " finished reading files\n"; } void data_store_conduit::build_conduit_nodes(std::unordered_map &sizes) { @@ -1344,14 +1310,14 @@ void data_store_conduit::build_conduit_nodes(std::unordered_map &sizes) } } -void data_store_conduit::fillin_shared_images(const std::vector &images, int offset) { +void data_store_conduit::fillin_shared_images(const std::vector &images, size_t offset) { memcpy(m_mem_seg+offset, reinterpret_cast(images.data()), images.size()); } void data_store_conduit::exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices) { std::vector work2; int node_rank = m_comm->get_rank_in_node(); - int offset = 0; + size_t offset = 0; for (int p=0; ptrainer_broadcast(p, work.data(), work.size()); @@ -1359,7 +1325,7 @@ void data_store_conduit::exchange_images(std::vector &work, std::unordered fillin_shared_images(work, offset); } } else { - int sz = 0; + size_t sz = 0; for (auto idx : indices[p]) { sz += image_sizes[idx]; } @@ -1369,9 +1335,11 @@ void data_store_conduit::exchange_images(std::vector &work, std::unordered fillin_shared_images(work2, offset); } } + for (size_t r=0; rbarrier(m_comm->get_node_comm()); From 3bbc9ff6f0c8217a5570c57ad3bb0533dda8f7bb Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Sun, 28 Jul 2019 07:22:42 -0700 Subject: [PATCH 168/634] previously images were loaded per the data reader's image list; this resulted in unnecessary work and memory, esp. when using less than 100% of the list. This is now fixed so that images are loaded per the shuffled indices list. --- src/data_store/data_store_conduit.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 585015b2b7c..827bee27ea3 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -1078,14 +1078,14 @@ void data_store_conduit::get_image_sizes(std::unordered_map &file_sizes // get sizes of files for which I'm responsible std::vector my_image_sizes; - for (size_t h=m_rank_in_trainer; hget_file_dir() + '/' + image_list[h].first; + for (size_t h=m_rank_in_trainer; hsize(); h += m_np_in_trainer) { + const std::string fn = m_reader->get_file_dir() + '/' + image_list[(*m_shuffled_indices)[h]].first; std::ifstream in(fn.c_str()); if (!in) { LBANN_ERROR("failed to open " + fn + " for reading; file_dir: " + m_reader->get_file_dir() + " fn: " + image_list[h].first + "; role: " + m_reader->get_role()); } in.seekg(0, std::ios::end); - my_image_sizes.push_back(h); + my_image_sizes.push_back((*m_shuffled_indices)[h]); my_image_sizes.push_back(in.tellg()); in.close(); } From f309bfb91b4dc3096cf1bf5b4a0921ab6bc782a8 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 29 Jul 2019 10:18:56 -0700 Subject: [PATCH 169/634] Disable signal handler in Python worker processes (#1129) * Disabling signal handler in worker processes for Python data reader. * Typo. --- src/data_readers/data_reader_python.cpp | 36 +++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/src/data_readers/data_reader_python.cpp b/src/data_readers/data_reader_python.cpp index 65a9c109a4c..09cad0e1c0b 100644 --- a/src/data_readers/data_reader_python.cpp +++ b/src/data_readers/data_reader_python.cpp @@ -278,9 +278,41 @@ def @wrapper_func@(sample_index, array_offset): = PyObject_GetAttrString(main_module, wrapper_func_name.c_str()); + // Create initializer function for worker processes + const std::string init_func_name + = "_DATA_READER_PYTHON_CPP_init_function"; + std::string init_func_def = R"( +def @init_func@(): + """Initialize worker process. + + Disables the LBANN signal handler since it reports a spurious error + when the worker process recieves SIGTERM from the master process. 
+ + """ + + # Disable LBANN signal handler + import signal + for sig in range(signal.NSIG): + try: + signal.signal(sig, signal.SIG_DFL) + pass + except: pass +)"; + init_func_def = std::regex_replace(init_func_def, + std::regex("\\@init_func\\@"), + init_func_name); + PyRun_SimpleString(init_func_def.c_str()); + python::session::check_error(); + python::object init_func + = PyObject_GetAttrString(main_module, + init_func_name.c_str()); + // Start Python process pool - m_process_pool = PyObject_CallMethod(multiprocessing_module, "Pool", - "(L)", num_io_threads); + m_process_pool = PyObject_CallMethod(multiprocessing_module, + "Pool", + "(L,O)", + num_io_threads, + init_func.get()); } From 424bd7a0fbed38f04458f4007679d610ded2a62f Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 26 Jul 2019 10:00:22 -0700 Subject: [PATCH 170/634] Switch to python3 --- bamboo/compiler_tests/conftest.py | 4 ++-- bamboo/compiler_tests/test_compiler.py | 4 ++-- bamboo/integration_tests/common_code.py | 2 +- bamboo/integration_tests/conftest.py | 4 ++-- bamboo/run.sh | 8 +------- bamboo/unit_tests/conftest.py | 4 ++-- 6 files changed, 10 insertions(+), 16 deletions(-) diff --git a/bamboo/compiler_tests/conftest.py b/bamboo/compiler_tests/conftest.py index 238b812e638..9f137a01527 100644 --- a/bamboo/compiler_tests/conftest.py +++ b/bamboo/compiler_tests/conftest.py @@ -4,9 +4,9 @@ def pytest_addoption(parser): cluster = re.sub('[0-9]+', '', subprocess.check_output( - 'hostname'.split()).strip()) + 'hostname'.split()).decode('utf-8').strip()) default_dirname = subprocess.check_output( - 'git rev-parse --show-toplevel'.split()).strip() + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. Default the current cluster') parser.addoption('--dirname', action='store', default=default_dirname, diff --git a/bamboo/compiler_tests/test_compiler.py b/bamboo/compiler_tests/test_compiler.py index 5c8be7bee7a..bf3c58e3109 100644 --- a/bamboo/compiler_tests/test_compiler.py +++ b/bamboo/compiler_tests/test_compiler.py @@ -150,13 +150,13 @@ def build_skeleton(dir_name, compiler, debug, should_log): error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_build_error.txt' % (dir_name, compiler_underscored, build_type) compiler = compiler.replace('@', '-') #mpi_lib = mpi_lib.replace('@', '-') - cluster = re.sub('[0-9]+', '', subprocess.check_output('hostname'.split()).strip()) + cluster = re.sub('[0-9]+', '', subprocess.check_output('hostname'.split()).decode('utf-8').strip()) # For reference: # Commenting out for now. These additions to path name will likely return # one day, so I am not removing them entirely. 
# x86_64 <=> catalyst, pascal # ppc64le <=> ray - #architecture = subprocess.check_output('uname -m'.split()).strip() + #architecture = subprocess.check_output('uname -m'.split()).decode('utf-8').strip() #if cluster == 'ray': # architecture += '_gpu_cuda-9.2.64_cudnn-7.0' #elif cluster == 'pascal': diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index 37da5fa3850..da6c0beb39a 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -226,7 +226,7 @@ def skeleton(cluster, dir_name, executable, model_folder, model_name, def csv_to_dict(csv_path): with open(csv_path, 'r') as csv_file: reader = csv.reader(csv_file, skipinitialspace=True) - column_headers = reader.next() + column_headers = next(reader) values = {} for row in reader: row_header = row[0] diff --git a/bamboo/integration_tests/conftest.py b/bamboo/integration_tests/conftest.py index d71b3987918..09c52c5119e 100644 --- a/bamboo/integration_tests/conftest.py +++ b/bamboo/integration_tests/conftest.py @@ -6,9 +6,9 @@ def pytest_addoption(parser): cluster = re.sub('[0-9]+', '', subprocess.check_output( - 'hostname'.split()).strip()) + 'hostname'.split()).decode('utf-8').strip()) default_dirname = subprocess.check_output( - 'git rev-parse --show-toplevel'.split()).strip() + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() default_exes = tools.get_default_exes(default_dirname, cluster) parser.addoption('--cluster', action='store', default=cluster, diff --git a/bamboo/run.sh b/bamboo/run.sh index a671532e4dd..7ce3597a0bd 100755 --- a/bamboo/run.sh +++ b/bamboo/run.sh @@ -5,13 +5,7 @@ CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') echo "run.sh CLUSTER=" echo $CLUSTER -if [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'lassen' ]; then - PYTHON=python -fi - -if [ "${CLUSTER}" = 'pascal' ]; then - PYTHON=$bamboo_PYTHON_x86_gpu/python -fi +PYTHON=python3 WEEKLY=0 while :; do diff --git a/bamboo/unit_tests/conftest.py b/bamboo/unit_tests/conftest.py index ef6b449b246..12c5bf457ec 100644 --- a/bamboo/unit_tests/conftest.py +++ b/bamboo/unit_tests/conftest.py @@ -5,9 +5,9 @@ def pytest_addoption(parser): cluster = re.sub('[0-9]+', '', subprocess.check_output( - 'hostname'.split()).strip()) + 'hostname'.split()).decode('utf-8').strip()) default_dirname = subprocess.check_output( - 'git rev-parse --show-toplevel'.split()).strip() + 'git rev-parse --show-toplevel'.split()).decode('utf-8').strip() default_exes = tools.get_default_exes(default_dirname, cluster) parser.addoption('--cluster', action='store', default=cluster, From b5c460fbdc7f42661c4f8d88f506775a8916f90f Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Tue, 30 Jul 2019 15:42:28 -0700 Subject: [PATCH 171/634] update the proto inputs --- src/proto/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/proto/CMakeLists.txt b/src/proto/CMakeLists.txt index fa9473100e5..6223ba4725f 100644 --- a/src/proto/CMakeLists.txt +++ b/src/proto/CMakeLists.txt @@ -21,8 +21,8 @@ if (LBANN_HAS_PROTOBUF) "-I" "${CMAKE_CURRENT_SOURCE_DIR}" "${PROTO_INPUTS}" OUTPUT ${PROTO_SRCS} ${PROTO_HDRS} ${PROTO_PY} - DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/lbann.proto" protobuf::protoc - COMMENT "Running protoc on lbann.proto" + DEPENDS ${PROTO_INPUTS} protobuf::protoc + COMMENT "Running protoc on LBANN protobuf sources." 
COMMAND_EXPAND_LISTS VERBATIM) From f7c7dda47dc9759230216f0c2dbe29702748949f Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Tue, 30 Jul 2019 17:07:28 -0700 Subject: [PATCH 172/634] remove a warning about a missing override --- include/lbann/layers/learning/entrywise_scale_bias.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/lbann/layers/learning/entrywise_scale_bias.hpp b/include/lbann/layers/learning/entrywise_scale_bias.hpp index 7fa6af7deb0..528ef75bab8 100644 --- a/include/lbann/layers/learning/entrywise_scale_bias.hpp +++ b/include/lbann/layers/learning/entrywise_scale_bias.hpp @@ -153,7 +153,7 @@ class entrywise_scale_bias_layer : public Layer { } - void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { Layer::bp_setup_gradient_wrt_inputs(mini_batch_size); m_weights_gradient->Empty(false); m_weights_gradient->AlignWith(get_prev_activations()); From 754649c8c69141bca4f499452905039ba765e569 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Tue, 30 Jul 2019 17:43:30 -0700 Subject: [PATCH 173/634] split layers.proto from lbann.proto; remove motifs Protobuf and C++ changes only; python changes forthcoming --- include/lbann/proto/proto_common.hpp | 5 - src/proto/CMakeLists.txt | 2 +- src/proto/layers.proto | 572 +++++++++++++++++++++++++++ src/proto/lbann.proto | 569 +------------------------- src/proto/proto_common.cpp | 27 -- 5 files changed, 574 insertions(+), 601 deletions(-) create mode 100644 src/proto/layers.proto diff --git a/include/lbann/proto/proto_common.hpp b/include/lbann/proto/proto_common.hpp index 6414f8bcb94..261f18344d4 100644 --- a/include/lbann/proto/proto_common.hpp +++ b/include/lbann/proto/proto_common.hpp @@ -33,11 +33,6 @@ namespace lbann { -/** @brief Returns true if the Model contains at least one MotifLayer */ -bool has_motifs(const lbann_comm& comm, const lbann_data::LbannPB& p); - -void expand_motifs(const lbann_comm& comm, lbann_data::LbannPB& pb); - /** @brief Customize the name of the index list * * The following options are available diff --git a/src/proto/CMakeLists.txt b/src/proto/CMakeLists.txt index 6223ba4725f..e4075fba2ef 100644 --- a/src/proto/CMakeLists.txt +++ b/src/proto/CMakeLists.txt @@ -6,7 +6,7 @@ if (LBANN_HAS_PROTOBUF) # implementation of "protobuf_generate_cpp" but it gives us a custom # command on which we can depend. Using this, when lbann.proto is # touched, CMake will rebuild the LbannProto library. - set_full_path(PROTO_INPUTS lbann.proto callbacks.proto) + set_full_path(PROTO_INPUTS lbann.proto callbacks.proto layers.proto) foreach (proto IN LISTS PROTO_INPUTS) get_filename_component(name "${proto}" NAME_WE) diff --git a/src/proto/layers.proto b/src/proto/layers.proto new file mode 100644 index 00000000000..f365c2f47de --- /dev/null +++ b/src/proto/layers.proto @@ -0,0 +1,572 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +syntax = "proto3"; + +package lbann_data; + +message Layer { + string name = 50; + string parents = 151; + string children = 152; + string data_layout = 52; + string device_allocation = 55; + string weights = 54; + bool num_neurons_from_data_reader = 53; + bool freeze = 5; + string hint_layer = 56; + + repeated WeightsData weights_data = 153; + string top = 154; + string bottom = 155; + string type = 156; + + // a Layer should contain exactly one of the following + // (this may or may not be properly checked for in proto_common.cpp) + // + // @todo: this should be done better using oneof: + // oneof a_layer { + // Reshape reshape = 306 + // Pooling pooling = 12; + // ... + // } + // + // + + oneof layer_type { + // Input layers + Input input = 2; + + // Transform layers + Reshape reshape = 306; + Pooling pooling = 12; + Concatenation concatenation = 300; + Slice slice = 301; + Split split = 302; + Sum sum = 303; + WeightedSum weighted_sum = 323; + Unpooling unpooling = 304; + Hadamard hadamard = 308; + Constant constant = 309; + Reduction reduction = 310; + Evaluation evaluation = 311; + Gaussian gaussian = 312; + Bernoulli bernoulli = 313; + Uniform uniform = 314; + Crop crop = 316; + CategoricalRandom categorical_random = 317; + DiscreteRandom discrete_random = 318; + Dummy dummy = 319; + StopGradient stop_gradient = 320; + InTopK in_top_k = 324; + Sort sort = 325; + WeightsLayer weights_layer = 326; + Tessellate tessellate = 327; + + // Learning layers + FullyConnected fully_connected = 11; + Convolution convolution = 13; + Deconvolution deconvolution = 305; + Embedding embedding = 328; + ChannelwiseScaleBias channelwise_scale_bias = 329; + EntrywiseScaleBias entrywise_scale_bias = 330; + + // Loss layers + CrossEntropy cross_entropy = 60; + MeanSquaredError mean_squared_error = 61; + MeanAbsoluteError mean_absolute_error = 62; + CategoricalAccuracy categorical_accuracy = 63; + TopKCategoricalAccuracy top_k_categorical_accuracy = 64; + L2Norm2 l2_norm2 = 65; + L1Norm l1_norm = 66; + BinaryCrossEntropy binary_cross_entropy = 67; + SigmoidBinaryCrossEntropy sigmoid_binary_cross_entropy = 68; + BooleanAccuracy boolean_accuracy = 69; + BooleanFalseNegative boolean_false_negative = 70; + BooleanFalsePositive boolean_false_positive = 71; + + // Math layers + LogicalNot logical_not = 401; + Abs abs = 402; + Negative negative = 403; + Sign sign = 404; + Round round = 405; + Ceil ceil = 406; + Floor floor = 407; + Reciprocal reciprocal = 408; + Square square = 409; + Sqrt sqrt = 410; + Rsqrt rsqrt = 411; + SafeReciprocal safe_reciprocal = 412; + Exp exp = 413; + Expm1 expm1 = 414; + Log log = 415; + Log1p log1p = 416; + Cos cos = 417; + Sin sin = 418; + Tan tan = 419; + Acos acos = 420; + Asin asin = 421; + Atan atan = 422; + Cosh cosh = 423; + Sinh sinh = 424; + Tanh tanh = 425; + Acosh acosh = 426; + Asinh asinh = 427; + Atanh atanh = 428; + Add add = 
450; + Subtract subtract = 451; + Multiply multiply = 452; + Divide divide = 453; + Mod mod = 454; + Pow pow = 455; + SafeDivide safe_divide = 456; + SquaredDifference squared_difference = 457; + Max max = 458; + Min min = 459; + Equal equal = 460; + NotEqual not_equal = 461; + Less less = 462; + LessEqual less_equal = 463; + Greater greater = 464; + GreaterEqual greater_equal = 465; + LogicalAnd logical_and = 466; + LogicalOr logical_or = 467; + LogicalXor logical_xor = 468; + Clamp clamp = 469; + + // Regularization layers + BatchNormalization batch_normalization = 19; + LocalResponseNormalization local_response_normalization = 20; + Dropout dropout = 21; + SeluDropout selu_dropout = 229; + + // Activation layers + Elu elu = 200; + Identity identity = 201; + LeakyRelu leaky_relu = 202; + LogSigmoid log_sigmoid = 203; + LogSoftmax log_softmax = 204; + Relu relu = 205; + Selu selu = 206; + Sigmoid sigmoid = 207; + Softmax softmax = 208; + Softplus softplus = 209; + Softsign softsign = 210; + + // Image layers + BilinearResize bilinear_resize = 500; + + // Miscellaneous layers + Covariance covariance = 600; + Variance variance = 601; + ChannelwiseMean channelwise_mean = 602; + MiniBatchIndex mini_batch_index = 603; + MiniBatchSize mini_batch_size = 604; + } + + /////////////////////// + // Math layers // + /////////////////////// + message LogicalNot {} + message Abs {} + message Negative {} + message Sign {} + message Round {} + message Ceil {} + message Floor {} + message Reciprocal {} + message Square {} + message Sqrt {} + message Rsqrt {} + message SafeReciprocal {} + message Exp {} + message Expm1 {} + message Log {} + message Log1p {} + message Cos {} + message Sin {} + message Tan {} + message Acos {} + message Asin {} + message Atan {} + message Cosh {} + message Sinh {} + message Tanh {} + message Acosh {} + message Asinh {} + message Atanh {} + message Add {} + message Subtract {} + message Multiply {} + message Divide {} + message Mod {} + message Pow {} + message SafeDivide {} + message SquaredDifference {} + message Max {} + message Min {} + message Equal {} + message NotEqual {} + message Less {} + message LessEqual {} + message Greater {} + message GreaterEqual {} + message LogicalAnd {} + message LogicalOr {} + message LogicalXor {} + message Clamp { + double min = 1; + double max = 2; + } + + /////////////////////// + // Activation layers // + /////////////////////// + message Elu { + double alpha = 1; //default: 1.0; should be >= 0 + } + message Identity {} + message LeakyRelu { + double negative_slope = 1; //default: 0.01 + } + message LogSigmoid {} + message LogSoftmax {} + message Relu {} + message Selu {} + message Sigmoid {} + message Softmax {} + message Softplus {} + message Softsign {} + + /////////////////////// + // Loss layers // + /////////////////////// + message CrossEntropy {} + message MeanSquaredError {} + message MeanAbsoluteError {} + message CategoricalAccuracy {} + message TopKCategoricalAccuracy { + int64 k = 1; + } + message L2Norm2 {} + message L1Norm {} + message BinaryCrossEntropy {} + message SigmoidBinaryCrossEntropy {} + message BooleanAccuracy {} + message BooleanFalseNegative {} + message BooleanFalsePositive {} + + /////////////////////////// + // Regularization layers // + /////////////////////////// + message BatchNormalization { + double decay = 1; //default: 0.9 + double scale_init = 2; //default: 1.0 + double bias_init = 3; //default: 0.0 + double epsilon = 4; //default: 1e-5 + string stats_aggregation = 5; // default: local; 
deprecated + // default: 1 (local aggregation); set to a negative value for global stats. + int64 statistics_group_size = 6; + } + + message SeluDropout { + double keep_prob = 2; //default: 0.95 + double alpha = 3; //default: 1.6732632423543772848170429916717 + double scale = 4; //default: 1.0507009873554804934193349852946 + } + + message LocalResponseNormalization { + int64 window_width = 4; + double lrn_alpha = 5; + double lrn_beta = 6; + double lrn_k = 7; + } + + message Dropout { + double keep_prob = 2; //default: 0.5 + } + + ////////////////// + // Input layers // + ////////////////// + message Input { + bool data_set_per_model = 1; // Default: false + string io_buffer = 2; // Options: "partitioned" (default) + string target_mode = 3; // Options: "classification" (default), "regression", "reconstruction", "N/A" + } + + ////////////////////// + // Transform layers // + ////////////////////// + message Reshape { + int64 num_dims = 1; //DEPRECATED + string dims = 2; //should be space-separated list of ints, e.g, "2 6 7" + } + + message Pooling { + int64 num_dims = 1; + + bool has_vectors = 2; + + //these are used if has_vectors = true + string pool_dims = 4; //should be space-separated list, e.g, "2 2 3" + string pool_pads = 5; //should be space-separated list, e.g, "2 2 3" + string pool_strides = 6; //should be space-separated list, e.g, "2 2 3" + + //these are used if has_vectors = false + int64 pool_dims_i = 10; + int64 pool_pads_i = 11; + int64 pool_strides_i = 12; + + //pool_mode should be one of: max, average, average_no_pad + //see: lbann/include/lbann/lbann_base.hpp + string pool_mode = 7; + } + + message Unpooling { + int64 num_dims = 1; + string pooling_layer = 13; //should be name of the pooling layer + } + + + message Concatenation { + int64 axis = 1; + } + + message Slice { + int64 axis = 1; + string slice_points = 2; //should be space-separated list of ints, e.g, "2 6 7" + //the following is for jag_conduit_hdf5; + string get_slice_points_from_reader = 4; + bool get_slice_points_from_reader_bool = 5; + } + + message Split { + } + + message Sum { + } + + message WeightedSum { + string scaling_factors = 1; + //should be a space-separated list of doubles, e.g. 
"1.0 2.0 -1.0" + } + + message Hadamard { + } + + message Constant { + double value=1; + string num_neurons=2; + } + + message Reduction { + string mode=1; //"sum" or "average" + } + + message Evaluation { + } + + message Gaussian { + double mean = 1; + double stdev = 2; + string neuron_dims = 3; + } + + message Bernoulli { + double prob = 1; + string neuron_dims = 2; + } + + message Uniform { + double min = 1; + double max = 2; + string neuron_dims = 3; + } + + + message Crop { + string dims = 3; + } + + message CategoricalRandom { + } + + message DiscreteRandom { + string values = 1; + string dims = 2; + } + + message Dummy { + } + + message StopGradient { + } + + message InTopK { + int64 k = 1; + } + + message Sort { + bool descending = 1; + } + + message WeightsLayer { + string dims = 1; + } + + message Tessellate { + string dims = 1; + } + + ///////////////////// + // Learning layers // + ///////////////////// + message FullyConnected { + int64 num_neurons = 1; + string weight_initialization = 2; //DEPRECATED + bool has_bias = 3; //default: true + double bias_initial_value = 4; //default: 0 + double l2_regularization_factor = 5; //default: 0 + double group_lasso_regularization_factor = 6; //default: 0 + bool transpose = 7; + bool num_neurons_is_num_labels = 8; + + bool get_input_dimension_from_reader = 9; + bool get_image_and_scalar_dimension_from_reader = 10; + bool get_image_dimension_from_reader = 11; + bool get_scalar_dimension_from_reader = 12; + repeated uint32 get_num_neurons_of_slice_from_reader = 13; + string get_slice_points_from_reader = 14; + } + + message Convolution { + int64 num_dims = 1; + int64 num_output_channels = 4; + int64 num_groups = 3; + + bool has_vectors = 2; + + // these are used if has_vector = true + string conv_dims = 5; //should be space-separated list, e.g, "2 2 3" + string conv_pads = 6; //should be space-separated list, e.g, "2 2 3" + string conv_strides = 7; //should be space-separated list, e.g, "2 2 3" + string conv_dilations = 8; //should be space-separated list, e.g. "2 3 3" + + // these are used if has_vector = false + int64 conv_dims_i = 50; + int64 conv_pads_i = 60; + int64 conv_strides_i = 70; + int64 conv_dilations_i = 80; + + string weight_initialization = 9; //DEPRECATED + bool has_bias = 10; //default: true + double bias_initial_value = 11; //default: 0 + double l2_regularization_factor = 12; //default: 0 + } + + message Deconvolution { + int64 num_dims = 1; + int64 num_output_channels = 4; + int64 num_groups = 3; + + bool has_vectors = 2; + + // these are used if has_vector = true + string conv_dims = 5; //should be space-separated list, e.g, "2 2 3" + string conv_pads = 6; //should be space-separated list, e.g, "2 2 3" + string conv_strides = 7; //should be space-separated list, e.g, "2 2 3" + string conv_dilations = 8; //should be space-separated list, e.g. 
"2 3 3" + + // these are used if has_vector = false + int64 conv_dims_i = 50; + int64 conv_pads_i = 60; + int64 conv_strides_i = 70; + int64 conv_dilations_i = 80; + + string weight_initialization = 9; //DEPRECATED + bool has_bias = 10; //default: true + double bias_initial_value = 11; //default: 0 + double l2_regularization_factor = 12; //default: 0 + } + + message Embedding { + int64 dictionary_size = 1; + int64 embedding_size = 2; + } + + message ChannelwiseScaleBias {} + message EntrywiseScaleBias {} + + ////////////////// + // Image layers // + ////////////////// + message BilinearResize { + int64 height = 1; + int64 width = 2; + } + + ////////////////////////// + // Miscellaneous layers // + ////////////////////////// + message Covariance { + bool biased = 1; //Whether to use a biased covariance estimate + } + message Variance { + bool biased = 1; //Whether to use a biased variance estimate + } + message ChannelwiseMean {} + message MiniBatchIndex {} + message MiniBatchSize {} +}// message Layer + +//note: I'd like to put this enum inside of Layer, but if I do the enum values +// become, e.g, Layer_Imcomm_EXCLUDE, which is just ugly +enum Imcomm { + DEFAULT = 0; //add Layer to Imcomm callback if all_learning_layers = true in + //the CallbackImComm + EXCLUDE = 1; //*do not* add Layer to Imcomm callback if all_learning_layers = true in + //the CallbackImComm + INCLUDE = 2; //add Layer to Imcomm callback regardless of whether all_learning_layers + //in the CallbackImComm is set to true or false +} + +// Weight data for exporting +message WeightsShape { + repeated int64 dim = 1 [packed = true]; +} + +message WeightsData { + WeightsShape shape = 5; + string name = 1; + int64 height = 2; + int64 width = 3; + //@todo assume float above, add other datatype + repeated float data = 4 [packed=true]; + + Imcomm imcomm = 55; +} diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 6ef43542276..611f2a5b349 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -29,12 +29,12 @@ syntax = "proto3"; package lbann_data; import "callbacks.proto"; +import "layers.proto"; message LbannPB { DataReader data_reader = 1; Model model = 2; Optimizer optimizer = 3; - MotifDefinitions motif_definitions = 4; DataSetMetaData data_set_metadata = 5; } @@ -470,570 +470,3 @@ message HeNormalInitializer {} message HeUniformInitializer {} message LeCunNormalInitializer {} message LeCunUniformInitializer {} - -//note: I'd like to put this enum inside of Layer, but if I do the enum values -// become, e.g, Layer_Imcomm_EXCLUDE, which is just ugly -enum Imcomm { - DEFAULT = 0; //add Layer to Imcomm callback if all_learning_layers = true in - //the CallbackImComm - EXCLUDE = 1; //*do not* add Layer to Imcomm callback if all_learning_layers = true in - //the CallbackImComm - INCLUDE = 2; //add Layer to Imcomm callback regardless of whether all_learning_layers - //in the CallbackImComm is set to true or false -} - -// Weight data for exporting -message WeightsShape { - repeated int64 dim = 1 [packed = true]; -} -message WeightsData { - WeightsShape shape = 5; - string name = 1; - int64 height = 2; - int64 width = 3; - //@todo assume float above, add other datatype - repeated float data = 4 [packed=true]; - - Imcomm imcomm = 55; -} - -//======================================================================== -// MotifDefinitions -//======================================================================== - -message MotifDefinitions { - repeated Motif motif = 1; -} - -message Motif { - string name = 1; - 
repeated Layer layer = 2; -} - -//======================================================================== -// Layers -//======================================================================== - -message Layer { - string name = 50; - string parents = 151; - string children = 152; - string data_layout = 52; - string device_allocation = 55; - string weights = 54; - bool num_neurons_from_data_reader = 53; - bool freeze = 5; - string hint_layer = 56; - - repeated WeightsData weights_data = 153; - string top = 154; - string bottom = 155; - string type = 156; - - // a Layer should contain exactly one of the following - // (this may or may not be properly checked for in proto_common.cpp) - // - // @todo: this should be done better using oneof: - // oneof a_layer { - // Reshape reshape = 306 - // Pooling pooling = 12; - // ... - // } - // - // - - // motif layer - MotifLayer motif_layer = 4; - - // Input layers - Input input = 2; - - // Transform layers - Reshape reshape = 306; - Pooling pooling = 12; - Concatenation concatenation = 300; - Slice slice = 301; - Split split = 302; - Sum sum = 303; - WeightedSum weighted_sum = 323; - Unpooling unpooling = 304; - Hadamard hadamard = 308; - Constant constant = 309; - Reduction reduction = 310; - Evaluation evaluation = 311; - Gaussian gaussian = 312; - Bernoulli bernoulli = 313; - Uniform uniform = 314; - Crop crop = 316; - CategoricalRandom categorical_random = 317; - DiscreteRandom discrete_random = 318; - Dummy dummy = 319; - StopGradient stop_gradient = 320; - InTopK in_top_k = 324; - Sort sort = 325; - WeightsLayer weights_layer = 326; - Tessellate tessellate = 327; - - // Learning layers - FullyConnected fully_connected = 11; - Convolution convolution = 13; - Deconvolution deconvolution = 305; - Embedding embedding = 328; - ChannelwiseScaleBias channelwise_scale_bias = 329; - EntrywiseScaleBias entrywise_scale_bias = 330; - - // Loss layers - CrossEntropy cross_entropy = 60; - MeanSquaredError mean_squared_error = 61; - MeanAbsoluteError mean_absolute_error = 62; - CategoricalAccuracy categorical_accuracy = 63; - TopKCategoricalAccuracy top_k_categorical_accuracy = 64; - L2Norm2 l2_norm2 = 65; - L1Norm l1_norm = 66; - BinaryCrossEntropy binary_cross_entropy = 67; - SigmoidBinaryCrossEntropy sigmoid_binary_cross_entropy = 68; - BooleanAccuracy boolean_accuracy = 69; - BooleanFalseNegative boolean_false_negative = 70; - BooleanFalsePositive boolean_false_positive = 71; - - // Math layers - LogicalNot logical_not = 401; - Abs abs = 402; - Negative negative = 403; - Sign sign = 404; - Round round = 405; - Ceil ceil = 406; - Floor floor = 407; - Reciprocal reciprocal = 408; - Square square = 409; - Sqrt sqrt = 410; - Rsqrt rsqrt = 411; - SafeReciprocal safe_reciprocal = 412; - Exp exp = 413; - Expm1 expm1 = 414; - Log log = 415; - Log1p log1p = 416; - Cos cos = 417; - Sin sin = 418; - Tan tan = 419; - Acos acos = 420; - Asin asin = 421; - Atan atan = 422; - Cosh cosh = 423; - Sinh sinh = 424; - Tanh tanh = 425; - Acosh acosh = 426; - Asinh asinh = 427; - Atanh atanh = 428; - Add add = 450; - Subtract subtract = 451; - Multiply multiply = 452; - Divide divide = 453; - Mod mod = 454; - Pow pow = 455; - SafeDivide safe_divide = 456; - SquaredDifference squared_difference = 457; - Max max = 458; - Min min = 459; - Equal equal = 460; - NotEqual not_equal = 461; - Less less = 462; - LessEqual less_equal = 463; - Greater greater = 464; - GreaterEqual greater_equal = 465; - LogicalAnd logical_and = 466; - LogicalOr logical_or = 467; - LogicalXor logical_xor 
= 468; - Clamp clamp = 469; - - // Regularization layers - BatchNormalization batch_normalization = 19; - LocalResponseNormalization local_response_normalization = 20; - Dropout dropout = 21; - SeluDropout selu_dropout = 229; - - // Activation layers - Elu elu = 200; - Identity identity = 201; - LeakyRelu leaky_relu = 202; - LogSigmoid log_sigmoid = 203; - LogSoftmax log_softmax = 204; - Relu relu = 205; - Selu selu = 206; - Sigmoid sigmoid = 207; - Softmax softmax = 208; - Softplus softplus = 209; - Softsign softsign = 210; - - // Image layers - BilinearResize bilinear_resize = 500; - - // Miscellaneous layers - Covariance covariance = 600; - Variance variance = 601; - ChannelwiseMean channelwise_mean = 602; - MiniBatchIndex mini_batch_index = 603; - MiniBatchSize mini_batch_size = 604; - -} -/////////////////////// -// MotifLayer // -/////////////////////// -message MotifLayer { - string motif_id = 1; - repeated string variable = 2; -} - -/////////////////////// -// Math layers // -/////////////////////// -message LogicalNot {} -message Abs {} -message Negative {} -message Sign {} -message Round {} -message Ceil {} -message Floor {} -message Reciprocal {} -message Square {} -message Sqrt {} -message Rsqrt {} -message SafeReciprocal {} -message Exp {} -message Expm1 {} -message Log {} -message Log1p {} -message Cos {} -message Sin {} -message Tan {} -message Acos {} -message Asin {} -message Atan {} -message Cosh {} -message Sinh {} -message Tanh {} -message Acosh {} -message Asinh {} -message Atanh {} -message Add {} -message Subtract {} -message Multiply {} -message Divide {} -message Mod {} -message Pow {} -message SafeDivide {} -message SquaredDifference {} -message Max {} -message Min {} -message Equal {} -message NotEqual {} -message Less {} -message LessEqual {} -message Greater {} -message GreaterEqual {} -message LogicalAnd {} -message LogicalOr {} -message LogicalXor {} -message Clamp { - double min = 1; - double max = 2; -} - -/////////////////////// -// Activation layers // -/////////////////////// -message Elu { - double alpha = 1; //default: 1.0; should be >= 0 -} -message Identity {} -message LeakyRelu { - double negative_slope = 1; //default: 0.01 -} -message LogSigmoid {} -message LogSoftmax {} -message Relu {} -message Selu {} -message Sigmoid {} -message Softmax {} -message Softplus {} -message Softsign {} - -/////////////////////// -// Loss layers // -/////////////////////// -message CrossEntropy {} -message MeanSquaredError {} -message MeanAbsoluteError {} -message CategoricalAccuracy {} -message TopKCategoricalAccuracy { - int64 k = 1; -} -message L2Norm2 {} -message L1Norm {} -message BinaryCrossEntropy {} -message SigmoidBinaryCrossEntropy {} -message BooleanAccuracy {} -message BooleanFalseNegative {} -message BooleanFalsePositive {} - -/////////////////////////// -// Regularization layers // -/////////////////////////// -message BatchNormalization { - double decay = 1; //default: 0.9 - double scale_init = 2; //default: 1.0 - double bias_init = 3; //default: 0.0 - double epsilon = 4; //default: 1e-5 - string stats_aggregation = 5; // default: local; deprecated - // default: 1 (local aggregation); set to a negative value for global stats. 
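
Editor's note: the deprecation comment above carries over verbatim into the new layers.proto: the string field stats_aggregation is superseded by statistics_group_size, where 1 means local aggregation and any negative value requests global statistics. Because the new file also wraps every per-layer message in the oneof named layer_type (replacing the flat sibling fields being removed here, which had to be checked by hand in proto_common.cpp), selecting a layer variant and configuring it is a single chain of field sets in the generated API. A minimal sketch in Python, assuming only the standard protobuf-generated API; the module name layers_pb2 is taken from the front-end imports in patch 176 below:

    from lbann import layers_pb2

    layer = layers_pb2.Layer()
    layer.name = 'bn1'
    layer.batch_normalization.decay = 0.9
    layer.batch_normalization.statistics_group_size = -1  # negative => global stats
    # Setting any field of a variant makes it the active member of the oneof:
    assert layer.WhichOneof('layer_type') == 'batch_normalization'

Setting a field of a different variant (e.g. layer.relu.SetInParent()) would automatically clear batch_normalization, which is exactly the "exactly one of the following" invariant the old comment asked for.
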
- int64 statistics_group_size = 6; -} - -message SeluDropout { - double keep_prob = 2; //default: 0.95 - double alpha = 3; //default: 1.6732632423543772848170429916717 - double scale = 4; //default: 1.0507009873554804934193349852946 -} - -message LocalResponseNormalization { - int64 window_width = 4; - double lrn_alpha = 5; - double lrn_beta = 6; - double lrn_k = 7; -} - -message Dropout { - double keep_prob = 2; //default: 0.5 -} - -////////////////// -// Input layers // -////////////////// -message Input { - bool data_set_per_model = 1; // Default: false - string io_buffer = 2; // Options: "partitioned" (default) - string target_mode = 3; // Options: "classification" (default), "regression", "reconstruction", "N/A" -} - -////////////////////// -// Transform layers // -////////////////////// -message Reshape { - int64 num_dims = 1; //DEPRECATED - string dims = 2; //should be space-separated list of ints, e.g, "2 6 7" -} - -message Pooling { - int64 num_dims = 1; - - bool has_vectors = 2; - - //these are used if has_vectors = true - string pool_dims = 4; //should be space-separated list, e.g, "2 2 3" - string pool_pads = 5; //should be space-separated list, e.g, "2 2 3" - string pool_strides = 6; //should be space-separated list, e.g, "2 2 3" - - //these are used if has_vectors = false - int64 pool_dims_i = 10; - int64 pool_pads_i = 11; - int64 pool_strides_i = 12; - - //pool_mode should be one of: max, average, average_no_pad - //see: lbann/include/lbann/lbann_base.hpp - string pool_mode = 7; -} - -message Unpooling { - int64 num_dims = 1; - string pooling_layer = 13; //should be name of the pooling layer -} - - -message Concatenation { - int64 axis = 1; -} - -message Slice { - int64 axis = 1; - string slice_points = 2; //should be space-separated list of ints, e.g, "2 6 7" - //the following is for jag_conduit_hdf5; - string get_slice_points_from_reader = 4; - bool get_slice_points_from_reader_bool = 5; -} - -message Split { -} - -message Sum { -} - -message WeightedSum { - string scaling_factors = 1; //should be a space-separated list of doubles, e.g. 
"1.0 2.0 -1.0" -} - -message Hadamard { -} - -message Constant { - double value=1; - string num_neurons=2; -} - -message Reduction { - string mode=1; //"sum" or "average" -} - -message Evaluation { -} - -message Gaussian { - double mean = 1; - double stdev = 2; - string neuron_dims = 3; -} - -message Bernoulli { - double prob = 1; - string neuron_dims = 2; -} - -message Uniform { - double min = 1; - double max = 2; - string neuron_dims = 3; -} - - -message Crop { - string dims = 3; -} - -message CategoricalRandom { -} - -message DiscreteRandom { - string values = 1; - string dims = 2; -} - -message Dummy { -} - -message StopGradient { -} - -message InTopK { - int64 k = 1; -} - -message Sort { - bool descending = 1; -} - -message WeightsLayer { - string dims = 1; -} - -message Tessellate { - string dims = 1; -} - -///////////////////// -// Learning layers // -///////////////////// -message FullyConnected { - int64 num_neurons = 1; - string weight_initialization = 2; //DEPRECATED - bool has_bias = 3; //default: true - double bias_initial_value = 4; //default: 0 - double l2_regularization_factor = 5; //default: 0 - double group_lasso_regularization_factor = 6; //default: 0 - bool transpose = 7; - bool num_neurons_is_num_labels = 8; - - bool get_input_dimension_from_reader = 9; - bool get_image_and_scalar_dimension_from_reader = 10; - bool get_image_dimension_from_reader = 11; - bool get_scalar_dimension_from_reader = 12; - repeated uint32 get_num_neurons_of_slice_from_reader = 13; - string get_slice_points_from_reader = 14; -} - -message Convolution { - int64 num_dims = 1; - int64 num_output_channels = 4; - int64 num_groups = 3; - - bool has_vectors = 2; - - // these are used if has_vector = true - string conv_dims = 5; //should be space-separated list, e.g, "2 2 3" - string conv_pads = 6; //should be space-separated list, e.g, "2 2 3" - string conv_strides = 7; //should be space-separated list, e.g, "2 2 3" - string conv_dilations = 8; //should be space-separated list, e.g. "2 3 3" - - // these are used if has_vector = false - int64 conv_dims_i = 50; - int64 conv_pads_i = 60; - int64 conv_strides_i = 70; - int64 conv_dilations_i = 80; - - string weight_initialization = 9; //DEPRECATED - bool has_bias = 10; //default: true - double bias_initial_value = 11; //default: 0 - double l2_regularization_factor = 12; //default: 0 -} - -message Deconvolution { - int64 num_dims = 1; - int64 num_output_channels = 4; - int64 num_groups = 3; - - bool has_vectors = 2; - - // these are used if has_vector = true - string conv_dims = 5; //should be space-separated list, e.g, "2 2 3" - string conv_pads = 6; //should be space-separated list, e.g, "2 2 3" - string conv_strides = 7; //should be space-separated list, e.g, "2 2 3" - string conv_dilations = 8; //should be space-separated list, e.g. 
"2 3 3" - - // these are used if has_vector = false - int64 conv_dims_i = 50; - int64 conv_pads_i = 60; - int64 conv_strides_i = 70; - int64 conv_dilations_i = 80; - - string weight_initialization = 9; //DEPRECATED - bool has_bias = 10; //default: true - double bias_initial_value = 11; //default: 0 - double l2_regularization_factor = 12; //default: 0 -} - -message Embedding { - int64 dictionary_size = 1; - int64 embedding_size = 2; -} - -message ChannelwiseScaleBias {} -message EntrywiseScaleBias {} - -////////////////// -// Image layers // -////////////////// -message BilinearResize { - int64 height = 1; - int64 width = 2; -} - -////////////////////////// -// Miscellaneous layers // -////////////////////////// -message Covariance { - bool biased = 1; //Whether to use a biased covariance estimate -} -message Variance { - bool biased = 1; //Whether to use a biased variance estimate -} -message ChannelwiseMean {} -message MiniBatchIndex {} -message MiniBatchSize {} diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 9e978a8d08e..43b050b3a7e 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -45,33 +45,6 @@ namespace lbann { -bool has_motifs(const lbann_comm& comm, const lbann_data::LbannPB& p) { - const bool master = comm.am_world_master(); - if (master) { - std::cout << "starting has_motifs\n"; - } - const lbann_data::Model& m = p.model(); - const int num_layers = m.layer_size(); - for (int j=0; j Date: Tue, 30 Jul 2019 18:09:59 -0700 Subject: [PATCH 174/634] move optimizers and weights to their own protobuf files --- src/proto/CMakeLists.txt | 8 +++- src/proto/lbann.proto | 91 +------------------------------------- src/proto/optimizers.proto | 71 +++++++++++++++++++++++++++++ src/proto/weights.proto | 72 ++++++++++++++++++++++++++++++ 4 files changed, 152 insertions(+), 90 deletions(-) create mode 100644 src/proto/optimizers.proto create mode 100644 src/proto/weights.proto diff --git a/src/proto/CMakeLists.txt b/src/proto/CMakeLists.txt index e4075fba2ef..80b404876d1 100644 --- a/src/proto/CMakeLists.txt +++ b/src/proto/CMakeLists.txt @@ -6,7 +6,13 @@ if (LBANN_HAS_PROTOBUF) # implementation of "protobuf_generate_cpp" but it gives us a custom # command on which we can depend. Using this, when lbann.proto is # touched, CMake will rebuild the LbannProto library. 
- set_full_path(PROTO_INPUTS lbann.proto callbacks.proto layers.proto) + set_full_path(PROTO_INPUTS + lbann.proto + callbacks.proto + layers.proto + optimizers.proto + weights.proto + ) foreach (proto IN LISTS PROTO_INPUTS) get_filename_component(name "${proto}" NAME_WE) diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 611f2a5b349..848cf680dc7 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -30,6 +30,8 @@ package lbann_data; import "callbacks.proto"; import "layers.proto"; +import "optimizers.proto"; +import "weights.proto"; message LbannPB { DataReader data_reader = 1; @@ -381,92 +383,3 @@ message LayerMetric { string name = 2; string unit = 3; } - -//======================================================================== -// Optimizers -//======================================================================== -message Optimizer { - oneof optimizer_type { - AdaGrad adagrad = 1; - Adam adam = 2; - HypergradientAdam hypergradient_adam = 3; - RMSprop rmsprop = 4; - SGD sgd = 5; - } -} - -message AdaGrad { - double learn_rate = 1; - double eps = 2; // Suggested: 1e-8 -} - -message Adam { - double learn_rate = 1; - double beta1 = 6; // Suggested: 0.9 - double beta2 = 7; // Suggested: 0.99 - double eps = 8; // Suggested: 1e-8 -} - -message HypergradientAdam { - double init_learning_rate = 1; - double hyper_learning_rate = 2; // Suggested: 1e-7 - double beta1 = 6; // Suggested: 0.9 - double beta2 = 7; // Suggested: 0.99 - double eps = 8; // Suggested: 1e-8 -} - -message RMSprop { - double learn_rate = 1; - double decay_rate = 2; - double eps = 3; // Suggested: 1e-8 -} - -message SGD { - double learn_rate = 1; - double momentum = 2; // Set to zero for vanilla SGD - bool nesterov = 4; -} - -//======================================================================== -// Weights -//======================================================================== - -message Weights { - - string name = 1; - Optimizer optimizer = 2; - - ConstantInitializer constant_initializer = 20; - ValueInitializer value_initializer = 21; - UniformInitializer uniform_initializer = 22; - NormalInitializer normal_initializer = 23; - GlorotNormalInitializer glorot_normal_initializer = 24; - GlorotUniformInitializer glorot_uniform_initializer = 25; - HeNormalInitializer he_normal_initializer = 26; - HeUniformInitializer he_uniform_initializer = 27; - LeCunNormalInitializer lecun_normal_initializer = 28; - LeCunUniformInitializer lecun_uniform_initializer = 29; - -} - -// Weight initializers -message ConstantInitializer { - double value = 1; -} -message ValueInitializer { - string values = 1; -} -message UniformInitializer { - double min = 1; - double max = 2; -} -message NormalInitializer { - double mean = 1; - double standard_deviation = 2; -} -message GlorotNormalInitializer {} -message GlorotUniformInitializer {} -message HeNormalInitializer {} -message HeUniformInitializer {} -message LeCunNormalInitializer {} -message LeCunUniformInitializer {} diff --git a/src/proto/optimizers.proto b/src/proto/optimizers.proto new file mode 100644 index 00000000000..c914efd26d5 --- /dev/null +++ b/src/proto/optimizers.proto @@ -0,0 +1,71 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +syntax = "proto3"; + +package lbann_data; + +message Optimizer { + oneof optimizer_type { + AdaGrad adagrad = 1; + Adam adam = 2; + HypergradientAdam hypergradient_adam = 3; + RMSprop rmsprop = 4; + SGD sgd = 5; + } + + message AdaGrad { + double learn_rate = 1; + double eps = 2; // Suggested: 1e-8 + } + + message Adam { + double learn_rate = 1; + double beta1 = 6; // Suggested: 0.9 + double beta2 = 7; // Suggested: 0.99 + double eps = 8; // Suggested: 1e-8 + } + + message HypergradientAdam { + double init_learning_rate = 1; + double hyper_learning_rate = 2; // Suggested: 1e-7 + double beta1 = 6; // Suggested: 0.9 + double beta2 = 7; // Suggested: 0.99 + double eps = 8; // Suggested: 1e-8 + } + + message RMSprop { + double learn_rate = 1; + double decay_rate = 2; + double eps = 3; // Suggested: 1e-8 + } + + message SGD { + double learn_rate = 1; + double momentum = 2; // Set to zero for vanilla SGD + bool nesterov = 4; + } +} \ No newline at end of file diff --git a/src/proto/weights.proto b/src/proto/weights.proto new file mode 100644 index 00000000000..2fe3e46ad2f --- /dev/null +++ b/src/proto/weights.proto @@ -0,0 +1,72 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
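
Editor's note: besides moving the optimizer messages into their own file, patch 174 nests the concrete types (AdaGrad, Adam, HypergradientAdam, RMSprop, SGD) inside Optimizer itself, next to the existing oneof optimizer_type. In the generated Python they therefore become inner classes of Optimizer rather than module-level classes. A minimal sketch under that assumption; the module name optimizers_pb2 comes from the imports added in patch 176:

    from lbann import optimizers_pb2

    opt = optimizers_pb2.Optimizer()
    opt.sgd.learn_rate = 0.01   # touching a field selects the SGD variant of the oneof
    opt.sgd.momentum = 0.9      # set to zero for vanilla SGD, per the comment above
    opt.sgd.nesterov = True
    assert opt.WhichOneof('optimizer_type') == 'sgd'

    # Nested message classes are reached through the parent class:
    defaults = optimizers_pb2.Optimizer.SGD()
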
+//////////////////////////////////////////////////////////////////////////////// + +syntax = "proto3"; + +import "optimizers.proto"; + +package lbann_data; + +message Weights { + + string name = 1; + Optimizer optimizer = 2; + + oneof initializer_type { + ConstantInitializer constant_initializer = 20; + ValueInitializer value_initializer = 21; + UniformInitializer uniform_initializer = 22; + NormalInitializer normal_initializer = 23; + GlorotNormalInitializer glorot_normal_initializer = 24; + GlorotUniformInitializer glorot_uniform_initializer = 25; + HeNormalInitializer he_normal_initializer = 26; + HeUniformInitializer he_uniform_initializer = 27; + LeCunNormalInitializer lecun_normal_initializer = 28; + LeCunUniformInitializer lecun_uniform_initializer = 29; + } + + // Weight initializers + message ConstantInitializer { + double value = 1; + } + message ValueInitializer { + string values = 1; + } + message UniformInitializer { + double min = 1; + double max = 2; + } + message NormalInitializer { + double mean = 1; + double standard_deviation = 2; + } + message GlorotNormalInitializer {} + message GlorotUniformInitializer {} + message HeNormalInitializer {} + message HeUniformInitializer {} + message LeCunNormalInitializer {} + message LeCunUniformInitializer {} +} \ No newline at end of file From 1ca39d7ff8f66e446b418c46cd5874d1db422d3c Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Wed, 31 Jul 2019 08:50:49 -0700 Subject: [PATCH 175/634] split remaining protobuf messages into separate files --- src/proto/CMakeLists.txt | 5 + src/proto/lbann.proto | 349 +--------------------------- src/proto/metrics.proto | 40 ++++ src/proto/model.proto | 69 ++++++ src/proto/objective_functions.proto | 45 ++++ src/proto/reader.proto | 159 +++++++++++++ src/proto/transforms.proto | 167 +++++++++++++ 7 files changed, 487 insertions(+), 347 deletions(-) create mode 100644 src/proto/metrics.proto create mode 100644 src/proto/model.proto create mode 100644 src/proto/objective_functions.proto create mode 100644 src/proto/reader.proto create mode 100644 src/proto/transforms.proto diff --git a/src/proto/CMakeLists.txt b/src/proto/CMakeLists.txt index 80b404876d1..e719ff46f78 100644 --- a/src/proto/CMakeLists.txt +++ b/src/proto/CMakeLists.txt @@ -10,7 +10,12 @@ if (LBANN_HAS_PROTOBUF) lbann.proto callbacks.proto layers.proto + metrics.proto + model.proto + objective_functions.proto optimizers.proto + reader.proto + transforms.proto weights.proto ) diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 848cf680dc7..5e26d3b0a11 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -28,10 +28,9 @@ syntax = "proto3"; package lbann_data; -import "callbacks.proto"; -import "layers.proto"; +import "reader.proto"; +import "model.proto"; import "optimizers.proto"; -import "weights.proto"; message LbannPB { DataReader data_reader = 1; @@ -39,347 +38,3 @@ message LbannPB { Optimizer optimizer = 3; DataSetMetaData data_set_metadata = 5; } - -//======================================================================== -// DataReaders -//======================================================================== -message DataReader { - int64 max_par_io_size = 1; - repeated Reader reader = 2; - bool requires_data_set_metadata = 3; -} - -message Reader { - string name = 1; //mnist, nci, nci_regression, numpy, imagenet, synthetic, merge_samples - string role = 3; //train, validation, test - bool shuffle = 4; - string data_filedir = 5; - string data_local_filedir = 50; //to support data_store - string 
data_filename = 6; - string label_filename = 7; - string index_list = 8; - double validation_percent = 9; - int64 absolute_sample_count = 11; - int64 first_n = 200; - double percent_of_data_to_use = 12; - - //for GAN model - bool gan_labelling = 201; - int32 gan_label_value = 202; - - int32 num_labels = 99; //for imagenet and synthetic - int64 num_samples = 100; //only for synthetic - string synth_dimensions = 101; //only for synthetic - string synth_response_dimensions = 115; //only for synthetic - //csv attributes - string separator = 102; - int32 skip_cols = 103; - int32 skip_rows = 104; - bool has_header = 105; - int32 label_col = 106; - int32 response_col = 107; - bool disable_labels = 108; - bool disable_responses = 109; - string format = 110; // numpy, csv - string data_file_pattern = 111; - int64 num_neighbors = 112; // pilot2_molecular_reader - int64 max_neighborhood = 113; // pilot2_molecular_reader - int32 num_image_srcs = 114; // data_reader_multi_images - float scaling_factor_int16 = 116; // for numpy_npz_reader with int16 data - - int32 max_files_to_load = 1000; - - //------------- start of only for partitioned data sets ------------------ - bool is_partitioned = 300; - double partition_overlap = 301; - int32 partition_mode = 302; - // 1 - share a portion of your data with two neighbors; - // 2 - there's a set of overlap indices that are common to all models - //------------- end of only for partitioned data sets ------------------ - - //------------- start of only for index lists ------------------ - bool index_list_per_trainer = 400; - bool index_list_per_model = 401; - //------------- end of only for index lists ------------------ - - PythonDataReader python = 501; - - repeated Transform transforms = 600; // Ordered list of transforms to apply. -} - -message PythonDataReader { - string module = 1; // Python module - string module_dir = 2; // Directory containing Python module - string sample_function = 3; // Function that gets data sample - string num_samples_function = 4; // Function that gets number of data samples - string sample_dims_function = 5; // Function that gets dimensions of data sample -} - -// Preprocessing transforms. -message Transform { - // Transforms that apply to LBANN data. - // Normalize channel-wise with mean and standard deviation. - message Normalize { - string means = 1; - string stddevs = 2; - } - // Normalize each sample to have mean 0, standard deviation 1. - message SampleNormalize {} - // Scale by a constant. - message Scale { - float scale = 1; - } - - // Transforms that apply to images. - // Adjust the brightness of an image. - message AdjustBrightness { - float factor = 1; - } - // Adjust the contrast of an image. - message AdjustContrast { - float factor = 1; - } - // Adjust the saturation of an image. - message AdjustSaturation { - float factor = 1; - } - // Crop of size height x width from the center. - message CenterCrop { - uint64 height = 1; - uint64 width = 2; - } - // Convert to color. - message Colorize {} - // Randomly jitter brightness/contrast/saturation. - message ColorJitter { - float min_brightness_factor = 1; - float max_brightness_factor = 2; - float min_contrast_factor = 3; - float max_contrast_factor = 4; - float min_saturation_factor = 5; - float max_saturation_factor = 6; - } - // Apply cutout augmentation. - message Cutout { - uint64 num_holes = 1; - uint64 length = 2; - } - // Convert to grayscale. - message Grayscale {} - // Horizontal flip with probability p. 
- message HorizontalFlip { - float p = 1; - } - // Fused Normalize + ToLBANNLayout. - message NormalizeToLBANNLayout { - string means = 1; - string stddevs = 2; - } - // Apply a random affine transform. - message RandomAffine { - float rotate_min = 1; - float rotate_max = 2; - float translate_h = 3; - float translate_w = 4; - float scale_min = 5; - float scale_max = 6; - float shear_min = 7; - float shear_max = 8; - } - // Crop of size height x width from a random location. - message RandomCrop { - uint64 height = 1; - uint64 width = 2; - } - // Random crop with scale and aspect ratio augmentation. - message RandomResizedCrop { - uint64 height = 1; - uint64 width = 2; - float scale_min = 3; - float scale_max = 4; - float ar_min = 5; - float ar_max = 6; - } - // Resize to height x width, then randomly crop to crop_height x crop_width. - message RandomResizedCropWithFixedAspectRatio { - uint64 height = 1; - uint64 width = 2; - uint64 crop_height = 3; - uint64 crop_width = 4; - } - // Resize to height x width. - message Resize { - uint64 height = 1; - uint64 width = 2; - } - // Resize to height x width then crop to crop_height x crop_width at the center. - message ResizedCenterCrop { - uint64 height = 1; - uint64 width = 2; - uint64 crop_height = 3; - uint64 crop_width = 4; - } - // Convert from an image to LBANN data. - message ToLBANNLayout { } - // Vertical flip with probability p. - message VerticalFlip { - float p = 1; - } - - oneof a_transform { - // On LBANN data: - Normalize normalize = 1; - SampleNormalize sample_normalize = 2; - Scale scale = 3; - - // On images: - CenterCrop center_crop = 100; - Colorize colorize = 101; - Grayscale grayscale = 102; - HorizontalFlip horizontal_flip = 103; - NormalizeToLBANNLayout normalize_to_lbann_layout = 104; - RandomAffine random_affine = 105; - RandomCrop random_crop = 106; - RandomResizedCrop random_resized_crop = 107; - RandomResizedCropWithFixedAspectRatio random_resized_crop_with_fixed_aspect_ratio = 108; - Resize resize = 109; - ResizedCenterCrop resized_center_crop = 110; - ToLBANNLayout to_lbann_layout = 111; - VerticalFlip vertical_flip = 112; - AdjustBrightness adjust_brightness = 113; - AdjustContrast adjust_contrast = 114; - AdjustSaturation adjust_saturation = 115; - ColorJitter color_jitter = 116; - Cutout cutout = 117; - } -} - -//======================================================================== -// Metadata for a Data set -//======================================================================== -message DataSetMetaData { - message Schema { - string scalar_prefix = 1; - string image_prefix = 2; - string input_prefix = 3; - - uint64 image_height = 11; - uint64 image_width = 12; - uint64 image_num_channels = 13; - - //------------------ start of only for jag_conduit ----------------------- - bool split_jag_image_channels = 89; - repeated string jag_image_keys = 90; - repeated string jag_scalar_keys = 91; - repeated string jag_input_keys = 92; - message JagKeyPrefixFilter { - string key_prefix = 1; - uint32 min_len = 2; - } - repeated string jag_scalar_filters = 93; - repeated JagKeyPrefixFilter jag_scalar_prefix_filters = 94; - repeated string jag_input_filters = 95; - repeated JagKeyPrefixFilter jag_input_prefix_filters = 96; - - enum JAG_Data { - Undefined = 0; - JAG_Image = 1; - JAG_Scalar = 2; - JAG_Input = 3; - } - message JAGDataSlice { - repeated JAG_Data pieces = 1; - } - repeated JAGDataSlice independent = 97; - repeated JAGDataSlice dependent = 98; - //------------------ end of only for jag_conduit 
----------------------- - } - - message Normalization { - //------------------ start of only for jag_conduit ----------------------- - message JagLinearNormalizationParams { - double scale = 1; - double bias = 2; - } - - repeated JagLinearNormalizationParams jag_image_normalization_params = 86; - repeated JagLinearNormalizationParams jag_scalar_normalization_params = 87; - repeated JagLinearNormalizationParams jag_input_normalization_params = 88; - - //------------------ end of only for jag_conduit ----------------------- - } - Schema schema = 1; - Normalization normalization = 2; -} - -//======================================================================== -// Model -//======================================================================== - -message Model { - string type = 1; - string name = 3; - ObjectiveFunction objective_function = 2; - repeated Metric metric = 5; - string data_layout = 6; - bool shareable_training_data_reader = 42; // Can the data reader be shared across multiple models( e.g. GAN) - bool shareable_testing_data_reader = 43; // Can the data reader be shared across multiple models (e.g. GAN) - bool shareable_validation_data_reader = 44; // Can the data reader be shared across multiple models (e.g. GAN) - - int64 mini_batch_size = 12; - int64 num_epochs = 4; - int64 super_steps = 121; //multiple steps/epochs currently use in GAN - int64 num_batches = 122; //multiple batches/sub epoch - int64 block_size = 50; - int64 procs_per_trainer = 51; - int64 num_gpus = 53; //has no effect - int64 evaluation_frequency = 54; - int64 num_parallel_readers = 100; - bool serialize_io = 101; - - bool disable_cuda = 8; - - repeated Layer layer = 10; - - repeated Weights weights = 11; - - repeated Callback callback = 20; - - int64 random_seed = 30; - // If true, models will have their model rank mixed into their random seed. - bool random_init_models_differently = 31; - -} - -//======================================================================== -// Objective function -//======================================================================== - -message ObjectiveFunction { - repeated LayerTerm layer_term = 1; - repeated L2WeightRegularization l2_weight_regularization = 2; -} - -message LayerTerm { - double scale_factor = 1; - string layer = 2; -} - -message L2WeightRegularization { - double scale_factor = 1; - string weights = 2; // If empty, L2 regularization is applied to all weights -} - -//======================================================================== -// Metrics -//======================================================================== - -message Metric { - LayerMetric layer_metric = 11; -} - -message LayerMetric { - string layer = 1; - string name = 2; - string unit = 3; -} diff --git a/src/proto/metrics.proto b/src/proto/metrics.proto new file mode 100644 index 00000000000..2c59ebdc66e --- /dev/null +++ b/src/proto/metrics.proto @@ -0,0 +1,40 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +syntax = "proto3"; + +package lbann_data; + +message Metric { + + message LayerMetric { + string layer = 1; + string name = 2; + string unit = 3; + } + + LayerMetric layer_metric = 11; +} \ No newline at end of file diff --git a/src/proto/model.proto b/src/proto/model.proto new file mode 100644 index 00000000000..0af84070cdd --- /dev/null +++ b/src/proto/model.proto @@ -0,0 +1,69 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +syntax = "proto3"; + +package lbann_data; + +import "callbacks.proto"; +import "layers.proto"; +import "metrics.proto"; +import "objective_functions.proto"; +import "weights.proto"; + +message Model { + string type = 1; + string name = 3; + ObjectiveFunction objective_function = 2; + repeated Metric metric = 5; + string data_layout = 6; + bool shareable_training_data_reader = 42; // Can the data reader be shared across multiple models( e.g. GAN) + bool shareable_testing_data_reader = 43; // Can the data reader be shared across multiple models (e.g. GAN) + bool shareable_validation_data_reader = 44; // Can the data reader be shared across multiple models (e.g. GAN) + + int64 mini_batch_size = 12; + int64 num_epochs = 4; + int64 super_steps = 121; //multiple steps/epochs currently use in GAN + int64 num_batches = 122; //multiple batches/sub epoch + int64 block_size = 50; + int64 procs_per_trainer = 51; + int64 num_gpus = 53; //has no effect + int64 evaluation_frequency = 54; + int64 num_parallel_readers = 100; + bool serialize_io = 101; + + bool disable_cuda = 8; + + repeated Layer layer = 10; + + repeated Weights weights = 11; + + repeated Callback callback = 20; + + int64 random_seed = 30; + // If true, models will have their model rank mixed into their random seed. 
+ bool random_init_models_differently = 31; +} diff --git a/src/proto/objective_functions.proto b/src/proto/objective_functions.proto new file mode 100644 index 00000000000..168482b93e3 --- /dev/null +++ b/src/proto/objective_functions.proto @@ -0,0 +1,45 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +syntax = "proto3"; + +package lbann_data; + +message ObjectiveFunction { + + message LayerTerm { + double scale_factor = 1; + string layer = 2; + } + + message L2WeightRegularization { + double scale_factor = 1; + string weights = 2; // If empty, L2 regularization is applied to all weights + } + + repeated LayerTerm layer_term = 1; + repeated L2WeightRegularization l2_weight_regularization = 2; +} diff --git a/src/proto/reader.proto b/src/proto/reader.proto new file mode 100644 index 00000000000..e06050aacff --- /dev/null +++ b/src/proto/reader.proto @@ -0,0 +1,159 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
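
Editor's note: objective_functions.proto above keeps the same two term types but nests LayerTerm and L2WeightRegularization inside ObjectiveFunction; both stay repeated fields, which the generated Python API populates with add(). A small sketch under that assumption (module name objective_functions_pb2 per patch 176; the layer name is hypothetical):

    from lbann import objective_functions_pb2 as objpb

    obj = objpb.ObjectiveFunction()
    term = obj.layer_term.add()      # an ObjectiveFunction.LayerTerm
    term.layer = 'cross_entropy'     # hypothetical layer name
    term.scale_factor = 1.0
    reg = obj.l2_weight_regularization.add()
    reg.scale_factor = 1e-4          # 'weights' left empty: applies to all weights
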
+//////////////////////////////////////////////////////////////////////////////// + +syntax = "proto3"; + +package lbann_data; + +import "transforms.proto"; + +message DataReader { + int64 max_par_io_size = 1; + repeated Reader reader = 2; + bool requires_data_set_metadata = 3; +} + +message Reader { + string name = 1; //mnist, nci, nci_regression, numpy, imagenet, synthetic, merge_samples + string role = 3; //train, validation, test + bool shuffle = 4; + string data_filedir = 5; + string data_local_filedir = 50; //to support data_store + string data_filename = 6; + string label_filename = 7; + string index_list = 8; + double validation_percent = 9; + int64 absolute_sample_count = 11; + int64 first_n = 200; + double percent_of_data_to_use = 12; + + //for GAN model + bool gan_labelling = 201; + int32 gan_label_value = 202; + + int32 num_labels = 99; //for imagenet and synthetic + int64 num_samples = 100; //only for synthetic + string synth_dimensions = 101; //only for synthetic + string synth_response_dimensions = 115; //only for synthetic + //csv attributes + string separator = 102; + int32 skip_cols = 103; + int32 skip_rows = 104; + bool has_header = 105; + int32 label_col = 106; + int32 response_col = 107; + bool disable_labels = 108; + bool disable_responses = 109; + string format = 110; // numpy, csv + string data_file_pattern = 111; + int64 num_neighbors = 112; // pilot2_molecular_reader + int64 max_neighborhood = 113; // pilot2_molecular_reader + int32 num_image_srcs = 114; // data_reader_multi_images + float scaling_factor_int16 = 116; // for numpy_npz_reader with int16 data + + int32 max_files_to_load = 1000; + + //------------- start of only for partitioned data sets ------------------ + bool is_partitioned = 300; + double partition_overlap = 301; + int32 partition_mode = 302; + // 1 - share a portion of your data with two neighbors; + // 2 - there's a set of overlap indices that are common to all models + //------------- end of only for partitioned data sets ------------------ + + //------------- start of only for index lists ------------------ + bool index_list_per_trainer = 400; + bool index_list_per_model = 401; + //------------- end of only for index lists ------------------ + + PythonDataReader python = 501; + + repeated Transform transforms = 600; // Ordered list of transforms to apply. 
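
Editor's note on the transforms field above: it is the hook the reader pipeline uses, and since it is a repeated Transform, the transforms are applied in insertion order. A hedged sketch of filling it from Python; the module name reader_pb2 is an assumption derived from protoc's default naming for reader.proto (it does not appear in the patch 176 import list), and the crop sizes are illustrative only:

    from lbann import reader_pb2   # assumed module name, per protoc's <file>_pb2 convention

    r = reader_pb2.Reader()
    r.name = 'imagenet'
    r.role = 'train'
    r.shuffle = True
    t = r.transforms.add()               # applied first
    t.random_resized_crop.height = 224   # hypothetical crop size
    t.random_resized_crop.width = 224
    t2 = r.transforms.add()              # applied second
    t2.horizontal_flip.p = 0.5
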
+} + +message PythonDataReader { + string module = 1; // Python module + string module_dir = 2; // Directory containing Python module + string sample_function = 3; // Function that gets data sample + string num_samples_function = 4; // Function that gets number of data samples + string sample_dims_function = 5; // Function that gets dimensions of data sample +} + +message DataSetMetaData { + message Schema { + string scalar_prefix = 1; + string image_prefix = 2; + string input_prefix = 3; + + uint64 image_height = 11; + uint64 image_width = 12; + uint64 image_num_channels = 13; + + //------------------ start of only for jag_conduit ----------------------- + bool split_jag_image_channels = 89; + repeated string jag_image_keys = 90; + repeated string jag_scalar_keys = 91; + repeated string jag_input_keys = 92; + message JagKeyPrefixFilter { + string key_prefix = 1; + uint32 min_len = 2; + } + repeated string jag_scalar_filters = 93; + repeated JagKeyPrefixFilter jag_scalar_prefix_filters = 94; + repeated string jag_input_filters = 95; + repeated JagKeyPrefixFilter jag_input_prefix_filters = 96; + + enum JAG_Data { + Undefined = 0; + JAG_Image = 1; + JAG_Scalar = 2; + JAG_Input = 3; + } + message JAGDataSlice { + repeated JAG_Data pieces = 1; + } + repeated JAGDataSlice independent = 97; + repeated JAGDataSlice dependent = 98; + //------------------ end of only for jag_conduit ----------------------- + } + + message Normalization { + //------------------ start of only for jag_conduit ----------------------- + message JagLinearNormalizationParams { + double scale = 1; + double bias = 2; + } + + repeated JagLinearNormalizationParams jag_image_normalization_params = 86; + repeated JagLinearNormalizationParams jag_scalar_normalization_params = 87; + repeated JagLinearNormalizationParams jag_input_normalization_params = 88; + + //------------------ end of only for jag_conduit ----------------------- + } + + Schema schema = 1; + Normalization normalization = 2; +} diff --git a/src/proto/transforms.proto b/src/proto/transforms.proto new file mode 100644 index 00000000000..c398595719e --- /dev/null +++ b/src/proto/transforms.proto @@ -0,0 +1,167 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +syntax = "proto3"; + +package lbann_data; + +message Transform { + // Transforms that apply to LBANN data. + // Normalize channel-wise with mean and standard deviation. 
+ message Normalize { + string means = 1; + string stddevs = 2; + } + // Normalize each sample to have mean 0, standard deviation 1. + message SampleNormalize {} + // Scale by a constant. + message Scale { + float scale = 1; + } + + // Transforms that apply to images. + // Adjust the brightness of an image. + message AdjustBrightness { + float factor = 1; + } + // Adjust the contrast of an image. + message AdjustContrast { + float factor = 1; + } + // Adjust the saturation of an image. + message AdjustSaturation { + float factor = 1; + } + // Crop of size height x width from the center. + message CenterCrop { + uint64 height = 1; + uint64 width = 2; + } + // Convert to color. + message Colorize {} + // Randomly jitter brightness/contrast/saturation. + message ColorJitter { + float min_brightness_factor = 1; + float max_brightness_factor = 2; + float min_contrast_factor = 3; + float max_contrast_factor = 4; + float min_saturation_factor = 5; + float max_saturation_factor = 6; + } + // Apply cutout augmentation. + message Cutout { + uint64 num_holes = 1; + uint64 length = 2; + } + // Convert to grayscale. + message Grayscale {} + // Horizontal flip with probability p. + message HorizontalFlip { + float p = 1; + } + // Fused Normalize + ToLBANNLayout. + message NormalizeToLBANNLayout { + string means = 1; + string stddevs = 2; + } + // Apply a random affine transform. + message RandomAffine { + float rotate_min = 1; + float rotate_max = 2; + float translate_h = 3; + float translate_w = 4; + float scale_min = 5; + float scale_max = 6; + float shear_min = 7; + float shear_max = 8; + } + // Crop of size height x width from a random location. + message RandomCrop { + uint64 height = 1; + uint64 width = 2; + } + // Random crop with scale and aspect ratio augmentation. + message RandomResizedCrop { + uint64 height = 1; + uint64 width = 2; + float scale_min = 3; + float scale_max = 4; + float ar_min = 5; + float ar_max = 6; + } + // Resize to height x width, then randomly crop to crop_height x crop_width. + message RandomResizedCropWithFixedAspectRatio { + uint64 height = 1; + uint64 width = 2; + uint64 crop_height = 3; + uint64 crop_width = 4; + } + // Resize to height x width. + message Resize { + uint64 height = 1; + uint64 width = 2; + } + // Resize to height x width then crop to crop_height x crop_width at the center. + message ResizedCenterCrop { + uint64 height = 1; + uint64 width = 2; + uint64 crop_height = 3; + uint64 crop_width = 4; + } + // Convert from an image to LBANN data. + message ToLBANNLayout { } + // Vertical flip with probability p. 
+ message VerticalFlip { + float p = 1; + } + + oneof transform_type { + // On LBANN data: + Normalize normalize = 1; + SampleNormalize sample_normalize = 2; + Scale scale = 3; + + // On images: + CenterCrop center_crop = 100; + Colorize colorize = 101; + Grayscale grayscale = 102; + HorizontalFlip horizontal_flip = 103; + NormalizeToLBANNLayout normalize_to_lbann_layout = 104; + RandomAffine random_affine = 105; + RandomCrop random_crop = 106; + RandomResizedCrop random_resized_crop = 107; + RandomResizedCropWithFixedAspectRatio random_resized_crop_with_fixed_aspect_ratio = 108; + Resize resize = 109; + ResizedCenterCrop resized_center_crop = 110; + ToLBANNLayout to_lbann_layout = 111; + VerticalFlip vertical_flip = 112; + AdjustBrightness adjust_brightness = 113; + AdjustContrast adjust_contrast = 114; + AdjustSaturation adjust_saturation = 115; + ColorJitter color_jitter = 116; + Cutout cutout = 117; + } +} From dded259c22ef23bb3fb3506b92c150f12aa4067d Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Wed, 31 Jul 2019 09:02:04 -0700 Subject: [PATCH 176/634] update python front-end for new proto layout --- python/lbann/__init__.py | 2 +- python/lbann/layer.py | 6 +++--- python/lbann/metric.py | 4 ++-- python/lbann/model.py | 4 ++-- python/lbann/objective_function.py | 8 ++++---- python/lbann/optimizer.py | 6 +++--- python/lbann/weights.py | 8 ++++---- 7 files changed, 19 insertions(+), 19 deletions(-) diff --git a/python/lbann/__init__.py b/python/lbann/__init__.py index 5036777b488..89a48cedc11 100644 --- a/python/lbann/__init__.py +++ b/python/lbann/__init__.py @@ -19,7 +19,7 @@ _lbann_exe = _config['Paths']['lbann_exe'] except: pass -import lbann_pb2, callbacks_pb2 +import lbann_pb2, callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, weights_pb2 def lbann_exe(): """LBANN executable.""" return _lbann_exe if _lbann_exe else 'lbann' diff --git a/python/lbann/layer.py b/python/lbann/layer.py index 14083b47be2..a289e7ca3e5 100644 --- a/python/lbann/layer.py +++ b/python/lbann/layer.py @@ -1,6 +1,6 @@ """Neural network tensor operations.""" import abc -from lbann import lbann_pb2 +from lbann import layers_pb2 from lbann.util import make_iterable import lbann.util.class_generator @@ -52,7 +52,7 @@ def __init__(self, def export_proto(self): """Construct and return a protobuf message.""" - proto = lbann_pb2.Layer() + proto = layers_pb2.Layer() proto.parents = ' '.join([l.name for l in self.parents]) proto.children = ' '.join([l.name for l in self.children]) proto.weights = ' '.join([w.name for w in self.weights]) @@ -93,7 +93,7 @@ def __call__(self, parent): # Note: The list of skip fields must be updated if any new fields are # added to the Layer message in lbann.proto classes = lbann.util.class_generator.generate_classes_from_protobuf_message( - lbann_pb2.Layer, + layers_pb2.Layer, skip_fields = set([ 'name', 'parents', 'children', 'data_layout', 'device_allocation', 'weights', 'num_neurons_from_data_reader', 'freeze', 'hint_layer', diff --git a/python/lbann/metric.py b/python/lbann/metric.py index ebdf4a83e28..e9528659def 100644 --- a/python/lbann/metric.py +++ b/python/lbann/metric.py @@ -1,6 +1,6 @@ """Neural network tensor operations.""" import abc -from lbann import lbann_pb2 +from lbann import metrics_pb2 class Metric: """Metric that takes value from a layer. 
@@ -18,7 +18,7 @@ def __init__(self, layer, name=None, unit=''): def export_proto(self): """Construct and return a protobuf message.""" - proto = lbann_pb2.Metric() + proto = metrics_pb2.Metric() proto.layer_metric.layer = self.layer.name proto.layer_metric.name = self.name proto.layer_metric.unit = self.unit diff --git a/python/lbann/model.py b/python/lbann/model.py index ac22b840928..546c9049cdf 100644 --- a/python/lbann/model.py +++ b/python/lbann/model.py @@ -1,6 +1,6 @@ """Neural network model.""" import abc -from lbann import lbann_pb2 +from lbann import model_pb2 from lbann.util import make_iterable import lbann.layer import lbann.objective_function @@ -44,7 +44,7 @@ def __init__(self, mini_batch_size, epochs, def export_proto(self): """Construct and return a protobuf message.""" # Initialize protobuf message - model = lbann_pb2.Model() + model = model_pb2.Model() model.mini_batch_size = self.mini_batch_size model.num_epochs = self.epochs model.block_size = self.block_size diff --git a/python/lbann/objective_function.py b/python/lbann/objective_function.py index 6e30532bf2a..473a25632a3 100644 --- a/python/lbann/objective_function.py +++ b/python/lbann/objective_function.py @@ -1,5 +1,5 @@ import abc -from lbann import lbann_pb2 +from lbann import objective_functions_pb2 from lbann.util import make_iterable import lbann.layer @@ -19,7 +19,7 @@ def __init__(self, layer, scale=1.0): def export_proto(self): """Construct and return a protobuf message.""" - proto = lbann_pb2.LayerTerm() + proto = objective_functions_pb2.ObjectiveFunction.LayerTerm() proto.layer = self.layer.name proto.scale_factor = self.scale return proto @@ -33,7 +33,7 @@ def __init__(self, weights=[], scale=1.0): def export_proto(self): """Construct and return a protobuf message.""" - proto = lbann_pb2.L2WeightRegularization() + proto = objective_functions_pb2.ObjectiveFunction.L2WeightRegularization() proto.scale_factor = self.scale proto.weights = ' '.join([w.name for w in self.weights]) return proto @@ -65,7 +65,7 @@ def add_term(self, term): def export_proto(self): """Construct and return a protobuf message.""" - proto = lbann_pb2.ObjectiveFunction() + proto = objective_functions_pb2.ObjectiveFunction() for term in self.terms: term_message = term.export_proto() if type(term) is LayerTerm: diff --git a/python/lbann/optimizer.py b/python/lbann/optimizer.py index 061d002baff..fd5a9ab402e 100644 --- a/python/lbann/optimizer.py +++ b/python/lbann/optimizer.py @@ -1,16 +1,16 @@ -from lbann import lbann_pb2 +from lbann import optimizers_pb2 import lbann.util.class_generator class Optimizer: def export_proto(self): """Construct and return a protobuf message.""" - return lbann_pb2.Optimizer() + return optimizers_pb2.Optimizer() # Generate Optimizer sub-classes from lbann.proto # Note: The list of skip fields must be updated if any new fields are # added to the Optimizer message in lbann.proto classes = lbann.util.class_generator.generate_classes_from_protobuf_message( - lbann_pb2.Optimizer, + optimizers_pb2.Optimizer, base_class = Optimizer, base_has_export_proto = True) for c in classes: diff --git a/python/lbann/weights.py b/python/lbann/weights.py index df902a6ccd4..301f7bd00a5 100644 --- a/python/lbann/weights.py +++ b/python/lbann/weights.py @@ -1,6 +1,6 @@ """Trainable model parameters.""" import abc -from lbann import lbann_pb2 +from lbann import weights_pb2 import lbann.util.class_generator class Initializer(abc.ABC): @@ -12,7 +12,7 @@ def export_proto(self): # Note: The list of skip fields must be updated if any 
new fields are # added to the Weights message in lbann.proto classes = lbann.util.class_generator.generate_classes_from_protobuf_message( - lbann_pb2.Weights, + weights_pb2.Weights, skip_fields = set(['name', 'optimizer']), base_class = Initializer) for c in classes: @@ -31,14 +31,14 @@ def __init__(self, initializer=None, optimizer=None, name=None): def export_proto(self): """Construct and return a protobuf message.""" - proto = lbann_pb2.Weights() + proto = weights_pb2.Weights() proto.name = self.name # Set initializer if needed if self.initializer: type_name = type(self.initializer).__name__ field_name = None - for field in lbann_pb2.Weights.DESCRIPTOR.fields: + for field in weights_pb2.Weights.DESCRIPTOR.fields: if field.message_type and field.message_type.name == type_name: field_name = field.name break From 581c47f785fce3d4836c5cd7cce2ac183b48338a Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Wed, 31 Jul 2019 11:34:04 -0700 Subject: [PATCH 177/634] change the includes to use the most specific pb header possible --- include/lbann/callbacks/callback.hpp | 11 ++- .../callbacks/callback_check_gradients.hpp | 2 + .../lbann/callbacks/callback_perturb_adam.hpp | 1 + .../lbann/callbacks/callback_save_model.hpp | 7 +- include/lbann/layers/layer.hpp | 6 +- include/lbann/models/model.hpp | 7 +- include/lbann/proto/factories.hpp | 10 +++ .../lbann/proto/init_image_data_readers.hpp | 6 ++ include/lbann/proto/proto_common.hpp | 25 ++++--- include/lbann/utils/protobuf_utils.hpp | 9 ++- include/lbann/weights/weights.hpp | 13 ++-- model_zoo/lbann.cpp | 5 +- model_zoo/lbann2.cpp | 5 ++ model_zoo/lbann_aecycgan.cpp | 4 ++ model_zoo/lbann_cycgan.cpp | 4 ++ model_zoo/lbann_gan.cpp | 4 ++ model_zoo/lbann_inf.cpp | 10 ++- src/callbacks/callback_check_gradients.cpp | 10 ++- src/callbacks/callback_check_metric.cpp | 11 ++- src/callbacks/callback_checkpoint.cpp | 7 ++ src/callbacks/callback_confusion_matrix.cpp | 8 +++ src/callbacks/callback_debug_io.cpp | 7 ++ src/callbacks/callback_dump_weights.cpp | 6 +- src/callbacks/callback_early_stopping.cpp | 4 ++ src/callbacks/callback_imcomm.cpp | 30 ++++---- src/callbacks/callback_io.cpp | 2 + src/callbacks/callback_learning_rate.cpp | 2 + src/callbacks/callback_ltfb.cpp | 7 +- src/callbacks/callback_perturb_adam.cpp | 12 +++- src/callbacks/callback_perturb_dropout.cpp | 4 +- src/callbacks/callback_print.cpp | 12 +++- src/callbacks/callback_replace_weights.cpp | 5 ++ src/callbacks/callback_save_model.cpp | 10 ++- src/callbacks/callback_save_topk_models.cpp | 9 ++- src/callbacks/callback_summary.cpp | 7 ++ src/callbacks/callback_sync_layers.cpp | 4 ++ src/callbacks/callback_sync_selected.cpp | 11 ++- src/callbacks/callback_timeline.cpp | 10 ++- src/callbacks/callback_variable_minibatch.cpp | 8 ++- src/callbacks/profiler.cpp | 7 +- src/layers/layer.cpp | 6 +- src/models/model.cpp | 8 ++- src/proto/factories/callback_factory.cpp | 4 +- src/proto/factories/layer_factory.cpp | 70 +++++++++++++++++++ src/proto/factories/layer_graph_factory.cpp | 10 +++ src/proto/factories/model_factory.cpp | 9 +++ .../factories/objective_function_factory.cpp | 5 ++ src/proto/factories/optimizer_factory.cpp | 9 +++ src/proto/factories/transform_factory.cpp | 7 +- src/proto/factories/weights_factory.cpp | 6 ++ src/proto/init_image_data_readers.cpp | 16 ++++- src/proto/proto_common.cpp | 3 + src/utils/lbann_library.cpp | 8 ++- src/utils/protobuf_utils.cpp | 2 + src/weights/weights.cpp | 10 ++- tests/test_shuffled_indices.cpp | 5 ++ 56 files changed, 430 insertions(+), 70 
deletions(-) diff --git a/include/lbann/callbacks/callback.hpp b/include/lbann/callbacks/callback.hpp index b920998ca85..10741442289 100644 --- a/include/lbann/callbacks/callback.hpp +++ b/include/lbann/callbacks/callback.hpp @@ -29,10 +29,15 @@ #ifndef __LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED #define __LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED -#include "lbann/base.hpp" -#include "lbann/utils/summary.hpp" -#include "lbann/models/model.hpp" #include "lbann/layers/layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/memory.hpp" +#include "lbann/utils/summary.hpp" + +#include + +#include +#include // A utility macro for easily adding default-constructed sub-class // builders. diff --git a/include/lbann/callbacks/callback_check_gradients.hpp b/include/lbann/callbacks/callback_check_gradients.hpp index 1a46d7ea986..a931009a4bc 100644 --- a/include/lbann/callbacks/callback_check_gradients.hpp +++ b/include/lbann/callbacks/callback_check_gradients.hpp @@ -29,6 +29,8 @@ #include "lbann/callbacks/callback.hpp" +#include + namespace lbann { /** @brief Gradient checking callback. diff --git a/include/lbann/callbacks/callback_perturb_adam.hpp b/include/lbann/callbacks/callback_perturb_adam.hpp index dbb49b1a645..9406c9da918 100644 --- a/include/lbann/callbacks/callback_perturb_adam.hpp +++ b/include/lbann/callbacks/callback_perturb_adam.hpp @@ -29,6 +29,7 @@ #include "lbann/callbacks/callback.hpp" #include "lbann/optimizers/adam.hpp" + #include namespace lbann { diff --git a/include/lbann/callbacks/callback_save_model.hpp b/include/lbann/callbacks/callback_save_model.hpp index 2993ccc09fd..06b457697e1 100644 --- a/include/lbann/callbacks/callback_save_model.hpp +++ b/include/lbann/callbacks/callback_save_model.hpp @@ -32,9 +32,14 @@ #include #include "lbann/callbacks/callback.hpp" -#include + #include +// Forward-declare protobuf classes +namespace lbann_data { +class Model; +} + namespace lbann { /** diff --git a/include/lbann/layers/layer.hpp b/include/lbann/layers/layer.hpp index 6ed9ecb096b..56c015eabfc 100644 --- a/include/lbann/layers/layer.hpp +++ b/include/lbann/layers/layer.hpp @@ -35,10 +35,14 @@ #include "lbann/utils/timer.hpp" #include "lbann/utils/description.hpp" #include "lbann/io/persist.hpp" -#include #include #include +// Forward-declare protobuf classes +namespace lbann_data { +class Layer; +} + namespace lbann { // Forward declarations diff --git a/include/lbann/models/model.hpp b/include/lbann/models/model.hpp index 7e8671a5289..04923589ff9 100644 --- a/include/lbann/models/model.hpp +++ b/include/lbann/models/model.hpp @@ -39,11 +39,16 @@ #include "lbann/weights/weights.hpp" #include "lbann/optimizers/optimizer.hpp" #include "lbann/utils/threads/thread_pool.hpp" -#include + #include #include #include +// Forward-declare protobuf class +namespace lbann_data { +class Model; +} + namespace lbann { // Forward declarations diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index 59b05bd24e2..78a9e4286ae 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -27,11 +27,21 @@ #ifndef LBANN_PROTO_FACTORIES_HPP_INCLUDED #define LBANN_PROTO_FACTORIES_HPP_INCLUDED +#include "lbann/callbacks/callback.hpp" #include "lbann/proto/proto_common.hpp" #include "lbann/data_readers/data_reader.hpp" #include "lbann/transforms/transform.hpp" #include "lbann/transforms/transform_pipeline.hpp" +namespace lbann_data { +class Model; +class ObjectiveFunction; +class Optimizer; +class Reader; +class Transform; +class Weights; +}// 
namespace lbann_data + namespace lbann { namespace proto { diff --git a/include/lbann/proto/init_image_data_readers.hpp b/include/lbann/proto/init_image_data_readers.hpp index 94c782531f3..4b585998599 100644 --- a/include/lbann/proto/init_image_data_readers.hpp +++ b/include/lbann/proto/init_image_data_readers.hpp @@ -26,9 +26,15 @@ #ifndef LBANN_PROTO_INIT_IMAGE_DATA_READERS_HPP_INCLUDED #define LBANN_PROTO_INIT_IMAGE_DATA_READERS_HPP_INCLUDED + #include "lbann/proto/proto_common.hpp" #include "lbann/comm.hpp" +namespace lbann_data { +class Reader; +class DataSetMetaData; +} + namespace lbann { extern void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_data::DataSetMetaData& pb_metadata, const bool master, generic_data_reader* &reader); diff --git a/include/lbann/proto/proto_common.hpp b/include/lbann/proto/proto_common.hpp index 261f18344d4..72fc2397651 100644 --- a/include/lbann/proto/proto_common.hpp +++ b/include/lbann/proto/proto_common.hpp @@ -27,9 +27,12 @@ #ifndef LBANN_PROTO_PROTO_COMMON_HPP_INCLUDED #define LBANN_PROTO_PROTO_COMMON_HPP_INCLUDED -#include "lbann/lbann.hpp" -#include -#include "lbann/proto/factories.hpp" +#include "lbann/data_readers/data_reader.hpp" + +// Forward declaration of protobuf classes +namespace lbann_data { +class LbannPB; +} namespace lbann { @@ -46,27 +49,27 @@ namespace lbann { _t_. @endverbatim */ void customize_data_readers_index_list(const lbann_comm& comm, - lbann_data::LbannPB& p); + ::lbann_data::LbannPB& p); /** @brief instantiates one or more generic_data_readers and inserts * them in &data_readers */ void init_data_readers( lbann_comm *comm, - const lbann_data::LbannPB& p, + const ::lbann_data::LbannPB& p, std::map& data_readers, bool is_shareable_training_data_reader, bool is_shareable_testing_data_reader, bool is_shareable_validation_data_reader = false); /** @brief adjusts the number of parallel data readers */ -void set_num_parallel_readers(const lbann_comm& comm, lbann_data::LbannPB& p); +void set_num_parallel_readers(const lbann_comm& comm, ::lbann_data::LbannPB& p); /** @brief adjusts the values in p by querying the options db */ -void get_cmdline_overrides(const lbann_comm& comm, lbann_data::LbannPB& p); +void get_cmdline_overrides(const lbann_comm& comm, ::lbann_data::LbannPB& p); /** @brief print various params (learn_rate, etc) to cout */ -void print_parameters(const lbann_comm& comm, lbann_data::LbannPB& p); +void print_parameters(const lbann_comm& comm, ::lbann_data::LbannPB& p); /** @brief prints usage information */ void print_help(const lbann_comm& comm); @@ -77,18 +80,18 @@ void print_help(std::ostream& os); /** @brief prints prototext file, cmd line, etc to file */ void save_session(const lbann_comm& comm, const int argc, char * const* argv, - lbann_data::LbannPB& p); + ::lbann_data::LbannPB& p); /** @brief Read prototext from a file into a protobuf message. */ void read_prototext_file( const std::string& fn, - lbann_data::LbannPB& pb, + ::lbann_data::LbannPB& pb, const bool master); /** @brief Write a protobuf message into a prototext file. */ bool write_prototext_file( const std::string& fn, - lbann_data::LbannPB& pb); + ::lbann_data::LbannPB& pb); /** @brief Trim leading and trailing whitespace from a string. 
*/ std::string trim(std::string const& str); diff --git a/include/lbann/utils/protobuf_utils.hpp b/include/lbann/utils/protobuf_utils.hpp index 48340c89db9..83ef42497d3 100644 --- a/include/lbann/utils/protobuf_utils.hpp +++ b/include/lbann/utils/protobuf_utils.hpp @@ -1,9 +1,14 @@ #ifndef LBANN_UTILS_PROTOBUF_UTILS_HPP_INCLUDED #define LBANN_UTILS_PROTOBUF_UTILS_HPP_INCLUDED +#include +#include #include -#include "lbann/lbann.hpp" -#include + +// Forward-declare protobuf class +namespace lbann_data { +class LbannPB; +} namespace lbann { diff --git a/include/lbann/weights/weights.hpp b/include/lbann/weights/weights.hpp index 784869331fa..1fae5f54e35 100644 --- a/include/lbann/weights/weights.hpp +++ b/include/lbann/weights/weights.hpp @@ -27,17 +27,20 @@ #ifndef LBANN_WEIGHTS_HPP #define LBANN_WEIGHTS_HPP -#include -#include -#include - #include "lbann/base.hpp" #include "lbann/comm.hpp" #include "lbann/weights/initializer.hpp" #include "lbann/optimizers/optimizer.hpp" #include "lbann/io/persist.hpp" #include "lbann/utils/description.hpp" -#include + +#include +#include +#include + +namespace lbann_data { +class WeightsData; +} namespace lbann { diff --git a/model_zoo/lbann.cpp b/model_zoo/lbann.cpp index 868b3e1d362..7b0580a73b2 100644 --- a/model_zoo/lbann.cpp +++ b/model_zoo/lbann.cpp @@ -30,8 +30,11 @@ #include "lbann/proto/proto_common.hpp" #include "lbann/utils/protobuf_utils.hpp" #include "lbann/data_store/data_store_conduit.hpp" -#include +#include +#include + +#include using namespace lbann; diff --git a/model_zoo/lbann2.cpp b/model_zoo/lbann2.cpp index b72ddd2a38f..b14c5c6eac1 100644 --- a/model_zoo/lbann2.cpp +++ b/model_zoo/lbann2.cpp @@ -29,7 +29,12 @@ #include "lbann/lbann.hpp" #include "lbann/proto/proto_common.hpp" #include "lbann/utils/protobuf_utils.hpp" + +#include +#include + #include + #include using namespace lbann; diff --git a/model_zoo/lbann_aecycgan.cpp b/model_zoo/lbann_aecycgan.cpp index 699b779117a..6b2a7eb6d2d 100644 --- a/model_zoo/lbann_aecycgan.cpp +++ b/model_zoo/lbann_aecycgan.cpp @@ -29,6 +29,10 @@ #include "lbann/lbann.hpp" #include "lbann/proto/proto_common.hpp" #include "lbann/utils/protobuf_utils.hpp" + +#include +#include + #include using namespace lbann; diff --git a/model_zoo/lbann_cycgan.cpp b/model_zoo/lbann_cycgan.cpp index 3093075aa7e..9b2c061f957 100644 --- a/model_zoo/lbann_cycgan.cpp +++ b/model_zoo/lbann_cycgan.cpp @@ -29,6 +29,10 @@ #include "lbann/lbann.hpp" #include "lbann/proto/proto_common.hpp" #include "lbann/utils/protobuf_utils.hpp" + +#include +#include + #include using namespace lbann; diff --git a/model_zoo/lbann_gan.cpp b/model_zoo/lbann_gan.cpp index 2db1b291bf6..8ee5dcbbfa3 100644 --- a/model_zoo/lbann_gan.cpp +++ b/model_zoo/lbann_gan.cpp @@ -29,6 +29,10 @@ #include "lbann/lbann.hpp" #include "lbann/proto/proto_common.hpp" #include "lbann/utils/protobuf_utils.hpp" + +#include +#include + #include using namespace lbann; diff --git a/model_zoo/lbann_inf.cpp b/model_zoo/lbann_inf.cpp index 62f5f764f73..f8bea143620 100644 --- a/model_zoo/lbann_inf.cpp +++ b/model_zoo/lbann_inf.cpp @@ -29,8 +29,14 @@ #include "lbann/lbann.hpp" #include "lbann/proto/proto_common.hpp" #include "lbann/utils/protobuf_utils.hpp" + +#include +#include + #include + #include + using namespace lbann; int main(int argc, char *argv[]) { @@ -64,10 +70,10 @@ int main(int argc, char *argv[]) { // Load layer weights from checkpoint if checkpoint directory given if(opts->has_string("ckpt_dir")){ for(auto&& m : models) { - bool loaded = 
lbann_callback_save_model::load_model_weights(opts->get_string("ckpt_dir"), + bool loaded = lbann_callback_save_model::load_model_weights(opts->get_string("ckpt_dir"), m.get(), opts->get_bool("ckptdir_is_fullpath")); - if(!loaded) LBANN_ERROR("Unable to reload model"); + if(!loaded) LBANN_ERROR("Unable to reload model"); } }else { LBANN_ERROR("Unable to reload model"); diff --git a/src/callbacks/callback_check_gradients.cpp b/src/callbacks/callback_check_gradients.cpp index cd46d14c6c9..2038a8b0142 100644 --- a/src/callbacks/callback_check_gradients.cpp +++ b/src/callbacks/callback_check_gradients.cpp @@ -25,10 +25,16 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_check_gradients.hpp" -#include "lbann/layers/io/input/generic_input_layer.hpp" #include "lbann/data_readers/data_reader.hpp" +#include "lbann/layers/io/input/generic_input_layer.hpp" +#include "lbann/utils/memory.hpp" + +#include -#include "callbacks.pb.h" +#include +#include +#include +#include namespace lbann { diff --git a/src/callbacks/callback_check_metric.cpp b/src/callbacks/callback_check_metric.cpp index caed2fca818..aef955614ac 100644 --- a/src/callbacks/callback_check_metric.cpp +++ b/src/callbacks/callback_check_metric.cpp @@ -26,7 +26,16 @@ #include "lbann/callbacks/callback_check_metric.hpp" -#include "lbann/proto/factories.hpp" +#include "lbann/proto/proto_common.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" + +#include + +#include +#include +#include +#include namespace lbann { diff --git a/src/callbacks/callback_checkpoint.cpp b/src/callbacks/callback_checkpoint.cpp index 7bf82b50642..4d3fc51eb11 100644 --- a/src/callbacks/callback_checkpoint.cpp +++ b/src/callbacks/callback_checkpoint.cpp @@ -29,6 +29,13 @@ #include "lbann/callbacks/callback_checkpoint.hpp" +#include "lbann/models/model.hpp" + +#include + +#include +#include + namespace lbann { // Load from checkpoint occurs during setup callbacks void lbann_callback_checkpoint::setup(model *m) { diff --git a/src/callbacks/callback_confusion_matrix.cpp b/src/callbacks/callback_confusion_matrix.cpp index 6bdf3ff9787..97c0f3113d1 100644 --- a/src/callbacks/callback_confusion_matrix.cpp +++ b/src/callbacks/callback_confusion_matrix.cpp @@ -26,6 +26,14 @@ #include "lbann/callbacks/callback_confusion_matrix.hpp" +#include + +#include +#include +#include +#include +#include + namespace lbann { // --------------------------------------------------------- diff --git a/src/callbacks/callback_debug_io.cpp b/src/callbacks/callback_debug_io.cpp index b8e38f06488..567317c6b89 100644 --- a/src/callbacks/callback_debug_io.cpp +++ b/src/callbacks/callback_debug_io.cpp @@ -28,6 +28,13 @@ #include "lbann/callbacks/callback_debug_io.hpp" +#include "lbann/base.hpp" +#include "lbann/utils/memory.hpp" + +#include +#include +#include + namespace lbann { /// BVE FIXME @todo The use of execution_mode invalid needs to be reconsidered diff --git a/src/callbacks/callback_dump_weights.cpp b/src/callbacks/callback_dump_weights.cpp index 42a120b4b55..b9ba637ed61 100644 --- a/src/callbacks/callback_dump_weights.cpp +++ b/src/callbacks/callback_dump_weights.cpp @@ -26,8 +26,12 @@ // lbann_callback_dump_weights .hpp .cpp - Callbacks to dump weight matrices //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/callbacks/callback_dump_weights.hpp" +#include "lbann/utils/memory.hpp" + +#include + +#include namespace lbann { diff --git 
a/src/callbacks/callback_early_stopping.cpp b/src/callbacks/callback_early_stopping.cpp
index 4ac30a5ea45..bca9766bc04 100644
--- a/src/callbacks/callback_early_stopping.cpp
+++ b/src/callbacks/callback_early_stopping.cpp
@@ -28,6 +28,10 @@
 #include "lbann/callbacks/callback_early_stopping.hpp"
+#include
+
+#include
+
 namespace lbann {
 lbann_callback_early_stopping::lbann_callback_early_stopping(int64_t patience) :
diff --git a/src/callbacks/callback_imcomm.cpp b/src/callbacks/callback_imcomm.cpp
index d6a6e269563..966a4949e3f 100644
--- a/src/callbacks/callback_imcomm.cpp
+++ b/src/callbacks/callback_imcomm.cpp
@@ -26,11 +26,15 @@
 // lbann_callback_imcomm .hpp .cpp - Send gradient updates between models
 ////////////////////////////////////////////////////////////////////////////////
-#include
-#include
 #include "lbann/callbacks/callback_imcomm.hpp"
-#include "lbann/utils/timer.hpp"
+
 #include "lbann/utils/exception.hpp"
+#include "lbann/utils/timer.hpp"
+
+#include
+
+#include
+#include
 namespace lbann {
@@ -71,10 +75,9 @@ void lbann_callback_imcomm::setup(model *m) {
     optimizer *opt = w->get_optimizer();
     if (opt == nullptr) {
       std::stringstream err;
-      err << __FILE__ << " " << __LINE__ << " :: "
-          << "imcomm: trying to do inter-model gradient communication on "
+      err << "imcomm: trying to do inter-model gradient communication on "
          << w->get_name() << ", which has no optimizer";
-      throw(err.str());
+      LBANN_ERROR(err.str());
     }
   }
@@ -87,10 +90,9 @@ void lbann_callback_imcomm::on_train_begin(model *m) {
     return; // No point with only one model.
   }
   for (weights *w : m->get_weights()) {
-    AbsDistMat *values = w->get_values().Copy();
+    auto values = std::unique_ptr<AbsDistMat>{w->get_values().Copy()};
     comm->intertrainer_broadcast_matrix(*values, 0);
     w->set_values(*values);
-    delete values;
   }
 }
@@ -107,19 +109,17 @@ void lbann_callback_imcomm::on_backward_prop_end(model *m) {
       continue;
     }
     optimizer *opt = w->get_optimizer();
-    auto gradient = opt->get_gradient().Copy();
+    auto gradient = std::unique_ptr<AbsDistMat>{opt->get_gradient().Copy()};
     Mat* local_gradients = &(static_cast<Mat&>(gradient->Matrix()));
     switch (params.ct) {
     case NORMAL:
       comm->intertrainer_sum_matrix(*local_gradients);
       break;
     default:
-      throw(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: "
-            + "imcomm: unknown comm type");
+      LBANN_ERROR("imcomm: unknown comm type");
     }
     opt->clear_gradient();
     opt->add_to_gradient(*gradient);
-    delete gradient;
     EvalType im_time = get_time() - start_time;
     do_summary(m, w, im_time);
   }
@@ -146,14 +146,12 @@ void lbann_callback_imcomm::do_summary(model *m, weights *w,
                      bytes_received, m->get_step(execution_mode::training));
 }
-static std::vector<std::string> comm_type_names =
-  { "none", "normal" };
+static std::vector<std::string> comm_type_names = { "none", "normal" };
 /** returns a string representation of the weight_initialization */
 std::string get_comm_type_name(lbann_callback_imcomm::comm_type m) {
   if ((int)m < 0 or (int)m >= (int)comm_type_names.size()) {
-    throw(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: "
-          + " Invalid comm_type");
+    LBANN_ERROR(" Invalid comm_type");
   }
   return comm_type_names[(int)m];
 }
diff --git a/src/callbacks/callback_io.cpp b/src/callbacks/callback_io.cpp
index 11d459e61bd..d4c24efa315 100644
--- a/src/callbacks/callback_io.cpp
+++ b/src/callbacks/callback_io.cpp
@@ -32,6 +32,8 @@
 #include "lbann/layers/io/input/generic_input_layer.hpp"
 #include "lbann/proto/proto_common.hpp"
+#include
+
 namespace lbann {
 void lbann_callback_io::on_epoch_end(model *m) {
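The imcomm hunks above swap manually deleted copies of Copy()'d matrices for std::unique_ptr. A minimal self-contained sketch of the idiom, with placeholder types standing in for LBANN's AbsDistMat and communicator calls (assumptions, not the real interfaces):

    #include <memory>

    struct AbsDistMat {
      virtual ~AbsDistMat() = default;
      virtual AbsDistMat* Copy() const = 0; // caller owns the result
    };
    struct ConcreteMat : AbsDistMat {
      AbsDistMat* Copy() const override { return new ConcreteMat(*this); }
    };

    void broadcast(AbsDistMat&) { /* e.g. intertrainer_broadcast_matrix */ }

    int main() {
      ConcreteMat values;
      // Wrap the owning raw pointer immediately: no matching delete, and
      // the copy is freed even if broadcast() throws.
      auto copy = std::unique_ptr<AbsDistMat>{values.Copy()};
      broadcast(*copy);
    } // copy is released here

diff --git 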
a/src/callbacks/callback_learning_rate.cpp b/src/callbacks/callback_learning_rate.cpp index bb68513b961..ab07aae19ff 100644 --- a/src/callbacks/callback_learning_rate.cpp +++ b/src/callbacks/callback_learning_rate.cpp @@ -31,6 +31,8 @@ #include "callback_helpers.hpp" +#include + #include #include // std::pow #include diff --git a/src/callbacks/callback_ltfb.cpp b/src/callbacks/callback_ltfb.cpp index 8f0d947d357..bc6303dc4b8 100644 --- a/src/callbacks/callback_ltfb.cpp +++ b/src/callbacks/callback_ltfb.cpp @@ -24,7 +24,6 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/callbacks/callback_ltfb.hpp" #include "lbann/callbacks/callback_imcomm.hpp" #include "lbann/utils/random.hpp" @@ -32,6 +31,12 @@ #include "lbann/optimizers/adam.hpp" #include "lbann/proto/factories.hpp" +#include + +#include +#include +#include + namespace lbann { namespace { diff --git a/src/callbacks/callback_perturb_adam.cpp b/src/callbacks/callback_perturb_adam.cpp index 3ccd61a656c..c843a847c5e 100644 --- a/src/callbacks/callback_perturb_adam.cpp +++ b/src/callbacks/callback_perturb_adam.cpp @@ -25,9 +25,19 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_perturb_adam.hpp" -#include "lbann/proto/factories.hpp" +#include "lbann/proto/proto_common.hpp" #include "lbann/utils/random.hpp" +#include + +#include +#include +#include +#include +#include +#include +#include + namespace lbann { lbann_callback_perturb_adam::lbann_callback_perturb_adam(DataType learning_rate_factor, diff --git a/src/callbacks/callback_perturb_dropout.cpp b/src/callbacks/callback_perturb_dropout.cpp index de306b0bdf4..bad3c463f32 100644 --- a/src/callbacks/callback_perturb_dropout.cpp +++ b/src/callbacks/callback_perturb_dropout.cpp @@ -25,9 +25,11 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_perturb_dropout.hpp" -#include "lbann/proto/factories.hpp" +#include "lbann/proto/proto_common.hpp" #include "lbann/utils/random.hpp" +#include + namespace lbann { lbann_callback_perturb_dropout::lbann_callback_perturb_dropout(EvalType keep_prob_factor, diff --git a/src/callbacks/callback_print.cpp b/src/callbacks/callback_print.cpp index f936223ddfd..99dbdbb3e40 100644 --- a/src/callbacks/callback_print.cpp +++ b/src/callbacks/callback_print.cpp @@ -26,10 +26,18 @@ // lbann_callback_print .hpp .cpp - Callback hooks to print information //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/callbacks/callback_print.hpp" -#include "lbann/layers/io/input/input_layer.hpp" + +#include "lbann/layers/io/input/generic_input_layer.hpp" +#include "lbann/utils/memory.hpp" + +#include + +#include #include +#include +#include +#include namespace lbann { diff --git a/src/callbacks/callback_replace_weights.cpp b/src/callbacks/callback_replace_weights.cpp index faf8ef34bc4..a63da1be864 100644 --- a/src/callbacks/callback_replace_weights.cpp +++ b/src/callbacks/callback_replace_weights.cpp @@ -29,6 +29,11 @@ #include "callback_helpers.hpp" +#include + +#include +#include + namespace lbann { void lbann_callback_replace_weights::setup(model *m) { diff --git a/src/callbacks/callback_save_model.cpp b/src/callbacks/callback_save_model.cpp index 5162fe8ca65..f339cf1038d 100644 --- a/src/callbacks/callback_save_model.cpp +++ b/src/callbacks/callback_save_model.cpp @@ -26,16 +26,22 @@ 
// lbann_callback_save_model .hpp .cpp - Callbacks to save a models description and weights //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/callbacks/callback_save_model.hpp" #include "lbann/callbacks/callback_checkpoint.hpp" // Reuse the checkpoint naming scheme + +#include +#include + #include #include #include -#include + #include #include + #include +#include +#include namespace lbann { diff --git a/src/callbacks/callback_save_topk_models.cpp b/src/callbacks/callback_save_topk_models.cpp index f986590ae9a..6cd99514bc7 100644 --- a/src/callbacks/callback_save_topk_models.cpp +++ b/src/callbacks/callback_save_topk_models.cpp @@ -26,9 +26,16 @@ // lbann_callback_save_topk_models .hpp .cpp - Callback hooks to save_topk_models information //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/callbacks/callback_save_topk_models.hpp" +#include + +#include +#include +#include +#include +#include + namespace lbann { void lbann_callback_save_topk_models::on_test_end(model *m) { bool in_topk = false; diff --git a/src/callbacks/callback_summary.cpp b/src/callbacks/callback_summary.cpp index 959214f2101..968f79090b4 100644 --- a/src/callbacks/callback_summary.cpp +++ b/src/callbacks/callback_summary.cpp @@ -27,8 +27,15 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_summary.hpp" + +#include "lbann/utils/memory.hpp" #include "lbann/utils/profiling.hpp" +#include + +#include +#include + namespace lbann { lbann_callback_summary::lbann_callback_summary(lbann_summary *summarizer, diff --git a/src/callbacks/callback_sync_layers.cpp b/src/callbacks/callback_sync_layers.cpp index 61075337b57..b378653fe49 100644 --- a/src/callbacks/callback_sync_layers.cpp +++ b/src/callbacks/callback_sync_layers.cpp @@ -27,9 +27,13 @@ /////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_sync_layers.hpp" + #include "lbann/layers/io/input/generic_input_layer.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/timer.hpp" +#include + namespace lbann { void lbann_callback_sync_layers::on_forward_prop_end(model *m, Layer *l) { diff --git a/src/callbacks/callback_sync_selected.cpp b/src/callbacks/callback_sync_selected.cpp index cec3f6b4a11..9d0b923d016 100644 --- a/src/callbacks/callback_sync_selected.cpp +++ b/src/callbacks/callback_sync_selected.cpp @@ -27,13 +27,22 @@ /////////////////////////////////////////////////////////////////////////////// #include "lbann/callbacks/callback_sync_selected.hpp" + +#include "lbann/utils/memory.hpp" #include "lbann/utils/timer.hpp" + +#include + #ifdef LBANN_NVPROF -#include #include "lbann/utils/file_utils.hpp" +#include #include #endif // LBANN_NVPROF +#include +#include +#include + namespace lbann { bool lbann_callback_sync_selected::m_cuda_profiler_initialized = false; diff --git a/src/callbacks/callback_timeline.cpp b/src/callbacks/callback_timeline.cpp index 4d6f9077753..62f9df96030 100644 --- a/src/callbacks/callback_timeline.cpp +++ b/src/callbacks/callback_timeline.cpp @@ -26,10 +26,18 @@ // callback_timeline .hpp .cpp - Callback hooks to record a timeline of runtime //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/callbacks/callback_timeline.hpp" + +#include "lbann/utils/memory.hpp" #include "lbann/utils/timer.hpp" +#include + +#include +#include +#include +#include + 
namespace lbann { void lbann_callback_timeline::on_train_begin(model *m) { diff --git a/src/callbacks/callback_variable_minibatch.cpp b/src/callbacks/callback_variable_minibatch.cpp index 10a0d19d836..6f0b42e621f 100644 --- a/src/callbacks/callback_variable_minibatch.cpp +++ b/src/callbacks/callback_variable_minibatch.cpp @@ -26,10 +26,14 @@ // lbann_variable_minibatch .hpp .cpp - Callback for variable-size mini-batches //////////////////////////////////////////////////////////////////////////////// -#include - #include "lbann/callbacks/callback_variable_minibatch.hpp" #include "lbann/layers/io/input/input_layer.hpp" +#include "lbann/utils/exception.hpp" + +#include + +#include +#include namespace lbann { diff --git a/src/callbacks/profiler.cpp b/src/callbacks/profiler.cpp index fdf8771f4f2..5b5a6b70fba 100644 --- a/src/callbacks/profiler.cpp +++ b/src/callbacks/profiler.cpp @@ -26,9 +26,11 @@ // lbann_callback_timer .hpp .cpp - Callback hooks to time training /////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/callbacks/profiler.hpp" #include "lbann/utils/profiling.hpp" + +#include + #ifdef LBANN_NVPROF #include "nvToolsExt.h" #include "nvToolsExtCuda.h" @@ -36,6 +38,9 @@ #include "cuda_runtime.h" #endif +#include +#include + namespace lbann { lbann_callback_profiler::lbann_callback_profiler(bool sync, bool skip_init) : diff --git a/src/layers/layer.cpp b/src/layers/layer.cpp index 2fabfe77505..d404e59d170 100644 --- a/src/layers/layer.cpp +++ b/src/layers/layer.cpp @@ -29,11 +29,15 @@ #include "lbann/models/model.hpp" #include "lbann/io/file_io.hpp" #include "lbann/io/persist.hpp" -#include + +#include + #include #include #include +#include + // Asynchronous memory transfers for input data // Note: This introduces a race condition. 
It is possible for the // input data to be modified by another layer before it is used by diff --git a/src/models/model.cpp b/src/models/model.cpp index 7a377b6da24..caa3de1cfef 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -38,14 +38,16 @@ #include "lbann/utils/omp_diagnostics.hpp" #include "lbann/utils/description.hpp" #include "lbann/data_store/data_store_conduit.hpp" + +#include + +#include + #include #include #include #include #include -#include - -#include "mpi.h" namespace lbann { diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 8f7d4e287d2..f1f9c63ab81 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -62,15 +62,17 @@ #include "lbann/callbacks/callback_timeline.hpp" #include "lbann/callbacks/callback_timer.hpp" #include "lbann/callbacks/callback_variable_minibatch.hpp" +#include "lbann/callbacks/profiler.hpp" #include "lbann/proto/factories.hpp" #include "lbann/proto/helpers.hpp" #include "lbann/utils/factory.hpp" #include "lbann/utils/memory.hpp" +#include +#include #include -#include #include #include diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 50c8ffd82ec..02c7fe47164 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -25,8 +25,78 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/proto/factories.hpp" + +#include "lbann/layers/layer.hpp" +#include "lbann/layers/activations/activations.hpp" +#include "lbann/layers/activations/elu.hpp" +#include "lbann/layers/activations/identity.hpp" +#include "lbann/layers/activations/leaky_relu.hpp" +#include "lbann/layers/activations/log_softmax.hpp" +#include "lbann/layers/activations/softmax.hpp" +#include "lbann/layers/image/bilinear_resize.hpp" +#include "lbann/layers/io/input/generic_input_layer.hpp" +#include "lbann/layers/io/input/input_layer.hpp" +#include "lbann/layers/io/io_layer.hpp" +#include "lbann/layers/learning/base_convolution.hpp" +#include "lbann/layers/learning/channelwise_scale_bias.hpp" +#include "lbann/layers/learning/convolution.hpp" +#include "lbann/layers/learning/deconvolution.hpp" +#include "lbann/layers/learning/embedding.hpp" +#include "lbann/layers/learning/entrywise_scale_bias.hpp" +#include "lbann/layers/learning/fully_connected.hpp" +#include "lbann/layers/learning/learning.hpp" +#include "lbann/layers/loss/categorical_accuracy.hpp" +#include "lbann/layers/loss/cross_entropy.hpp" +#include "lbann/layers/loss/entrywise.hpp" +#include "lbann/layers/loss/l1_norm.hpp" +#include "lbann/layers/loss/l2_norm2.hpp" +#include "lbann/layers/loss/mean_absolute_error.hpp" +#include "lbann/layers/loss/mean_squared_error.hpp" +#include "lbann/layers/loss/top_k_categorical_accuracy.hpp" +#include "lbann/layers/math/binary.hpp" +#include "lbann/layers/math/clamp.hpp" +#include "lbann/layers/math/unary.hpp" +#include "lbann/layers/misc/channelwise_mean.hpp" +#include "lbann/layers/misc/covariance.hpp" +#include "lbann/layers/misc/mini_batch_index.hpp" +#include "lbann/layers/misc/mini_batch_size.hpp" +#include "lbann/layers/misc/variance.hpp" +#include "lbann/layers/regularizers/batch_normalization.hpp" +#include "lbann/layers/regularizers/dropout.hpp" +#include "lbann/layers/regularizers/local_response_normalization.hpp" +#include "lbann/layers/regularizers/regularizer.hpp" +#include "lbann/layers/regularizers/selu_dropout.hpp" +#include 
"lbann/layers/transform/bernoulli.hpp" +#include "lbann/layers/transform/categorical_random.hpp" +#include "lbann/layers/transform/concatenation.hpp" +#include "lbann/layers/transform/constant.hpp" +#include "lbann/layers/transform/crop.hpp" +#include "lbann/layers/transform/discrete_random.hpp" +#include "lbann/layers/transform/dummy.hpp" +#include "lbann/layers/transform/evaluation.hpp" +#include "lbann/layers/transform/gaussian.hpp" +#include "lbann/layers/transform/hadamard.hpp" +#include "lbann/layers/transform/in_top_k.hpp" +#include "lbann/layers/transform/pooling.hpp" +#include "lbann/layers/transform/reduction.hpp" +#include "lbann/layers/transform/reshape.hpp" +#include "lbann/layers/transform/slice.hpp" +#include "lbann/layers/transform/sort.hpp" +#include "lbann/layers/transform/split.hpp" +#include "lbann/layers/transform/stop_gradient.hpp" +#include "lbann/layers/transform/sum.hpp" +#include "lbann/layers/transform/tessellate.hpp" +#include "lbann/layers/transform/transform.hpp" +#include "lbann/layers/transform/uniform.hpp" +#include "lbann/layers/transform/unpooling.hpp" +#include "lbann/layers/transform/weighted_sum.hpp" +#include "lbann/layers/transform/weights.hpp" + +#include "lbann/data_readers/data_reader_jag_conduit.hpp" #include "lbann/utils/peek_map.hpp" +#include + namespace lbann { namespace proto { diff --git a/src/proto/factories/layer_graph_factory.cpp b/src/proto/factories/layer_graph_factory.cpp index 5e8c12dd98e..4c75e068bb2 100644 --- a/src/proto/factories/layer_graph_factory.cpp +++ b/src/proto/factories/layer_graph_factory.cpp @@ -26,6 +26,16 @@ #include "lbann/proto/factories.hpp" +#include "lbann/layers/learning/fully_connected.hpp" +#include "lbann/layers/transform/pooling.hpp" +#include "lbann/layers/transform/unpooling.hpp" + +#include + +#include +#include +#include + namespace lbann { namespace proto { diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp index ca3d55da361..6ca8c81903e 100644 --- a/src/proto/factories/model_factory.cpp +++ b/src/proto/factories/model_factory.cpp @@ -25,7 +25,16 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/proto/factories.hpp" + +#include "lbann/models/model.hpp" +#include "lbann/models/directed_acyclic_graph.hpp" + +#include "lbann/metrics/layer_metric.hpp" #include "lbann/objective_functions/layer_term.hpp" +#include "lbann/objective_functions/weight_regularization/l2.hpp" + +#include +#include namespace lbann { namespace proto { diff --git a/src/proto/factories/objective_function_factory.cpp b/src/proto/factories/objective_function_factory.cpp index 9ca69a151bf..334c1fa2d0a 100644 --- a/src/proto/factories/objective_function_factory.cpp +++ b/src/proto/factories/objective_function_factory.cpp @@ -26,6 +26,11 @@ #include "lbann/proto/factories.hpp" +#include "lbann/objective_functions/layer_term.hpp" +#include "lbann/objective_functions/weight_regularization/l2.hpp" + +#include + namespace lbann { namespace proto { diff --git a/src/proto/factories/optimizer_factory.cpp b/src/proto/factories/optimizer_factory.cpp index a4215b4e644..210ec1e50b8 100644 --- a/src/proto/factories/optimizer_factory.cpp +++ b/src/proto/factories/optimizer_factory.cpp @@ -26,6 +26,15 @@ #include "lbann/proto/factories.hpp" +#include "lbann/optimizers/adagrad.hpp" +#include "lbann/optimizers/adam.hpp" +#include "lbann/optimizers/hypergradient_adam.hpp" +#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/rmsprop.hpp" +#include 
"lbann/optimizers/sgd.hpp" + +#include + namespace lbann { namespace proto { diff --git a/src/proto/factories/transform_factory.cpp b/src/proto/factories/transform_factory.cpp index 47a721eb9aa..d7ac0811d5d 100644 --- a/src/proto/factories/transform_factory.cpp +++ b/src/proto/factories/transform_factory.cpp @@ -24,7 +24,6 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include "lbann/proto/factories.hpp" #include "lbann/transforms/normalize.hpp" #include "lbann/transforms/sample_normalize.hpp" #include "lbann/transforms/scale.hpp" @@ -46,8 +45,14 @@ #include "lbann/transforms/vision/resized_center_crop.hpp" #include "lbann/transforms/vision/to_lbann_layout.hpp" #include "lbann/transforms/vision/vertical_flip.hpp" + +#include "lbann/proto/factories.hpp" +#include "lbann/proto/proto_common.hpp" #include "lbann/utils/memory.hpp" +#include +#include + namespace lbann { namespace proto { diff --git a/src/proto/factories/weights_factory.cpp b/src/proto/factories/weights_factory.cpp index 17ded03f1ae..860f2d8bc62 100644 --- a/src/proto/factories/weights_factory.cpp +++ b/src/proto/factories/weights_factory.cpp @@ -26,6 +26,12 @@ #include "lbann/proto/factories.hpp" +#include "lbann/weights/initializer.hpp" +#include "lbann/weights/variance_scaling_initializers.hpp" + +#include +#include + namespace lbann { namespace proto { diff --git a/src/proto/init_image_data_readers.cpp b/src/proto/init_image_data_readers.cpp index 6f68213c8dd..014314994b0 100644 --- a/src/proto/init_image_data_readers.cpp +++ b/src/proto/init_image_data_readers.cpp @@ -28,10 +28,24 @@ #include "lbann/proto/init_image_data_readers.hpp" #include "lbann/proto/factories.hpp" + +#include "lbann/data_readers/data_reader_cifar10.hpp" +#include "lbann/data_readers/data_reader_jag_conduit.hpp" +#include "lbann/data_readers/data_reader_imagenet.hpp" +#include "lbann/data_readers/data_reader_mnist.hpp" +#include "lbann/data_readers/data_reader_moving_mnist.hpp" +#include "lbann/data_readers/data_reader_multihead_siamese.hpp" + +#include + #include #include #include -#include // for dynamic_pointer_cast + +#include +#include +#include +#include namespace lbann { diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 43b050b3a7e..3742d201edc 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -33,6 +33,9 @@ #include "lbann/proto/factories.hpp" #include "lbann/utils/file_utils.hpp" +#include +#include + #include #include #include diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp index e347f6f2df9..77e08d9c627 100644 --- a/src/utils/lbann_library.cpp +++ b/src/utils/lbann_library.cpp @@ -25,7 +25,13 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/utils/lbann_library.hpp" -#include "lbann/callbacks/callback_checkpoint.hpp" + +#include "lbann/proto/factories.hpp" +#include "lbann/utils/omp_diagnostics.hpp" +#include "lbann/utils/threads/thread_utils.hpp" + +#include +#include namespace lbann { diff --git a/src/utils/protobuf_utils.cpp b/src/utils/protobuf_utils.cpp index f93b62609ed..44cb1777b92 100644 --- a/src/utils/protobuf_utils.cpp +++ b/src/utils/protobuf_utils.cpp @@ -29,6 +29,8 @@ #include "lbann/utils/protobuf_utils.hpp" #include "lbann/proto/proto_common.hpp" +#include // Actually use LbannPB here + /** * all methods in protobuf_utils are static */ diff --git a/src/weights/weights.cpp b/src/weights/weights.cpp index 
ee5fa776e0c..43c52c60a2f 100644
--- a/src/weights/weights.cpp
+++ b/src/weights/weights.cpp
@@ -24,13 +24,19 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
-#include
-
 #include "lbann/weights/weights.hpp"
 #include "lbann/optimizers/optimizer.hpp"
 #include "lbann/utils/exception.hpp"
 #include "lbann/io/file_io.hpp"
+#include
+
+#include
+#include
+#include
+#include
+#include
+
 namespace lbann {
 namespace {
diff --git a/tests/test_shuffled_indices.cpp b/tests/test_shuffled_indices.cpp
index d260a341f01..77d83ef29a9 100644
--- a/tests/test_shuffled_indices.cpp
+++ b/tests/test_shuffled_indices.cpp
@@ -29,6 +29,11 @@
 #include "lbann/lbann.hpp"
 #include "lbann/proto/proto_common.hpp"
+#include
+#include
+
+#include
+
 using namespace lbann;
 int mini_batch_size = 128;
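Patch 177's recurring pattern is worth calling out: public headers forward-declare the generated protobuf message types, and only translation units include the generated headers, which keeps protobuf out of most compile units. A minimal sketch of the pattern (the consumer class and file layout here are illustrative, not LBANN's actual code):

    // "header" portion: no protobuf include is needed to declare the interface
    namespace lbann_data { class Model; } // forward declaration

    class my_widget {                     // hypothetical consumer class
    public:
      void configure(const lbann_data::Model& msg); // incomplete type is fine
    };

    // "source" portion: only here is the full definition required
    // (in LBANN this is where <lbann.pb.h>, or a more specific generated
    // header, gets included)
    namespace lbann_data { class Model { /* generated members */ }; }

    void my_widget::configure(const lbann_data::Model& /*msg*/) { /* read fields */ }

    int main() { return 0; }

From e2783e8f916354198eea15426ff75e0f6daf4b51 Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson"
Date: Wed, 31 Jul 2019 16:31:48 -0700
Subject: [PATCH 178/634] fix a bug related to how unique_ptrs are consumed
---
 .../layers/learning/base_convolution.hpp | 20 +++++++++----------
 .../learning/channelwise_scale_bias.hpp | 6 +++---
 .../layers/learning/entrywise_scale_bias.hpp | 6 +++---
 .../lbann/layers/learning/fully_connected.hpp | 20 +++++++++----------
 .../regularizers/batch_normalization.hpp | 20 +++++++++----------
 include/lbann/layers/transform/weights.hpp | 6 +++---
 include/lbann/weights/weights.hpp | 4 ++--
 src/weights/weights.cpp | 4 ++--
 8 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp
index ecf417c24bb..a431c98cd37 100644
--- a/include/lbann/layers/learning/base_convolution.hpp
+++ b/include/lbann/layers/learning/base_convolution.hpp
@@ -350,14 +350,14 @@ class base_convolution_layer : public Layer {
     this->m_weights.resize(1, nullptr);
   }
   if (this->m_weights[0] == nullptr) {
-    auto* w = new weights(get_comm());
-    std::unique_ptr<weights_initializer> init(new he_initializer(probability_distribution::gaussian));
+    auto w = make_unique<weights>(get_comm());
+    auto init = make_unique<he_initializer>(probability_distribution::gaussian);
     std::unique_ptr<optimizer> opt(m_model->create_optimizer());
     w->set_name(get_name() + "_kernel");
-    w->set_initializer(init);
-    w->set_optimizer(opt);
-    this->m_weights[0] = w;
-    this->m_model->add_weights(w);
+    w->set_initializer(std::move(init));
+    w->set_optimizer(std::move(opt));
+    this->m_weights[0] = w.get();
+    this->m_model->add_weights(w.release());
   }
   auto& kernel_weights = *this->m_weights[0];
@@ -379,12 +379,12 @@ class base_convolution_layer : public Layer {
   // Set up bias if needed.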
  if (m_bias_scaling_factor != DataType(0)) {
    if (this->m_weights[1] == nullptr) {
-      auto* w = new weights(get_comm());
+      auto w = make_unique<weights>(get_comm());
       std::unique_ptr<optimizer> opt(m_model->create_optimizer());
       w->set_name(get_name() + "_bias");
-      w->set_optimizer(opt);
-      this->m_weights[1] = w;
-      this->m_model->add_weights(w);
+      w->set_optimizer(std::move(opt));
+      this->m_weights[1] = w.get();
+      this->m_model->add_weights(w.release());
    }
    auto& bias_weights = *this->m_weights[1];
    bias_weights.set_dims(output_dims[0]);
diff --git a/include/lbann/layers/learning/channelwise_scale_bias.hpp b/include/lbann/layers/learning/channelwise_scale_bias.hpp
index 570aefa54f4..d1c776d68bc 100644
--- a/include/lbann/layers/learning/channelwise_scale_bias.hpp
+++ b/include/lbann/layers/learning/channelwise_scale_bias.hpp
@@ -97,11 +97,11 @@ class channelwise_scale_bias_layer : public Layer {
     this->m_weights.push_back(new weights(get_comm()));
     std::vector<DataType> vals(2*num_channels, DataType{0});
     std::fill(vals.begin(), vals.begin()+num_channels, DataType{1});
-    std::unique_ptr<weights_initializer> init(new value_initializer(vals));
+    auto init = make_unique<value_initializer>(vals);
     std::unique_ptr<optimizer> opt(m_model->create_optimizer());
     this->m_weights[0]->set_name(get_name() + "_weights");
-    this->m_weights[0]->set_initializer(init);
-    this->m_weights[0]->set_optimizer(opt);
+    this->m_weights[0]->set_initializer(std::move(init));
+    this->m_weights[0]->set_optimizer(std::move(opt));
     this->m_model->add_weights(this->m_weights[0]);
   }
   if (this->m_weights.size() != 1) {
diff --git a/include/lbann/layers/learning/entrywise_scale_bias.hpp b/include/lbann/layers/learning/entrywise_scale_bias.hpp
index 528ef75bab8..545198955f1 100644
--- a/include/lbann/layers/learning/entrywise_scale_bias.hpp
+++ b/include/lbann/layers/learning/entrywise_scale_bias.hpp
@@ -93,11 +93,11 @@ class entrywise_scale_bias_layer : public Layer {
     this->m_weights.push_back(new weights(get_comm()));
     std::vector<DataType> vals(2*size, DataType{0});
     std::fill(vals.begin(), vals.begin()+size, DataType{1});
-    std::unique_ptr<weights_initializer> init(new value_initializer(vals));
+    auto init = make_unique<value_initializer>(vals);
     std::unique_ptr<optimizer> opt(m_model->create_optimizer());
     this->m_weights[0]->set_name(get_name() + "_weights");
-    this->m_weights[0]->set_initializer(init);
-    this->m_weights[0]->set_optimizer(opt);
+    this->m_weights[0]->set_initializer(std::move(init));
+    this->m_weights[0]->set_optimizer(std::move(opt));
     this->m_model->add_weights(this->m_weights[0]);
   }
   if (this->m_weights.size() != 1) {
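Each of these hunks repeats one ownership idiom: build the weights object behind a unique_ptr, keep a non-owning raw pointer for the layer, and release ownership to the model as the last step. A simplified sketch, assuming (as the code above does) that add_weights() still takes an owning raw pointer:

    #include <memory>
    #include <vector>

    struct weights { /* ... */ };

    // Assumed shape of the consuming API: the model takes an owning raw pointer.
    struct model {
      void add_weights(weights* w) { m_weights.emplace_back(w); }
      std::vector<std::unique_ptr<weights>> m_weights;
    };

    void setup_weights(model& m, weights*& slot) {
      auto w = std::make_unique<weights>();
      slot = w.get();             // the layer keeps a non-owning view
      m.add_weights(w.release()); // ownership handed off exactly once;
                                  // if an earlier step had thrown, w would clean up
    }

    int main() {
      model m;
      weights* view = nullptr;
      setup_weights(m, view);
    }

diff --git a/include/lbann/layers/learning/fully_connected.hpp b/include/lbann/layers/learning/fully_connected.hpp
index b89f4c04733..3acc8062322 100644
--- a/include/lbann/layers/learning/fully_connected.hpp
+++ b/include/lbann/layers/learning/fully_connected.hpp
@@ -127,14 +127,14 @@ class fully_connected_layer : public learning_layer {
     this->m_weights.resize(1, nullptr);
   }
   if (this->m_weights[0] == nullptr) {
-    auto* w = new weights(get_comm());
-    std::unique_ptr<weights_initializer> init(new he_initializer(probability_distribution::gaussian));
+    auto w = make_unique<weights>(get_comm());
+    auto init = make_unique<he_initializer>(probability_distribution::gaussian);
     std::unique_ptr<optimizer> opt(m_model->create_optimizer());
     w->set_name(get_name() + "_linearity_weights");
-    w->set_initializer(init);
-    w->set_optimizer(opt);
-    this->m_weights[0] = w;
-    this->m_model->add_weights(w);
+    w->set_initializer(std::move(init));
+    w->set_optimizer(std::move(opt));
+    this->m_weights[0] = w.get();
+    this->m_model->add_weights(w.release());
   }
   auto& linearity_weights = *this->m_weights[0];
@@ -163,12 +163,12 @@ class fully_connected_layer : public learning_layer {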
   // Set up bias if needed.
   if (m_bias_scaling_factor != DataType(0)) {
     if (this->m_weights[1] == nullptr) {
-      auto* w = new weights(get_comm());
+      auto w = make_unique<weights>(get_comm());
       std::unique_ptr<optimizer> opt(m_model->create_optimizer());
       w->set_name(get_name() + "_bias_weights");
-      w->set_optimizer(opt);
-      this->m_weights[1] = w;
-      this->m_model->add_weights(w);
+      w->set_optimizer(std::move(opt));
+      this->m_weights[1] = w.get();
+      this->m_model->add_weights(w.release());
     }
     auto& bias_weights = *this->m_weights[1];
     // Setup bias weights
diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp
index 67ef577c146..79318781a01 100644
--- a/include/lbann/layers/regularizers/batch_normalization.hpp
+++ b/include/lbann/layers/regularizers/batch_normalization.hpp
@@ -251,34 +251,34 @@ class batch_normalization_layer : public regularizer_layer {
     this->m_weights.resize(4, nullptr);
     if (this->m_weights[0] == nullptr) {
       this->m_weights[0] = new weights(get_comm());
-      std::unique_ptr<weights_initializer> init(new constant_initializer(DataType(1)));
+      auto init = make_unique<constant_initializer>(DataType(1));
       std::unique_ptr<optimizer> opt(m_model->create_optimizer());
       this->m_weights[0]->set_name(get_name() + "_scale");
-      this->m_weights[0]->set_initializer(init);
-      this->m_weights[0]->set_optimizer(opt);
+      this->m_weights[0]->set_initializer(std::move(init));
+      this->m_weights[0]->set_optimizer(std::move(opt));
       this->m_model->add_weights(this->m_weights[0]);
     }
     if (this->m_weights[1] == nullptr) {
       this->m_weights[1] = new weights(get_comm());
-      std::unique_ptr<weights_initializer> init(new constant_initializer(DataType(0)));
+      auto init = make_unique<constant_initializer>(DataType(0));
       std::unique_ptr<optimizer> opt(m_model->create_optimizer());
       this->m_weights[1]->set_name(get_name() + "_bias");
-      this->m_weights[1]->set_initializer(init);
-      this->m_weights[1]->set_optimizer(opt);
+      this->m_weights[1]->set_initializer(std::move(init));
+      this->m_weights[1]->set_optimizer(std::move(opt));
       this->m_model->add_weights(this->m_weights[1]);
     }
     if (this->m_weights[2] == nullptr) {
       this->m_weights[2] = new weights(get_comm());
       this->m_weights[2]->set_name(get_name() + "_running_mean");
-      std::unique_ptr<weights_initializer> init(new constant_initializer(DataType(0)));
-      this->m_weights[2]->set_initializer(init);
+      auto init = make_unique<constant_initializer>(DataType(0));
+      this->m_weights[2]->set_initializer(std::move(init));
       this->m_model->add_weights(this->m_weights[2]);
     }
     if (this->m_weights[3] == nullptr) {
       this->m_weights[3] = new weights(get_comm());
       this->m_weights[3]->set_name(get_name() + "_running_variance");
-      std::unique_ptr<weights_initializer> init(new constant_initializer(DataType(1)));
-      this->m_weights[3]->set_initializer(init);
+      auto init = make_unique<constant_initializer>(DataType(1));
+      this->m_weights[3]->set_initializer(std::move(init));
       this->m_model->add_weights(this->m_weights[3]);
     }
diff --git a/include/lbann/layers/transform/weights.hpp b/include/lbann/layers/transform/weights.hpp
index f6d74931347..bdf0de3a5e4 100644
--- a/include/lbann/layers/transform/weights.hpp
+++ b/include/lbann/layers/transform/weights.hpp
@@ -125,11 +125,11 @@ class weights_layer : public transform_layer {
     auto& w = this->m_weights[0];
     if (w == nullptr) {
       w = new weights(get_comm());
-      std::unique_ptr<weights_initializer> init(new constant_initializer(DataType(0)));
+      auto init = make_unique<constant_initializer>(DataType(0));
       std::unique_ptr<optimizer> opt(m_model->create_optimizer());
       w->set_name(get_name() + "_weights");
-      w->set_initializer(init);
-      w->set_optimizer(opt);
+      w->set_initializer(std::move(init));
+      w->set_optimizer(std::move(opt));
w->set_optimizer(std::move(opt)); this->m_model->add_weights(w); } diff --git a/include/lbann/weights/weights.hpp b/include/lbann/weights/weights.hpp index 1fae5f54e35..24ace95c8a4 100644 --- a/include/lbann/weights/weights.hpp +++ b/include/lbann/weights/weights.hpp @@ -141,7 +141,7 @@ class weights { /** Set weights initializer. * The contents of 'init' are moved to a class member. */ - void set_initializer(std::unique_ptr& init); + void set_initializer(std::unique_ptr&& init); // ----------------------------------------------- // Optimizer accessors @@ -157,7 +157,7 @@ class weights { /** Set weights optimizer. * The contents of opt are moved to a class member. */ - void set_optimizer(std::unique_ptr& opt); + void set_optimizer(std::unique_ptr&& opt); // ----------------------------------------------- // Matrix distribution accessors diff --git a/src/weights/weights.cpp b/src/weights/weights.cpp index 43c52c60a2f..21d0d81612d 100644 --- a/src/weights/weights.cpp +++ b/src/weights/weights.cpp @@ -221,7 +221,7 @@ weights_initializer* weights::get_initializer() { const weights_initializer* weights::get_initializer() const { return m_initializer.get(); } -void weights::set_initializer(std::unique_ptr& init) { +void weights::set_initializer(std::unique_ptr&& init) { m_initializer = std::move(init); } @@ -239,7 +239,7 @@ const optimizer* weights::get_optimizer() const { return m_optimizer.get(); } } -void weights::set_optimizer(std::unique_ptr& opt) { +void weights::set_optimizer(std::unique_ptr&& opt) { m_optimizer = std::move(opt); } From 46411d4cb6f290b3f8a1012eba61e6c417a6181c Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Wed, 31 Jul 2019 16:32:06 -0700 Subject: [PATCH 179/634] refactor weights_initializer construction to use a factory --- include/lbann/proto/factories.hpp | 7 +- include/lbann/weights/initializer.hpp | 11 ++ .../weights/variance_scaling_initializers.hpp | 7 ++ src/proto/factories/callback_factory.cpp | 1 - src/proto/factories/model_factory.cpp | 7 +- src/proto/factories/weights_factory.cpp | 107 +++++++++--------- src/weights/initializer.cpp | 51 +++++++++ src/weights/variance_scaling_initializers.cpp | 45 ++++++++ 8 files changed, 173 insertions(+), 63 deletions(-) diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index 4aa32a651d4..1b3b5a4913d 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -66,9 +66,10 @@ std::unique_ptr construct_layer( const lbann_data::Layer& proto_layer); /** Construct weights specified with prototext. */ -weights* construct_weights(lbann_comm* comm, - const lbann_data::Optimizer& proto_opt, - const lbann_data::Weights& proto_weights); +std::unique_ptr construct_weights( + lbann_comm* comm, + const lbann_data::Optimizer& proto_opt, + const lbann_data::Weights& proto_weights); /** Construct a callback specified with prototext. */ std::unique_ptr diff --git a/include/lbann/weights/initializer.hpp b/include/lbann/weights/initializer.hpp index 41fd8b9bf59..8f9e222fa54 100644 --- a/include/lbann/weights/initializer.hpp +++ b/include/lbann/weights/initializer.hpp @@ -30,6 +30,8 @@ #include "lbann/base.hpp" #include "lbann/utils/description.hpp" +#include + namespace lbann { /** @brief Scheme for initializing weight values. 
*/ @@ -139,6 +141,15 @@ class normal_initializer : public weights_initializer { }; +std::unique_ptr +build_constant_initializer_from_pbuf(google::protobuf::Message const& msg); +std::unique_ptr +build_value_initializer_from_pbuf(google::protobuf::Message const& msg); +std::unique_ptr +build_uniform_initializer_from_pbuf(google::protobuf::Message const& msg); +std::unique_ptr +build_normal_initializer_from_pbuf(google::protobuf::Message const& msg); + } // namespace lbann #endif // LBANN_WEIGHTS_INITIALIZER_HPP diff --git a/include/lbann/weights/variance_scaling_initializers.hpp b/include/lbann/weights/variance_scaling_initializers.hpp index 16a5359fbb6..11f17474cdd 100644 --- a/include/lbann/weights/variance_scaling_initializers.hpp +++ b/include/lbann/weights/variance_scaling_initializers.hpp @@ -109,6 +109,13 @@ class lecun_initializer : public variance_scaling_initializer { DataType get_variance(El::Int fan_in, El::Int fan_out) override; }; +std::unique_ptr +build_glorot_initializer_from_pbuf(google::protobuf::Message const& msg); +std::unique_ptr +build_he_initializer_from_pbuf(google::protobuf::Message const& msg); +std::unique_ptr +build_lecun_initializer_from_pbuf(google::protobuf::Message const& msg); + } // namespace lbann #endif // LBANN_WEIGHTS_VARIANCE_SCALING_INITIALIZER_HPP diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 541a26d4176..b7aac70d936 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -81,7 +81,6 @@ namespace lbann { namespace proto { namespace { - // Define the factory type. using factory_type = lbann::generic_factory< lbann::callback_base, diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp index 63abc320e5a..7b2e49c3397 100644 --- a/src/proto/factories/model_factory.cpp +++ b/src/proto/factories/model_factory.cpp @@ -260,9 +260,10 @@ model* construct_model(lbann_comm* comm, // Construct weights std::vector weights_list; for (int i=0; i #include namespace lbann { namespace proto { - namespace { -/** Construct a weights initialization specified with prototext. */ -weights_initializer* construct_initializer(const lbann_data::Weights& proto_weights) { - - // Constant initialization - if (proto_weights.has_constant_initializer()) { - const auto& params = proto_weights.constant_initializer(); - return new constant_initializer(params.value()); - } +// Define the factory type. 
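
Before the LBANN-specific factory type that the comment above introduces, a from-scratch miniature of the same shape may help: a string key maps to a builder function that consumes a message and returns a unique_ptr. Everything here (mini_factory, message, constant_init) is invented for illustration; LBANN's real generic_factory is a template parameterized on the builder signature.

#include <functional>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

struct initializer { virtual ~initializer() = default; };
struct constant_init : initializer {
  explicit constant_init(double v) : value(v) {}
  double value;
};

// Stand-in for a protobuf message; the real code keys on
// msg.GetDescriptor()->name() instead of a stored string.
struct message { std::string type_name; double value = 0.0; };

using builder_fn = std::function<std::unique_ptr<initializer>(const message&)>;

class mini_factory {
public:
  void register_builder(std::string key, builder_fn b) {
    m_builders[std::move(key)] = std::move(b);
  }
  std::unique_ptr<initializer> create_object(const message& msg) const {
    auto it = m_builders.find(msg.type_name);
    if (it == m_builders.end()) {
      throw std::runtime_error("unknown initializer: " + msg.type_name);
    }
    return it->second(msg);  // dispatch to the registered builder
  }
private:
  std::map<std::string, builder_fn> m_builders;
};

int main() {
  mini_factory f;
  f.register_builder("ConstantInitializer", [](const message& m) {
    return std::make_unique<constant_init>(m.value);
  });
  auto init = f.create_object(message{"ConstantInitializer", 1.0});
}

One consequence visible in the registrations that follow: several keys (e.g. GlorotNormalInitializer and GlorotUniformInitializer) can share a single builder, which then inspects the message itself to choose the distribution.
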
+using factory_type = lbann::generic_factory< + lbann::weights_initializer, + std::string, + generate_builder_type, + default_key_error_policy>; + +void register_default_builders(factory_type& factory) +{ + factory.register_builder("ConstantInitializer", build_constant_initializer_from_pbuf); + factory.register_builder("ValueInitializer", build_value_initializer_from_pbuf); + factory.register_builder("UniformInitializer", build_uniform_initializer_from_pbuf); + factory.register_builder("NormalInitializer", build_normal_initializer_from_pbuf); + factory.register_builder("GlorotNormalInitializer", build_glorot_initializer_from_pbuf); + factory.register_builder("GlorotUniformInitializer", build_glorot_initializer_from_pbuf); + factory.register_builder("HeNormalInitializer", build_he_initializer_from_pbuf); + factory.register_builder("HeUniformInitializer", build_he_initializer_from_pbuf); + factory.register_builder("LeCunNormalInitializer", build_lecun_initializer_from_pbuf); + factory.register_builder("LeCunUniformInitializer", build_lecun_initializer_from_pbuf); +} - // Value initialization - if (proto_weights.has_value_initializer()) { - const auto& params = proto_weights.value_initializer(); - return new value_initializer(parse_list(params.values())); - } +// Manage a global factory +struct factory_manager +{ + factory_type factory_; - // Random initialization - if (proto_weights.has_uniform_initializer()) { - const auto& params = proto_weights.uniform_initializer(); - const auto& min = params.min(); - const auto& max = params.max(); - if (min != 0.0 || max != 0.0) { - return new uniform_initializer(min, max); - } else { - return new uniform_initializer(); - } - } - if (proto_weights.has_normal_initializer()) { - const auto& params = proto_weights.normal_initializer(); - const auto& mean = params.mean(); - const auto& standard_deviation = params.standard_deviation(); - if (mean != 0.0 || standard_deviation != 0.0) { - return new normal_initializer(mean, standard_deviation); - } else { - return new normal_initializer(); + factory_manager() { + register_default_builders(factory_); } - } +}; - // Variance scaling initialization - if (proto_weights.has_glorot_normal_initializer()) { - return new glorot_initializer(probability_distribution::gaussian); - } - if (proto_weights.has_glorot_uniform_initializer()) { - return new glorot_initializer(probability_distribution::uniform); - } - if (proto_weights.has_he_normal_initializer()) { - return new he_initializer(probability_distribution::gaussian); - } - if (proto_weights.has_he_uniform_initializer()) { - return new he_initializer(probability_distribution::uniform); - } +factory_manager factory_mgr_; +factory_type const& get_callback_factory() noexcept +{ + return factory_mgr_.factory_; +} - return nullptr; +/* Construct a weights initialization specified with prototext. 
*/ +std::unique_ptr +construct_initializer(const lbann_data::Weights& proto_weights) { + auto const& factory = get_callback_factory(); + auto const& msg = + helpers::get_oneof_message(proto_weights, "initializer_type"); + return factory.create_object(msg.GetDescriptor()->name(), msg); } } // namespace -weights* construct_weights(lbann_comm* comm, - const lbann_data::Optimizer& proto_opt, - const lbann_data::Weights& proto_weights) { +std::unique_ptr construct_weights( + lbann_comm* comm, + const lbann_data::Optimizer& proto_opt, + const lbann_data::Weights& proto_weights) { std::stringstream err; // Instantiate weights - weights* w = new weights(comm); + auto w = make_unique(comm); // Set weights name if provided const auto& name = proto_weights.name(); @@ -114,18 +110,17 @@ weights* construct_weights(lbann_comm* comm, } // Set weights initializer and optimizer - std::unique_ptr init(construct_initializer(proto_weights)); + auto init = construct_initializer(proto_weights); std::unique_ptr opt; if (proto_weights.has_optimizer()) { opt.reset(construct_optimizer(comm, proto_weights.optimizer())); } else { opt.reset(construct_optimizer(comm, proto_opt)); } - w->set_initializer(init); - w->set_optimizer(opt); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); return w; - } } // namespace proto diff --git a/src/weights/initializer.cpp b/src/weights/initializer.cpp index 539fb157d9b..f902d814c4a 100644 --- a/src/weights/initializer.cpp +++ b/src/weights/initializer.cpp @@ -25,9 +25,16 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/weights/initializer.hpp" + +#include "lbann/proto/proto_common.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/random.hpp" +#include + +#include + namespace lbann { description weights_initializer::get_description() const { @@ -115,4 +122,48 @@ void normal_initializer::fill(AbsDistMat& matrix) { m_mean, m_standard_deviation); } +// +// Builder functions +// + +std::unique_ptr +build_constant_initializer_from_pbuf(google::protobuf::Message const& msg) { + const auto& params = + dynamic_cast(msg); + return make_unique(params.value()); +} + +std::unique_ptr +build_value_initializer_from_pbuf(google::protobuf::Message const& msg) { + const auto& params = + dynamic_cast(msg); + return make_unique(parse_list(params.values())); +} + +std::unique_ptr +build_uniform_initializer_from_pbuf(google::protobuf::Message const& msg) { + const auto& params = + dynamic_cast(msg); + const auto& min = params.min(); + const auto& max = params.max(); + if (min != 0.0 || max != 0.0) { + return make_unique(min, max); + } else { + return make_unique(); + } +} + +std::unique_ptr +build_normal_initializer_from_pbuf(google::protobuf::Message const& msg) { + const auto& params = + dynamic_cast(msg); + const auto& mean = params.mean(); + const auto& standard_deviation = params.standard_deviation(); + if (mean != 0.0 || standard_deviation != 0.0) { + return make_unique(mean, standard_deviation); + } else { + return make_unique(); + } +} + } // namespace lbann diff --git a/src/weights/variance_scaling_initializers.cpp b/src/weights/variance_scaling_initializers.cpp index 5baad441146..36e41261b01 100644 --- a/src/weights/variance_scaling_initializers.cpp +++ b/src/weights/variance_scaling_initializers.cpp @@ -26,6 +26,9 @@ #include "lbann/weights/variance_scaling_initializers.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" + +#include namespace 
lbann { @@ -107,4 +110,46 @@ DataType lecun_initializer::get_variance(El::Int fan_in, El::Int fan_out) { return DataType(1) / fan_in; } +// +// Builder functions +// + +// FIXME (trb 07/31/2019): This is kinda ugly, but its fine if there +// are only 2 probability distributions +std::unique_ptr +build_glorot_initializer_from_pbuf(google::protobuf::Message const& msg) { + if (dynamic_cast(&msg)) + return make_unique(probability_distribution::gaussian); + else if (dynamic_cast(&msg)) + return make_unique(probability_distribution::uniform); + else { + LBANN_ERROR("build_glorot_initializer_from_pbuf: Bad message."); + return nullptr; + } +} + +std::unique_ptr +build_he_initializer_from_pbuf(google::protobuf::Message const& msg) { + if (dynamic_cast(&msg)) + return make_unique(probability_distribution::gaussian); + else if (dynamic_cast(&msg)) + return make_unique(probability_distribution::uniform); + else { + LBANN_ERROR("build_he_initializer_from_pbuf: Bad message."); + return nullptr; + } +} + +std::unique_ptr +build_lecun_initializer_from_pbuf(google::protobuf::Message const& msg) { + if (dynamic_cast(&msg)) + return make_unique(probability_distribution::gaussian); + else if (dynamic_cast(&msg)) + return make_unique(probability_distribution::uniform); + else { + LBANN_ERROR("build_lecun_initializer_from_pbuf: Bad message."); + return nullptr; + } +} + } // namespace lbann From 5604e3d2a57d29e4eb01b9f3dbae33d17222a299 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Wed, 31 Jul 2019 17:01:09 -0700 Subject: [PATCH 180/634] refactor optimizer_factory to use factories --- include/lbann/optimizers/adagrad.hpp | 4 + include/lbann/optimizers/adam.hpp | 4 + .../lbann/optimizers/hypergradient_adam.hpp | 4 + include/lbann/optimizers/rmsprop.hpp | 4 + include/lbann/optimizers/sgd.hpp | 4 + include/lbann/proto/factories.hpp | 5 +- src/optimizers/adagrad.cpp | 11 +++ src/optimizers/adam.cpp | 15 ++++ src/optimizers/hypergradient_adam.cpp | 16 ++++ src/optimizers/rmsprop.cpp | 14 +++ src/optimizers/sgd.cpp | 13 +++ src/proto/factories/model_factory.cpp | 5 +- src/proto/factories/optimizer_factory.cpp | 89 +++++++++---------- src/proto/factories/weights_factory.cpp | 4 +- 14 files changed, 140 insertions(+), 52 deletions(-) diff --git a/include/lbann/optimizers/adagrad.hpp b/include/lbann/optimizers/adagrad.hpp index 9a5cc8adbe6..d4fdc8f6a6d 100644 --- a/include/lbann/optimizers/adagrad.hpp +++ b/include/lbann/optimizers/adagrad.hpp @@ -85,6 +85,10 @@ class adagrad : public optimizer { }; +std::unique_ptr +build_adagrad_optimizer_from_pbuf( + google::protobuf::Message const&, lbann_comm*); + } // namespace lbann #endif // LBANN_OPTIMIZERS_ADAGRAD_HPP_INCLUDED diff --git a/include/lbann/optimizers/adam.hpp b/include/lbann/optimizers/adam.hpp index 020909aeaa1..a1e5b109742 100644 --- a/include/lbann/optimizers/adam.hpp +++ b/include/lbann/optimizers/adam.hpp @@ -208,6 +208,10 @@ class adam : public optimizer { }; +std::unique_ptr +build_adam_optimizer_from_pbuf( + google::protobuf::Message const&, lbann_comm*); + } // namespace lbann #endif // LBANN_OPTIMIZERS_ADAM_HPP_INCLUDED diff --git a/include/lbann/optimizers/hypergradient_adam.hpp b/include/lbann/optimizers/hypergradient_adam.hpp index b0d362ad02e..c0a14a2a412 100644 --- a/include/lbann/optimizers/hypergradient_adam.hpp +++ b/include/lbann/optimizers/hypergradient_adam.hpp @@ -161,6 +161,10 @@ class hypergradient_adam : public optimizer { }; +std::unique_ptr +build_hypergradient_adam_optimizer_from_pbuf( + google::protobuf::Message const&, 
lbann_comm*); + } // namespace lbann #endif // LBANN_OPTIMIZER_HYPERGRADIENT_ADAM_HPP_INCLUDED diff --git a/include/lbann/optimizers/rmsprop.hpp b/include/lbann/optimizers/rmsprop.hpp index a8debaa076c..efc6c0db0d0 100644 --- a/include/lbann/optimizers/rmsprop.hpp +++ b/include/lbann/optimizers/rmsprop.hpp @@ -112,6 +112,10 @@ class rmsprop : public optimizer { }; +std::unique_ptr +build_rmsprop_optimizer_from_pbuf( + google::protobuf::Message const&, lbann_comm*); + } // namespace lbann #endif // LBANN_OPTIMIZERS_RMSPROP_HPP_INCLUDED diff --git a/include/lbann/optimizers/sgd.hpp b/include/lbann/optimizers/sgd.hpp index 2d59b8c2ffe..95e0e35cd2b 100644 --- a/include/lbann/optimizers/sgd.hpp +++ b/include/lbann/optimizers/sgd.hpp @@ -154,6 +154,10 @@ class sgd : public optimizer { }; +std::unique_ptr +build_sgd_optimizer_from_pbuf( + google::protobuf::Message const&, lbann_comm*); + } // namespace lbann #endif // LBANN_OPTIMIZERS_SGD_HPP_INCLUDED diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index 1b3b5a4913d..a73563eb168 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -84,8 +84,9 @@ lbann_summary* construct_summarizer(lbann_comm* comm, const lbann_data::Model& m); /** Construct an optimizer specified with prototext. */ -optimizer* construct_optimizer(lbann_comm* comm, - const lbann_data::Optimizer& proto_opt); +std::unique_ptr construct_optimizer( + lbann_comm* comm, + const lbann_data::Optimizer& proto_opt); /** Construct an objective function specified with prototext. */ objective_function* construct_objective_function(const lbann_data::ObjectiveFunction& proto_obj); diff --git a/src/optimizers/adagrad.cpp b/src/optimizers/adagrad.cpp index ab17e618b67..499a1d14c01 100644 --- a/src/optimizers/adagrad.cpp +++ b/src/optimizers/adagrad.cpp @@ -26,6 +26,9 @@ #include "lbann/optimizers/adagrad.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" + +#include namespace lbann { @@ -142,4 +145,12 @@ bool adagrad::load_from_checkpoint_distributed(persist& p, std::string name_pref return true; } +std::unique_ptr +build_adagrad_optimizer_from_pbuf( + google::protobuf::Message const& msg, lbann_comm* comm) { + const auto& params = + dynamic_cast(msg); + return make_unique(comm, params.learn_rate(), params.eps()); +} + } // namespace lbann diff --git a/src/optimizers/adam.cpp b/src/optimizers/adam.cpp index 2b01c384e92..89bcb4ece69 100644 --- a/src/optimizers/adam.cpp +++ b/src/optimizers/adam.cpp @@ -26,6 +26,9 @@ #include "lbann/optimizers/adam.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" + +#include namespace lbann { @@ -245,4 +248,16 @@ bool adam::load_from_checkpoint_distributed(persist& p, std::string name_prefix) return true; } +std::unique_ptr +build_adam_optimizer_from_pbuf( + google::protobuf::Message const& msg, lbann_comm* comm) { + const auto& params = + dynamic_cast(msg); + return make_unique(comm, + params.learn_rate(), + params.beta1(), + params.beta2(), + params.eps()); +} + } // namespace lbann diff --git a/src/optimizers/hypergradient_adam.cpp b/src/optimizers/hypergradient_adam.cpp index b8afe018dce..445c13ae81b 100644 --- a/src/optimizers/hypergradient_adam.cpp +++ b/src/optimizers/hypergradient_adam.cpp @@ -26,6 +26,9 @@ #include "lbann/optimizers/hypergradient_adam.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" + +#include namespace lbann { @@ -221,4 +224,17 @@ bool 
hypergradient_adam::load_from_checkpoint_distributed(persist& p, std::strin return true; } +std::unique_ptr +build_hypergradient_adam_optimizer_from_pbuf( + google::protobuf::Message const& msg, lbann_comm* comm) { + const auto& params = + dynamic_cast(msg); + return make_unique(comm, + params.init_learning_rate(), + params.hyper_learning_rate(), + params.beta1(), + params.beta2(), + params.eps()); +} + } // namespace lbann diff --git a/src/optimizers/rmsprop.cpp b/src/optimizers/rmsprop.cpp index 23aa9193e6a..cc8f92ca7e3 100644 --- a/src/optimizers/rmsprop.cpp +++ b/src/optimizers/rmsprop.cpp @@ -26,6 +26,9 @@ #include "lbann/optimizers/rmsprop.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" + +#include namespace lbann { @@ -150,4 +153,15 @@ bool rmsprop::load_from_checkpoint_shared(persist& p, std::string name_prefix) { return true; } +std::unique_ptr +build_rmsprop_optimizer_from_pbuf( + google::protobuf::Message const& msg, lbann_comm* comm) { + const auto& params = + dynamic_cast(msg); + return make_unique(comm, + params.learn_rate(), + params.decay_rate(), + params.eps()); +} + } // namespace lbann diff --git a/src/optimizers/sgd.cpp b/src/optimizers/sgd.cpp index 89c47e5da65..f4df3e8baba 100644 --- a/src/optimizers/sgd.cpp +++ b/src/optimizers/sgd.cpp @@ -26,6 +26,9 @@ #include "lbann/optimizers/sgd.hpp" #include "lbann/utils/exception.hpp" +#include "lbann/utils/memory.hpp" + +#include namespace lbann { @@ -218,4 +221,14 @@ bool sgd::load_from_checkpoint_distributed(persist& p, std::string name_prefix) return true; } +std::unique_ptr +build_sgd_optimizer_from_pbuf( + google::protobuf::Message const& msg, lbann_comm* comm) { + const auto& params = dynamic_cast(msg); + return make_unique(comm, + params.learn_rate(), + params.momentum(), + params.nesterov()); +} + } // namespace lbann diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp index 7b2e49c3397..3e774562da0 100644 --- a/src/proto/factories/model_factory.cpp +++ b/src/proto/factories/model_factory.cpp @@ -49,13 +49,14 @@ model* instantiate_model(lbann_comm* comm, std::stringstream err; // Default optimizer - auto&& opt = construct_optimizer(comm, proto_opt); + auto opt = construct_optimizer(comm, proto_opt); // Construct model const auto& type = proto_model.type(); const auto& mini_batch_size = proto_model.mini_batch_size(); if (type.empty() || type == "directed_acyclic_graph_model") { - return new directed_acyclic_graph_model(comm, mini_batch_size, obj, opt); + return new directed_acyclic_graph_model( + comm, mini_batch_size, obj, opt.release()); } // Throw error if model type is not supported diff --git a/src/proto/factories/optimizer_factory.cpp b/src/proto/factories/optimizer_factory.cpp index 210ec1e50b8..17badda71bc 100644 --- a/src/proto/factories/optimizer_factory.cpp +++ b/src/proto/factories/optimizer_factory.cpp @@ -26,69 +26,66 @@ #include "lbann/proto/factories.hpp" +#include "lbann/optimizers/optimizer.hpp" + #include "lbann/optimizers/adagrad.hpp" #include "lbann/optimizers/adam.hpp" #include "lbann/optimizers/hypergradient_adam.hpp" -#include "lbann/optimizers/optimizer.hpp" #include "lbann/optimizers/rmsprop.hpp" #include "lbann/optimizers/sgd.hpp" +#include "lbann/proto/helpers.hpp" +#include "lbann/utils/factory.hpp" + #include namespace lbann { namespace proto { +namespace { -optimizer* construct_optimizer(lbann_comm* comm, - const lbann_data::Optimizer& proto_opt) { +using factory_type = lbann::generic_factory< + lbann::optimizer, + 
std::string, + generate_builder_type, + default_key_error_policy>; - // Stochastic gradient descent - if (proto_opt.has_sgd()) { - const auto& params = proto_opt.sgd(); - return new sgd(comm, - params.learn_rate(), - params.momentum(), - params.nesterov()); - } - - // AdaGrad - if (proto_opt.has_adagrad()) { - const auto& params = proto_opt.adagrad(); - return new adagrad(comm, params.learn_rate(), params.eps()); - } +void register_default_builders(factory_type& factory) +{ + factory.register_builder("AdaGrad", build_adagrad_optimizer_from_pbuf); + factory.register_builder("Adam", build_adam_optimizer_from_pbuf); + factory.register_builder("HypergradientAdam", + build_hypergradient_adam_optimizer_from_pbuf); + factory.register_builder("RMSprop", build_rmsprop_optimizer_from_pbuf); + factory.register_builder("SGD", build_sgd_optimizer_from_pbuf); +} - // RMSProp - if (proto_opt.has_rmsprop()) { - const auto& params = proto_opt.rmsprop(); - return new rmsprop(comm, - params.learn_rate(), - params.decay_rate(), - params.eps()); - } +// Manage a global factory +struct factory_manager +{ + factory_type factory_; - // Adam - if (proto_opt.has_adam()) { - const auto& params = proto_opt.adam(); - return new adam(comm, - params.learn_rate(), - params.beta1(), - params.beta2(), - params.eps()); - } + factory_manager() { + register_default_builders(factory_); + } +}; - // Hypergradient Adam - if (proto_opt.has_hypergradient_adam()) { - const auto& params = proto_opt.hypergradient_adam(); - return new hypergradient_adam(comm, - params.init_learning_rate(), - params.hyper_learning_rate(), - params.beta1(), - params.beta2(), - params.eps()); - } +factory_manager factory_mgr_; +factory_type const& get_optimizer_factory() noexcept +{ + return factory_mgr_.factory_; +} - // Return null pointer if no optimizer is specified - return nullptr; +}// namespace +std::unique_ptr construct_optimizer( + lbann_comm* comm, + const lbann_data::Optimizer& proto_opt) { + auto const& factory = get_optimizer_factory(); + auto const& msg = + helpers::get_oneof_message(proto_opt, "optimizer_type"); + return factory.create_object(msg.GetDescriptor()->name(), msg, comm); } } // namespace proto diff --git a/src/proto/factories/weights_factory.cpp b/src/proto/factories/weights_factory.cpp index e9dc1add41b..16522f9eb9b 100644 --- a/src/proto/factories/weights_factory.cpp +++ b/src/proto/factories/weights_factory.cpp @@ -113,9 +113,9 @@ std::unique_ptr construct_weights( auto init = construct_initializer(proto_weights); std::unique_ptr opt; if (proto_weights.has_optimizer()) { - opt.reset(construct_optimizer(comm, proto_weights.optimizer())); + opt = construct_optimizer(comm, proto_weights.optimizer()); } else { - opt.reset(construct_optimizer(comm, proto_opt)); + opt = construct_optimizer(comm, proto_opt); } w->set_initializer(std::move(init)); w->set_optimizer(std::move(opt)); From a076fb5ee34768b90bb1f187342a6dc0ca6c5cd8 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
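
The optimizer builders registered in the patch above all follow the same three-step shape: downcast the generic protobuf message to the concrete parameter type, read its fields, construct. A standalone sketch of that shape, with an extra context argument threaded through the way lbann_comm* is in the real code (all types here are stand-ins, not LBANN's):

#include <memory>

struct message { virtual ~message() = default; };  // stand-in for google::protobuf::Message
struct sgd_params : message {
  double learn_rate = 0.01;
  double momentum = 0.9;
  bool nesterov = false;
};

struct comm {};  // stand-in for lbann_comm

struct optimizer { virtual ~optimizer() = default; };
struct sgd : optimizer {
  sgd(comm*, double, double, bool) {}
};

std::unique_ptr<optimizer>
build_sgd_from_msg(const message& msg, comm* c) {
  // A reference dynamic_cast throws std::bad_cast if the factory ever routes
  // the wrong message type here, surfacing registration bugs immediately.
  const auto& params = dynamic_cast<const sgd_params&>(msg);
  return std::make_unique<sgd>(c, params.learn_rate, params.momentum,
                               params.nesterov);
}
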
Benson" Date: Wed, 31 Jul 2019 17:02:09 -0700 Subject: [PATCH 181/634] fix copy-paste error --- src/proto/factories/weights_factory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/proto/factories/weights_factory.cpp b/src/proto/factories/weights_factory.cpp index e9dc1add41b..f2605da8841 100644 --- a/src/proto/factories/weights_factory.cpp +++ b/src/proto/factories/weights_factory.cpp @@ -72,7 +72,7 @@ struct factory_manager }; factory_manager factory_mgr_; -factory_type const& get_callback_factory() noexcept +factory_type const& get_weight_initializer_factory() noexcept { return factory_mgr_.factory_; } @@ -80,7 +80,7 @@ factory_type const& get_callback_factory() noexcept /* Construct a weights initialization specified with prototext. */ std::unique_ptr construct_initializer(const lbann_data::Weights& proto_weights) { - auto const& factory = get_callback_factory(); + auto const& factory = get_weight_initializer_factory(); auto const& msg = helpers::get_oneof_message(proto_weights, "initializer_type"); return factory.create_object(msg.GetDescriptor()->name(), msg); From 3ca896fae251125d148dfce638424666e7afa6a9 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Thu, 1 Aug 2019 09:19:42 -0700 Subject: [PATCH 182/634] mostly: changes from int to size_t for sample sizes --- .../lbann/data_store/data_store_conduit.hpp | 20 ++++---- src/data_store/data_store_conduit.cpp | 46 +++++++++---------- 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index a3ba48cb151..67ce8b33abc 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -237,10 +237,8 @@ protected : std::vector> m_send_requests; std::vector> m_recv_requests; std::vector m_recv_buffer; - std::vector m_recv_buffer_sample_sizes; - std::vector m_send_buffer_sample_sizes; - std::vector m_outgoing_msg_sizes; - std::vector m_incoming_msg_sizes; + std::vector m_outgoing_msg_sizes; + std::vector m_incoming_msg_sizes; /// used in exchange_data_by_super_node(); contains the super_nodes, /// after they have been converted from compacted format @@ -278,32 +276,32 @@ protected : void error_check_compacted_node(const conduit::Node &nd, int data_id); /// for use when conduit Nodes have non-uniform size, e.g, imagenet - std::unordered_map m_sample_sizes; + std::unordered_map m_sample_sizes; /// used in set_conduit_node(...) std::mutex m_mutex; /// Currently only used for imagenet. 
On return, 'sizes' maps a sample_id to image size, and indices[p] contains the sample_ids that P_p owns /// for use in local cache mode - void get_image_sizes(std::unordered_map &sizes, std::vector> &indices); + void get_image_sizes(std::unordered_map &sizes, std::vector> &indices); /// offset at which the raw image will be stored in a shared memory segment; /// for use in local cache mode; maps data_id to offset std::unordered_map m_image_offsets; /// fills in m_image_offsets for use in local cache mode - void compute_image_offsets(std::unordered_map &sizes, std::vector> &indices); + void compute_image_offsets(std::unordered_map &sizes, std::vector> &indices); /// for use in local cache mode - void allocate_shared_segment(std::unordered_map &sizes, std::vector> &indices); + void allocate_shared_segment(std::unordered_map &sizes, std::vector> &indices); /// for use in local cache mode - void read_files(std::vector &work, std::unordered_map &sizes, std::vector &indices); + void read_files(std::vector &work, std::unordered_map &sizes, std::vector &indices); /// for use in local cache mode - void build_conduit_nodes(std::unordered_map &sizes); + void build_conduit_nodes(std::unordered_map &sizes); /// for use in local cache mode - void exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices); + void exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices); /// for use in local cache mode void fillin_shared_images(const std::vector &images, size_t offset); diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 827bee27ea3..8a789eab464 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -201,7 +201,7 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: if (!m_super_node) { /// Repack the nodes because they don't seem to copy correctly - build_node_for_sending(rhs.m_data[i]["data"], m_data[i]); + compact_nodes(); } else { m_data[i] = rhs.m_data[i]; } @@ -629,7 +629,7 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s LBANN_ERROR("data_id: " + std::to_string(index) + " does not have a valid contiguous data pointer"); } - int sz = m_compacted_sample_size; + size_t sz = m_compacted_sample_size; if (m_node_sizes_vary) { if (m_sample_sizes.find(index) == m_sample_sizes.end()) { @@ -1031,20 +1031,20 @@ void data_store_conduit::exchange_sample_sizes() { } } - std::vector my_sizes(m_sample_sizes.size()*2); + std::vector my_sizes(m_sample_sizes.size()*2); size_t j = 0; for (auto t : m_sample_sizes) { my_sizes[j++] = t.first; my_sizes[j++] = t.second; } - std::vector other_sizes; + std::vector other_sizes; for (int k=0; kbroadcast(k, my_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); + m_comm->broadcast(k, my_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); } else { - m_comm->broadcast(k, other_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); + m_comm->broadcast(k, other_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); for (size_t i=0; i &file_sizes, std::vector> &indices) { +void data_store_conduit::get_image_sizes(std::unordered_map &file_sizes, std::vector> &indices) { /// this block fires if image sizes have been precomputed if (options::get()->has_string("image_sizes_filename")) { LBANN_ERROR("not yet implemented"); @@ -1077,7 +1077,7 @@ void data_store_conduit::get_image_sizes(std::unordered_map &file_sizes const std::vector &image_list = 
image_reader->get_image_list(); // get sizes of files for which I'm responsible - std::vector my_image_sizes; + std::vector my_image_sizes; for (size_t h=m_rank_in_trainer; hsize(); h += m_np_in_trainer) { const std::string fn = m_reader->get_file_dir() + '/' + image_list[(*m_shuffled_indices)[h]].first; std::ifstream in(fn.c_str()); @@ -1094,8 +1094,8 @@ void data_store_conduit::get_image_sizes(std::unordered_map &file_sizes std::vector counts(m_np_in_trainer); m_comm->all_gather(&my_count, 1, counts.data(), 1, m_comm->get_trainer_comm()); - //counts[h*2] contains the image index - //counts[h*2+1] contains the image sizee + //my_image_sizes[h*2] contains the image index + //my_image_sizes[h*2+1] contains the image size //fill in displacement vector for gathering the actual image sizes std::vector disp(m_np_in_trainer + 1); @@ -1104,16 +1104,16 @@ void data_store_conduit::get_image_sizes(std::unordered_map &file_sizes disp[h+1] = disp[h] + counts[h]; } - std::vector work(image_list.size()*2); - m_comm->trainer_all_gather(my_image_sizes, work, counts, disp); + std::vector work(image_list.size()*2); + m_comm->trainer_all_gather(my_image_sizes, work, counts, disp); indices.resize(m_np_in_trainer); for (int h=0; h &file_sizes } } -void data_store_conduit::compute_image_offsets(std::unordered_map &sizes, std::vector> &indices) { +void data_store_conduit::compute_image_offsets(std::unordered_map &sizes, std::vector> &indices) { size_t offset = 0; for (size_t p=0; p &size } -void data_store_conduit::allocate_shared_segment(std::unordered_map &sizes, std::vector> &indices) { +void data_store_conduit::allocate_shared_segment(std::unordered_map &sizes, std::vector> &indices) { off_t size = 0; - for (auto &&t : sizes) { size += t.second; } @@ -1224,7 +1223,7 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map &si } void data_store_conduit::preload_local_cache() { - std::unordered_map file_sizes; + std::unordered_map file_sizes; std::vector> indices; double tm1 = get_time(); @@ -1263,7 +1262,7 @@ void data_store_conduit::preload_local_cache() { if (m_world_master) std::cerr << " build_conduit_nodes time: " << (get_time()-tm1) << std::endl; } -void data_store_conduit::read_files(std::vector &work, std::unordered_map &sizes, std::vector &indices) { +void data_store_conduit::read_files(std::vector &work, std::unordered_map &sizes, std::vector &indices) { //reserve space for reading this proc's files into a contiguous memory space size_t n = 0; @@ -1285,7 +1284,7 @@ void data_store_conduit::read_files(std::vector &work, std::unordered_map< if (m_world_master) std::cerr << " my num files: " << indices.size() << std::endl; for (size_t j=0; jget_file_dir() + '/' + image_list[idx].first; std::ifstream in(fn, std::ios::in | std::ios::binary); in.read(work.data()+offset, s); @@ -1295,7 +1294,7 @@ void data_store_conduit::read_files(std::vector &work, std::unordered_map< if (m_world_master) std::cout << " finished reading files\n"; } -void data_store_conduit::build_conduit_nodes(std::unordered_map &sizes) { +void data_store_conduit::build_conduit_nodes(std::unordered_map &sizes) { image_data_reader *image_reader = dynamic_cast(m_reader); const std::vector &image_list = image_reader->get_image_list(); for (size_t idx=0; idx &images, s memcpy(m_mem_seg+offset, reinterpret_cast(images.data()), images.size()); } -void data_store_conduit::exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices) {
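
A note on the int-to-size_t change running through this patch: the usual motivation, and presumably the one behind the commit subject, is that per-sample byte counts get summed into offsets into a shared segment, and the sum of even a few thousand multi-megabyte image files overflows a 32-bit int long before memory is exhausted. A self-contained illustration, with invented sizes:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // 2000 samples of ~1.5 MB each, as a node-local image cache might hold
  std::vector<std::size_t> sample_sizes(2000, 1500000);
  std::size_t total = 0;
  for (auto s : sample_sizes) {
    total += s;  // running offset into the shared memory segment
  }
  // total is 3,000,000,000 bytes; an int accumulator would already have
  // overflowed, since INT32_MAX is 2,147,483,647.
  std::cout << "total bytes: " << total << '\n';
}

+void data_store_conduit::exchange_images(std::vector &work, 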
std::unordered_map &image_sizes, std::vector> &indices) { std::vector work2; int node_rank = m_comm->get_rank_in_node(); size_t offset = 0; @@ -1339,7 +1338,6 @@ void data_store_conduit::exchange_images(std::vector &work, std::unordered for (size_t r=0; rbarrier(m_comm->get_node_comm()); From e960e0a3e8838c969ea3da4a724bd07b58971012 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 30 Jul 2019 14:06:19 -0700 Subject: [PATCH 183/634] Test validation accuracy --- bamboo/integration_tests/common_code.py | 11 +++++++++++ .../catalyst/clang6/expected_performance.csv | 4 ++-- .../catalyst/gcc7/expected_performance.csv | 4 ++-- .../corona/gcc7/expected_performance.csv | 2 +- .../lassen/gcc7/expected_performance.csv | 2 +- .../pascal/gcc7/expected_performance.csv | 2 +- 6 files changed, 18 insertions(+), 7 deletions(-) diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index da6c0beb39a..be5b5c6449c 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -191,10 +191,21 @@ def extract_data(output_file_name, data_fields, should_log): print('extract_data: stdev={sv}'.format(sv=stdev_value)) data_dict[data_field][model_id][epoch_id] = stdev_value + # This will re-populate the value for 'test_accuracy' + # on each epoch, thus keeping the final value. + # Just keep the data_field as 'test_accuracy' so we don't have + # to update code and csv files to include 'validation_accuracy'. + regex = 'validation categorical accuracy : ([0-9.]+)' + data_field = 'test_accuracy' + populate_data_dict_overall(regex, line, data_field, data_fields, + data_dict, model_id) + + # Overwrite accuracy from validation if we have test accuracy. regex = 'test categorical accuracy : ([0-9.]+)' data_field = 'test_accuracy' populate_data_dict_overall(regex, line, data_field, data_fields, data_dict, model_id) + output_file.close() if should_log: print('extract_data: Extracted Data below:') diff --git a/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv index d7db441ade5..c767d373880 100644 --- a/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 117.00, 2.80, 9.00, 1.20, 2.00, 100.00 -alexnet_weekly, 490.00, 1.00, 3.00, 0.60, 0.50, 2.00 +alexnet_nightly, 117.00, 2.80, 9.00, 1.20, 2.00, 0.00 +alexnet_weekly, 490.00, 1.00, 3.00, 0.60, 0.50, 100.00 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 lenet_mnist, 100.00, 0.12, 0.40, 0.10, 0.09, 98.40 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv index 6b4eee1703b..258fa233523 100644 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 65.00, 1.50, 8.30, 0.37, 1.70, 100.00 -alexnet_weekly, 360.00, 0.90, 4.00, 0.40, 0.70, 3.00 +alexnet_nightly, 65.00, 1.50, 8.30, 0.37, 1.70, 0.1 +alexnet_weekly, 360.00, 0.90, 4.00, 0.40, 0.70, 100.00 cache_alexnet, 
0.00, 0.00, 0.00, 0.00, 0.00, 100.00 lenet_mnist, 137.00, 0.18, 0.40, 0.15, 0.04, 98.92 diff --git a/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv index 42b575664c9..e36bf374407 100644 --- a/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 55.00, 1.03, 1.90, 0.80, 0.21, 100.00 +alexnet_nightly, 55.00, 1.03, 1.90, 0.80, 0.21, 0.00 alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 lenet_mnist, 385.00, 0.50, 2.00, 0.51, 0.80, 98.40 diff --git a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv index 09dca6d2de5..9b46fecbd43 100644 --- a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 23.00, 0.70, 10.30, 0.10, 1.20, 100.00 +alexnet_nightly, 23.00, 0.70, 10.30, 0.10, 1.20, 0.00 alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 lenet_mnist, 10.10, 0.06, 5.30, 0.01, 0.60, 98.30 diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv index 12770f3b9fc..4285d808302 100644 --- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 51.00, 1.20, 4.00, 0.50, 0.40, 0.17 +alexnet_nightly, 51.00, 1.20, 4.00, 0.50, 0.40, 100.00 alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 lenet_mnist, 12.00, 0.04, 6.00, 0.01, 0.40, 98.40 From 8ffc5545c1731ca451deaaf04af19cabc57550d0 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Thu, 1 Aug 2019 10:53:56 -0700 Subject: [PATCH 184/634] update protobuf to more gracefully handle weights initializers This also update python FE to use these changes --- python/lbann/util/class_generator.py | 8 ++++++-- python/lbann/weights.py | 24 ++++++++--------------- src/proto/factories/weights_factory.cpp | 26 +++++++++++++------------ src/proto/weights.proto | 4 +++- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/python/lbann/util/class_generator.py b/python/lbann/util/class_generator.py index b52580af589..f1c32e0b5bc 100644 --- a/python/lbann/util/class_generator.py +++ b/python/lbann/util/class_generator.py @@ -1,6 +1,6 @@ """Utility functions to generate classes from Protobuf messages.""" import google.protobuf.descriptor -from lbann import lbann_pb2 +from lbann import lbann_pb2, callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, weights_pb2 # Map from Protobuf label enums to strings _proto_label_to_str = { @@ -96,7 +96,11 @@ def export_proto(self): message = getattr(proto, base_field_name) message.SetInParent() else: - proto = getattr(lbann_pb2, message_name)() + proto_modules = set([callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, weights_pb2]) + proto_type = None + while proto_type is None: + proto_type = getattr(proto_modules.pop(), message_name, None) + proto = proto_type() message = proto # Set message diff --git a/python/lbann/weights.py b/python/lbann/weights.py index 301f7bd00a5..3fe24af164a 100644 --- a/python/lbann/weights.py +++ b/python/lbann/weights.py @@ -6,15 +6,14 @@ class Initializer(abc.ABC): """Initialization scheme for `Weights`.""" def export_proto(self): - pass + """Construct and return a protobuf message.""" + return weights_pb2.Initializer() -# Generate Initializer sub-classes from lbann.proto. -# Note: The list of skip fields must be updated if any new fields are -# added to the Weights message in lbann.proto +# Generate Initializer sub-classes from weights.proto. classes = lbann.util.class_generator.generate_classes_from_protobuf_message( - weights_pb2.Weights, - skip_fields = set(['name', 'optimizer']), - base_class = Initializer) + weights_pb2.Initializer, + base_class = Initializer, + base_has_export_proto = True) for c in classes: globals()[c.__name__] = c @@ -36,15 +35,8 @@ def export_proto(self): # Set initializer if needed if self.initializer: - type_name = type(self.initializer).__name__ - field_name = None - for field in weights_pb2.Weights.DESCRIPTOR.fields: - if field.message_type and field.message_type.name == type_name: - field_name = field.name - break - init_message = getattr(proto, field_name) - init_message.CopyFrom(self.initializer.export_proto()) - init_message.SetInParent() + proto.initializer.CopyFrom(self.initializer.export_proto()) + proto.initializer.SetInParent() # Set optimizer if needed if self.optimizer: diff --git a/src/proto/factories/weights_factory.cpp b/src/proto/factories/weights_factory.cpp index 860f2d8bc62..0b99c9908f6 100644 --- a/src/proto/factories/weights_factory.cpp +++ b/src/proto/factories/weights_factory.cpp @@ -40,21 +40,23 @@ namespace { /** Construct a weights initialization specified with prototext. 
*/ weights_initializer* construct_initializer(const lbann_data::Weights& proto_weights) { + auto const& proto_init = proto_weights.initializer(); + // Constant initialization - if (proto_weights.has_constant_initializer()) { - const auto& params = proto_weights.constant_initializer(); + if (proto_init.has_constant_initializer()) { + const auto& params = proto_init.constant_initializer(); return new constant_initializer(params.value()); } // Value initialization - if (proto_weights.has_value_initializer()) { - const auto& params = proto_weights.value_initializer(); + if (proto_init.has_value_initializer()) { + const auto& params = proto_init.value_initializer(); return new value_initializer(parse_list(params.values())); } // Random initialization - if (proto_weights.has_uniform_initializer()) { - const auto& params = proto_weights.uniform_initializer(); + if (proto_init.has_uniform_initializer()) { + const auto& params = proto_init.uniform_initializer(); const auto& min = params.min(); const auto& max = params.max(); if (min != 0.0 || max != 0.0) { @@ -63,8 +65,8 @@ weights_initializer* construct_initializer(const lbann_data::Weights& proto_weig return new uniform_initializer(); } } - if (proto_weights.has_normal_initializer()) { - const auto& params = proto_weights.normal_initializer(); + if (proto_init.has_normal_initializer()) { + const auto& params = proto_init.normal_initializer(); const auto& mean = params.mean(); const auto& standard_deviation = params.standard_deviation(); if (mean != 0.0 || standard_deviation != 0.0) { @@ -75,16 +77,16 @@ weights_initializer* construct_initializer(const lbann_data::Weights& proto_weig } // Variance scaling initialization - if (proto_weights.has_glorot_normal_initializer()) { + if (proto_init.has_glorot_normal_initializer()) { return new glorot_initializer(probability_distribution::gaussian); } - if (proto_weights.has_glorot_uniform_initializer()) { + if (proto_init.has_glorot_uniform_initializer()) { return new glorot_initializer(probability_distribution::uniform); } - if (proto_weights.has_he_normal_initializer()) { + if (proto_init.has_he_normal_initializer()) { return new he_initializer(probability_distribution::gaussian); } - if (proto_weights.has_he_uniform_initializer()) { + if (proto_init.has_he_uniform_initializer()) { return new he_initializer(probability_distribution::uniform); } diff --git a/src/proto/weights.proto b/src/proto/weights.proto index 2fe3e46ad2f..05617731318 100644 --- a/src/proto/weights.proto +++ b/src/proto/weights.proto @@ -31,10 +31,12 @@ import "optimizers.proto"; package lbann_data; message Weights { - string name = 1; Optimizer optimizer = 2; + Initializer initializer = 3; +} +message Initializer { oneof initializer_type { ConstantInitializer constant_initializer = 20; ValueInitializer value_initializer = 21; From 146bd869786f4fe27b0f6811193b9828b2bff3cd Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Thu, 1 Aug 2019 11:02:21 -0700 Subject: [PATCH 185/634] remove deprecated directory with timmoon10's blessing --- model_zoo/models/vram/.gitignore | 6 - model_zoo/models/vram/dram_template.prototext | 24 - model_zoo/models/vram/generate_dram.py | 419 ------------------ 3 files changed, 449 deletions(-) delete mode 100644 model_zoo/models/vram/.gitignore delete mode 100644 model_zoo/models/vram/dram_template.prototext delete mode 100755 model_zoo/models/vram/generate_dram.py diff --git a/model_zoo/models/vram/.gitignore b/model_zoo/models/vram/.gitignore deleted file mode 100644 index a0ac97666ed..00000000000 --- a/model_zoo/models/vram/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -lbann_pb2.py -lbann_pb2.pyc -dram.prototext -vram.prototext -vram_template.prototext -generate_vram.py diff --git a/model_zoo/models/vram/dram_template.prototext b/model_zoo/models/vram/dram_template.prototext deleted file mode 100644 index 62b6f66de05..00000000000 --- a/model_zoo/models/vram/dram_template.prototext +++ /dev/null @@ -1,24 +0,0 @@ -model { - data_layout: "data_parallel" - mini_batch_size: 256 - block_size: 256 - num_epochs: 20 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - objective_function { - l2_weight_regularization { - scale_factor: 0.0005 - } - } - - ################################################### - # Callbacks - ################################################### - callback { print {} } - callback { timer {} } - -} diff --git a/model_zoo/models/vram/generate_dram.py b/model_zoo/models/vram/generate_dram.py deleted file mode 100755 index 9dd68656da4..00000000000 --- a/model_zoo/models/vram/generate_dram.py +++ /dev/null @@ -1,419 +0,0 @@ -#!/usr/bin/env python -import sys -import os -import subprocess -import functools -import collections - -# Parameters -lbann_dir = subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).strip() -lbann_proto_dir = lbann_dir + "/src/proto/" -work_dir = lbann_dir + "/model_zoo/models/vram" -template_proto = lbann_dir + "/model_zoo/models/vram/dram_template.prototext" -output_proto = lbann_dir + "/model_zoo/models/vram/dram.prototext" - -# Convert a list into a space-separated string -def str_list(l): - if isinstance(l, str): - return l - else: - return " ".join(str(i) for i in l) - -# Construct a new layer and add it to the model -def new_layer(model, name, parents, layer_type, device = "", weights = []): - if not isinstance(parents, collections.Iterable): - return new_layer(model, name, [parents], layer_type, device, weights) - if not isinstance(weights, collections.Iterable): - return new_layer(model, name, parents, layer_type, device, [weights]) - l = model.layer.add() - l.name = name - l.parents = str_list(map(lambda l : l.name, parents)) - exec("l." + layer_type + ".SetInParent()") - l.weights = str_list(map(lambda w : w.name, weights)) - l.device_allocation = device - return l - -# Construct a new set of weights and add it to the model -def new_weights(model, name, initializer = ""): - w = model.weights.add() - w.name = name - if initializer: - exec("w." 
+ initializer + ".SetInParent()") - return w - -class FullyConnectedCell: - - name = "" - size = 0 - model = None - has_bias = False - activation = None - weights = [] - step = -1 - - def __init__(self, name, size, model, - activation = None, initializer = "constant_initializer", has_bias = True): - self.name = name - self.size = size - self.model = model - self.has_bias = has_bias - self.activation = activation - - # Initialize weights - self.weights = [new_weights(model, name + "_linearity", initializer), - new_weights(model, name + "_bias", "constant_initializer")] - - def __call__(self, parent): - self.step += 1 - fc = new_layer(self.model, "%s_fc_step%d" % (self.name, self.step), - parent, "fully_connected", "" ,self.weights) - fc.fully_connected.num_neurons = self.size - fc.fully_connected.has_bias = self.has_bias - if self.activation: - act = new_layer(self.model, - "%s_step%d" % (self.name, self.step), - fc, self.activation) - return act - else: - fc.name = "%s_step%d" % (self.name, self.step) - return fc - -class ConvolutionCell: - - name = "" - num_output_channels = 0 - num_dims = 0 - conv_dim = 0 - conv_stride = 0 - conv_pad = 0 - model = None - has_bias = False - activation = None - weights = [] - step = -1 - - def __init__(self, name, num_output_channels, - num_dims, conv_dim, conv_stride, conv_pad, - model, - activation = None, - initializer = "constant_initializer", - has_bias = True): - self.name = name - self.num_output_channels = num_output_channels - self.num_dims = num_dims - self.conv_dim = conv_dim - self.conv_stride = conv_stride - self.conv_pad = conv_pad - self.model = model - self.has_bias = has_bias - self.activation = activation - - # Initialize weights - self.weights = [new_weights(model, name + "_kernel", initializer), - new_weights(model, name + "_bias", "constant_initializer")] - - def __call__(self, parent): - self.step += 1 - conv = new_layer(self.model, "%s_conv_step%d" % (self.name, self.step), - parent, "convolution", "", self.weights) - conv.convolution.num_output_channels = self.num_output_channels - conv.convolution.num_dims = self.num_dims - conv.convolution.conv_dims_i = self.conv_dim - conv.convolution.conv_strides_i = self.conv_stride - conv.convolution.conv_pads_i = self.conv_pad - conv.convolution.has_bias = self.has_bias - if self.activation: - act = new_layer(self.model, - "%s_step%d" % (self.name, self.step), - conv, self.activation) - return act - else: - conv.name = "%s_step%d" % (self.name, self.step) - return conv - -# Uses reLU activations -class LstmCell: - - name = "" - size = 0 - model = None - step = -1 - outputs = [] - cells = [] - - # Fully-connected layers - forget_fc = None - input_fc = None - output_fc = None - cell_fc = None - - def __init__(self, name, size, model): - self.name = name - self.size = size - self.model = model - - # Fully-connected layers - self.forget_gate = FullyConnectedCell(name + "_forget_gate_fc", size, model, - "sigmoid", "glorot_normal_initializer", True) - self.input_gate = FullyConnectedCell(name + "_input_gate_fc", size, model, - "sigmoid", "glorot_normal_initializer", True) - self.output_gate = FullyConnectedCell(name + "_output_gate_fc", size, model, - "sigmoid", "glorot_normal_initializer", True) - self.cell_update = FullyConnectedCell(name + "_cell_update_fc", size, model, - "relu", "he_normal_initializer", True) - - # Initial state - self.outputs = [new_layer(model, name + "_output_init", [], "constant")] - self.outputs[0].constant.num_neurons = str(size) - self.cells = [new_layer(model, name + 
"_cell_init", [], "constant")] - self.cells[0].constant.num_neurons = str(size) - - def __call__(self, parent): - self.step += 1 - - # LSTM input state is from parent layer and previous output - input_state = new_layer(self.model, - "%s_input_state_step%d" % (self.name, self.step), - [parent, self.outputs[-1]], - "concatenation") - - # Gating units - f = self.forget_gate(input_state) - i = self.input_gate(input_state) - o = self.output_gate(input_state) - - # Cell state - c = self.cell_update(input_state) - cell_forget = new_layer(self.model, - "%s_cell_forget_step%d" % (self.name, self.step), - [f, self.cells[-1]], "hadamard") - cell_input = new_layer(self.model, - "%s_cell_input_step%d" % (self.name, self.step), - [i, c], "hadamard") - self.cells.append(new_layer(self.model, - "%s_cell_step%d" % (self.name, self.step), - [cell_forget, cell_input], - "sum")) - - # Output - act = new_layer(self.model, - "%s_cell_activation_step%d" % (self.name, self.step), - self.cells[-1], "relu") - self.outputs.append(new_layer(self.model, - "%s_step%d" % (self.name, self.step), - [o, act], "hadamard")) - return self.outputs[-1] - -# Configure a prototext model (e.g. add layers) -def configure_model(model): - - # Model parameters - unroll_depth = 4 - image_dims = [3, 227, 227] - label_dims = [1000] - hidden_size = 128 # RNN state size - num_locs = 32 - - # Initialize input - data = new_layer(model, "data", [], "input", "cpu") - image = new_layer(model, "image", data, "split") - label = new_layer(model, "label", data, "split") - data.children = str_list([image.name, label.name]) - - # Initialize useful constants - zero1 = new_layer(model, "zero1", [], "constant", "cpu") - zero1.constant.value = 0.0 - zero1.constant.num_neurons = str_list([1]) - zero3 = new_layer(model, "zero3", [], "constant", "cpu") - zero3.constant.value = 0.0 - zero3.constant.num_neurons = str_list([3]) - one3 = new_layer(model, "one3", [], "constant", "cpu") - one3.constant.value = 1.0 - one3.constant.num_neurons = str_list([3]) - - # Glimpse network components - glimpse1_conv1 = ConvolutionCell("glimpse1_conv1", 32, 2, 3, 1, 1, - model, "relu", "he_normal_initializer") - glimpse1_conv2 = ConvolutionCell("glimpse1_conv2", 64, 2, 3, 1, 1, - model, "relu", "he_normal_initializer") - glimpse1_conv3 = ConvolutionCell("glimpse1_conv3", 128, 2, 3, 1, 1, - model, "relu", "he_normal_initializer") - glimpse2_conv1 = ConvolutionCell("glimpse2_conv1", 32, 2, 3, 1, 1, - model, "relu", "he_normal_initializer") - glimpse2_conv2 = ConvolutionCell("glimpse2_conv2", 64, 2, 3, 1, 1, - model, "relu", "he_normal_initializer") - glimpse2_conv3 = ConvolutionCell("glimpse2_conv3", 128, 2, 3, 1, 1, - model, "relu", "he_normal_initializer") - glimpse3_conv1 = ConvolutionCell("glimpse3_conv1", 32, 2, 3, 1, 1, - model, "relu", "he_normal_initializer") - glimpse3_conv2 = ConvolutionCell("glimpse3_conv2", 64, 2, 3, 1, 1, - model, "relu", "he_normal_initializer") - glimpse3_conv3 = ConvolutionCell("glimpse3_conv3", 128, 2, 3, 1, 1, - model, "relu", "he_normal_initializer") - - # Recurrent network components - lstm1 = LstmCell("lstm1", hidden_size, model) - lstm2 = LstmCell("lstm2", hidden_size, model) - - # Location network components - loc_list = map(lambda i: 2.0 * i / num_locs - 1.0, range(num_locs)) - loc = zero3 - locx_network = FullyConnectedCell("locx_prob", num_locs, model, - "softmax", "glorot_normal_initializer", False) - locy_network = FullyConnectedCell("locy_prob", num_locs, model, - "softmax", "glorot_normal_initializer", False) - - # Classification 
network components - class_network = FullyConnectedCell("class_prob", label_dims[0], model, - "softmax", "glorot_normal_initializer", False) - - # Construct unrolled model - for step in range(unroll_depth): - - # Extract crops and resize - scaled_loc = new_layer(model, "loc_scaled_step%d" % step, - [loc, one3], "weighted_sum", "cpu") - scaled_loc.weighted_sum.scaling_factors = str_list([0.5, 0.5]) - crop1 = new_layer(model, "crop1_step%d" % step, - [image, scaled_loc], "crop", "cpu") - crop1.crop.dims = str_list([3, 32, 32]) - crop2 = new_layer(model, "crop2_step%d" % step, - [image, scaled_loc], "crop", "cpu") - crop2.crop.dims = str_list([3, 64, 64]) - crop2 = new_layer(model, "crop2_resized_step%d" % step, crop2, "pooling") - crop2.pooling.num_dims = 2 - crop2.pooling.pool_dims_i = 2 - crop2.pooling.pool_strides_i = crop2.pooling.pool_dims_i - crop2.pooling.pool_mode = "average" - crop3 = new_layer(model, "crop3_step%d" % step, - [image, scaled_loc], "crop", "cpu") - crop3.crop.dims = str_list([3, 128, 128]) - crop3 = new_layer(model, "crop3_resized_step%d" % step, crop3, "pooling") - crop3.pooling.num_dims = 2 - crop3.pooling.pool_dims_i = 4 - crop3.pooling.pool_strides_i = crop3.pooling.pool_dims_i - crop3.pooling.pool_mode = "average" - - # Glimpse networks - glimpse1 = glimpse1_conv1(crop1) - glimpse1 = glimpse1_conv2(glimpse1) - glimpse1 = glimpse1_conv3(glimpse1) - glimpse1 = new_layer(model, "glimpse1_step%d" % step, glimpse1, "pooling") - glimpse1.pooling.num_dims = 2 - glimpse1.pooling.pool_dims_i = 32 - glimpse1.pooling.pool_strides_i = glimpse1.pooling.pool_dims_i - glimpse1.pooling.pool_mode = "average" - glimpse2 = glimpse2_conv1(crop2) - glimpse2 = glimpse2_conv2(glimpse2) - glimpse2 = glimpse2_conv3(glimpse2) - glimpse2 = new_layer(model, "glimpse2_step%d" % step, glimpse2, "pooling") - glimpse2.pooling.num_dims = 2 - glimpse2.pooling.pool_dims_i = 32 - glimpse2.pooling.pool_strides_i = glimpse2.pooling.pool_dims_i - glimpse2.pooling.pool_mode = "average" - glimpse3 = glimpse3_conv1(crop3) - glimpse3 = glimpse3_conv2(glimpse3) - glimpse3 = glimpse3_conv3(glimpse3) - glimpse3 = new_layer(model, "glimpse3_step%d" % step, glimpse3, "pooling") - glimpse3.pooling.num_dims = 2 - glimpse3.pooling.pool_dims_i = 32 - glimpse3.pooling.pool_strides_i = glimpse3.pooling.pool_dims_i - glimpse3.pooling.pool_mode = "average" - glimpse = new_layer(model, "glimpse_step%d" % step, - [glimpse1, glimpse2, glimpse3], "concatenation") - glimpse = new_layer(model, "glimpse_flat_step%d" % step, - glimpse, "reshape") - glimpse.reshape.num_dims = 1 - glimpse.reshape.dims = str_list([128 * 3]) - - # Recurrent network - h1 = lstm1(glimpse) - h2 = lstm2(h1) - - # Location network - locx_prob = locx_network(h2) - locx_onehot = new_layer(model, "locx_onehot_step%d" % step, - locx_prob, "categorical_random", "cpu") - locx = new_layer(model, "locx_step%d" % step, - locx_onehot, "discrete_random", "cpu") - locx.discrete_random.values = str_list(loc_list) - locx.discrete_random.dims = str_list([1]) - locy_prob = locy_network(h2) - locy_onehot = new_layer(model, "locy_onehot_step%d" % step, - locy_prob, "categorical_random", "cpu") - locy = new_layer(model, "locy_step%d" % step, - locy_onehot, "discrete_random", "cpu") - locy.discrete_random.values = str_list(loc_list) - locy.discrete_random.dims = str_list([1]) - loc = new_layer(model, "loc_step%d" % (step+1), - [zero1, locy, locx], "concatenation", "cpu") - - # Classification network - class_prob = class_network(h1) - - # Categorical accuracy - acc1 = 
new_layer(model, "top1_accuracy_step%d" % step, - [class_prob, label], "categorical_accuracy") - acc5 = new_layer(model, "top5_accuracy_step%d" % step, - [class_prob, label], "top_k_categorical_accuracy") - acc5.top_k_categorical_accuracy.k = 5 - met = model.metric.add() - met.layer_metric.name = "categorical accuracy (step %d)" % step - met.layer_metric.layer = acc1.name - met.layer_metric.unit = "%" - met = model.metric.add() - met.layer_metric.name = "top-5 categorical accuracy (step %d)" % step - met.layer_metric.layer = acc5.name - met.layer_metric.unit = "%" - - # Objective function - class_obj = new_layer(model, "classification_cross_entropy_step%d" % step, - [class_prob, label], "cross_entropy") - locx_obj = new_layer(model, "locx_cross_entropy_step%d" % step, - [locx_prob, locx_onehot], "cross_entropy") - locy_obj = new_layer(model, "locy_cross_entropy_step%d" % step, - [locy_prob, locy_onehot], "cross_entropy") - obj = model.objective_function.layer_term.add() - obj.scale_factor = 1.0 - obj.layer = class_obj.name - obj = model.objective_function.layer_term.add() - obj.scale_factor = 1.0 - obj.layer = locx_obj.name - obj = model.objective_function.layer_term.add() - obj.scale_factor = 1.0 - obj.layer = locy_obj.name - - -if __name__ == "__main__": - - # Make sure protobuf Python implementation is built - host = subprocess.check_output("hostname").strip("\n1234567890") - protoc = lbann_dir + "/build/gnu.Release." + host + ".llnl.gov/install/bin/protoc" - proto_python_dir = lbann_dir + "/build/gnu.Release." + host + ".llnl.gov/protobuf/src/python" - os.putenv("PROTOC", protoc) - subprocess.call("cd " + proto_python_dir + "; " - + sys.executable + " " - + proto_python_dir + "/setup.py build", - shell=True) - sys.path.append(proto_python_dir) - import google.protobuf.text_format as txtf - - # Compile LBANN protobuf - subprocess.call([protoc, - "-I=" + lbann_proto_dir, - "--python_out=" + work_dir, - lbann_proto_dir + "/lbann.proto"]) - sys.path.append(work_dir) - global lbann_pb2 - import lbann_pb2 - - # Load template prototext - with open(template_proto, "r") as f: - pb = txtf.Merge(f.read(), lbann_pb2.LbannPB()) - - # Configure prototext model - configure_model(pb.model) - - # Export prototext - with open(output_proto, "w") as f: - f.write(txtf.MessageToString(pb)) From 2cda3c16791f3dd7a6dde6132648929c004b957f Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Thu, 1 Aug 2019 12:46:02 -0700 Subject: [PATCH 186/634] Fix all existing models' initializers; thanks perl --- .../candle/pilot1/ae_nodeselect_gdc.prototext | 4 +- .../models/candle/pilot1/combo.prototext | 12 +- .../gan/jags/cycle_gan/cycgan_m1.prototext | 56 +++- .../gan/jags/cycle_gan/cycgan_m2.prototext | 44 ++- .../gan/jags/cycle_gan/cycgan_m3.prototext | 44 ++- .../gan/mnist/adversarial_model.prototext | 20 +- .../gan/mnist/discriminator_model.prototext | 16 +- .../jag/gan/cyclic/cyclic_gan_model.prototext | 56 +++- .../models/jag/gan/vanilla/gan.prototext | 28 +- model_zoo/models/jag/wae.prototext | 28 +- .../jag/wae_cycle_gan/cycle_gan.prototext | 56 +++- .../wae_cycle_gan/cycle_gan_only.prototext | 56 +++- .../models/jag/wae_cycle_gan/wae.prototext | 52 +++- .../jag/wae_cycle_gan/wae_fw_inv.prototext | 48 +++- .../jag/wae_cycle_gan/wae_nobn.prototext | 52 +++- ...onv_molecular_autoencoder_pilot2.prototext | 4 +- ...olecular_bead_autoencoder_pilot2.prototext | 4 +- ...del_molecular_autoencoder_pilot2.prototext | 4 +- .../siamese/finetune-cub/model_cub.prototext | 112 +++++--- .../model_cub_batchnorm.prototext | 126 +++++--- ...batchnorm_transferred_and_frozen.prototext | 246 ++++++++++------ ..._alexnet_batchnorm_dag_frozen_bn.prototext | 270 ++++++++++++------ 22 files changed, 929 insertions(+), 409 deletions(-) diff --git a/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext b/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext index 93509871d2a..dfd926c0548 100644 --- a/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext +++ b/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext @@ -67,7 +67,9 @@ model { weights { name: "w1" - glorot_uniform_initializer {} + initializer { + glorot_uniform_initializer {} + } } layer { diff --git a/model_zoo/models/candle/pilot1/combo.prototext b/model_zoo/models/candle/pilot1/combo.prototext index 0b5a5ac5535..1d6aeb6a9d1 100644 --- a/model_zoo/models/candle/pilot1/combo.prototext +++ b/model_zoo/models/candle/pilot1/combo.prototext @@ -178,17 +178,23 @@ model { # Specify shared weights for drug tracks weights { name: "drug_fc1_w" - he_normal_initializer {} + initializer { + he_normal_initializer {} + } } weights { name: "drug_fc2_w" - he_normal_initializer {} + initializer { + he_normal_initializer {} + } } weights { name: "drug_fc3_w" - he_normal_initializer {} + initializer { + he_normal_initializer {} + } } #Drug1 Track diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext index 26e83f01d6f..bcc80c98e3f 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext @@ -460,72 +460,100 @@ model { } weights { name: "gen1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } 
weights { name: "gen2fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc2fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc2fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc2fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } mini_batch_size: 64 diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext index e188b803517..45c673ea736 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext @@ -456,57 +456,79 @@ model { } weights { name: "gen1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } mini_batch_size: 64 diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext index ee04a006ed8..7277121a832 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext @@ -517,57 +517,79 @@ model { } weights { name: "gen2fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc2fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc2fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "disc2fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc1linearity" - he_normal_initializer { + initializer { + 
he_normal_initializer { + } } } weights { name: "gen1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } mini_batch_size: 64 diff --git a/model_zoo/models/gan/mnist/adversarial_model.prototext b/model_zoo/models/gan/mnist/adversarial_model.prototext index 644df4a90f3..19f2dd83493 100644 --- a/model_zoo/models/gan/mnist/adversarial_model.prototext +++ b/model_zoo/models/gan/mnist/adversarial_model.prototext @@ -146,7 +146,9 @@ model { weights { name: "gen_fc_weights" optimizer { } - glorot_normal_initializer {} + initializer { + glorot_normal_initializer {} + } } ############# # FC 1 @@ -302,7 +304,9 @@ model { weights { name: "dis_flatten_weights" optimizer { } - he_normal_initializer {} + initializer { + he_normal_initializer {} + } } # FULLY_CONNECTED dis_flatten layer { @@ -319,7 +323,9 @@ model { weights { name: "dis_fc1_weights" optimizer { } - glorot_normal_initializer {} + initializer { + glorot_normal_initializer {} + } } layer { name: "dis_fc1_proxy" @@ -343,7 +349,9 @@ model { weights { name: "dis_fc2_weights" optimizer { } - glorot_normal_initializer {} + initializer { + glorot_normal_initializer {} + } } layer { parents: "dis_fc1_relu" @@ -368,7 +376,9 @@ model { weights { name: "dis_fc3_weights" optimizer { } - glorot_normal_initializer {} + initializer { + glorot_normal_initializer {} + } } layer { parents: "dis_fc2_relu" diff --git a/model_zoo/models/gan/mnist/discriminator_model.prototext b/model_zoo/models/gan/mnist/discriminator_model.prototext index 063390d93dd..a10cc04afbb 100644 --- a/model_zoo/models/gan/mnist/discriminator_model.prototext +++ b/model_zoo/models/gan/mnist/discriminator_model.prototext @@ -138,7 +138,9 @@ model { weights { name: "gen_fc1_weights" optimizer { } - glorot_normal_initializer {} + initializer { + glorot_normal_initializer {} + } } ############# # FC 1 @@ -179,7 +181,9 @@ model { weights { name: "gen_fc2_weights" optimizer { } - glorot_normal_initializer {} + initializer { + glorot_normal_initializer {} + } } layer { name: "fc2" @@ -217,7 +221,9 @@ model { weights { name: "gen_fc3_weights" optimizer { } - glorot_normal_initializer {} + initializer { + glorot_normal_initializer {} + } } layer { name: "fc3" @@ -255,7 +261,9 @@ model { weights { name: "gen_fc4_weights" optimizer { } - glorot_normal_initializer {} + initializer { + glorot_normal_initializer {} + } } layer { name: "fc4" diff --git a/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext b/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext index 235a00313d7..a0d04525790 100644 --- a/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext +++ b/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext @@ -611,27 +611,37 @@ model { } weights { name: "gen1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "d1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -639,7 +649,9 @@ model 
{ } weights { name: "d1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -647,7 +659,9 @@ model { } weights { name: "d1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -655,27 +669,37 @@ model { } weights { name: "gen2fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "d1_invfc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -683,7 +707,9 @@ model { } weights { name: "d1_invfc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -691,7 +717,9 @@ model { } weights { name: "d1_invfc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { diff --git a/model_zoo/models/jag/gan/vanilla/gan.prototext b/model_zoo/models/jag/gan/vanilla/gan.prototext index 7bfa4e4d3d8..976151dd318 100644 --- a/model_zoo/models/jag/gan/vanilla/gan.prototext +++ b/model_zoo/models/jag/gan/vanilla/gan.prototext @@ -391,27 +391,37 @@ model { #} weights { name: "gen1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "d1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -419,7 +429,9 @@ model { } weights { name: "d1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -427,7 +439,9 @@ model { } weights { name: "d1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { diff --git a/model_zoo/models/jag/wae.prototext b/model_zoo/models/jag/wae.prototext index f8edac45647..8d204e7a62a 100644 --- a/model_zoo/models/jag/wae.prototext +++ b/model_zoo/models/jag/wae.prototext @@ -479,27 +479,37 @@ model { weights { name: "gen1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen1fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "d1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -507,7 +517,9 @@ model { } weights { name: "d1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -515,7 +527,9 @@ model { } weights { name: "d1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { diff --git a/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext b/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext index 
69fbf816f53..716840be3ae 100644 --- a/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext @@ -826,7 +826,9 @@ model { } weights { name: "gen1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -834,7 +836,9 @@ model { } weights { name: "gen1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -842,7 +846,9 @@ model { } weights { name: "gen1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -850,7 +856,9 @@ model { } weights { name: "gen1fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -858,7 +866,9 @@ model { } weights { name: "d1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -866,7 +876,9 @@ model { } weights { name: "d1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -874,7 +886,9 @@ model { } weights { name: "d1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -882,22 +896,30 @@ model { } weights { name: "gen2fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -914,7 +936,9 @@ model { } weights { name: "d1_invfc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -922,7 +946,9 @@ model { } weights { name: "d1_invfc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -930,7 +956,9 @@ model { } weights { name: "d1_invfc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { diff --git a/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext b/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext index 6c840d48da9..79a4e1264e7 100644 --- a/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext @@ -696,7 +696,9 @@ model { } weights { name: "gen1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -704,7 +706,9 @@ model { } weights { name: "gen1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -712,7 +716,9 @@ model { } weights { name: "gen1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -720,7 +726,9 @@ model { } weights { name: "gen1fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -728,7 +736,9 @@ model { } weights { name: "d1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -736,7 +746,9 @@ model { } weights { name: "d1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -744,7 +756,9 @@ model { } weights { name: "d1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -752,22 +766,30 @@ model { } weights { name: 
"gen2fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -784,7 +806,9 @@ model { } weights { name: "d1_invfc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -792,7 +816,9 @@ model { } weights { name: "d1_invfc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -800,7 +826,9 @@ model { } weights { name: "d1_invfc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { diff --git a/model_zoo/models/jag/wae_cycle_gan/wae.prototext b/model_zoo/models/jag/wae_cycle_gan/wae.prototext index cf8265e7019..aa937b9ab7c 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae.prototext @@ -606,29 +606,39 @@ model { ###@todo : delete not used, LTFB uses encodefc*linearity_weights instead weights { name: "encodefc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "encodefc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "encodefc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "encodefc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } #Decoder weights here to be used in WAE+cyclic model weights { name: "decode0linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -636,7 +646,9 @@ model { } weights { name: "decode1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -644,7 +656,9 @@ model { } weights { name: "decode2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -652,7 +666,9 @@ model { } weights { name: "decode3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -663,7 +679,9 @@ model { #Discriminator (shared) weights { name: "wae_d1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -671,7 +689,9 @@ model { } weights { name: "wae_d1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -679,7 +699,9 @@ model { } weights { name: "wae_d1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -687,7 +709,9 @@ model { } weights { name: "wae_d1fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -695,7 +719,9 @@ model { } weights { name: "wae_d1fc5linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { diff --git a/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext b/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext index c38e09e5fb2..5864a694d85 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext @@ -211,7 +211,9 @@ model { weights { name: "gen1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } 
weights { @@ -219,7 +221,9 @@ model { } weights { name: "gen1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -227,7 +231,9 @@ model { } weights { name: "gen1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -235,7 +241,9 @@ model { } weights { name: "gen1fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -484,7 +492,9 @@ model { ####Decoder weights weights { name: "decode0linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -493,7 +503,9 @@ model { weights { name: "decode1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -501,7 +513,9 @@ model { } weights { name: "decode2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -509,7 +523,9 @@ model { } weights { name: "decode3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -924,22 +940,30 @@ model { } weights { name: "gen2fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "gen2fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { diff --git a/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext b/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext index b3afa63d263..b0a68861ed1 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext @@ -586,29 +586,39 @@ model { ###@todo : delete not used, LTFB uses encodefc*linearity_weights instead weights { name: "encodefc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "encodefc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "encodefc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { name: "encodefc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } #Decoder weights here to be used in WAE+cyclic model weights { name: "decode0linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -616,7 +626,9 @@ model { } weights { name: "decode1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -624,7 +636,9 @@ model { } weights { name: "decode2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -632,7 +646,9 @@ model { } weights { name: "decode3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -643,7 +659,9 @@ model { #Discriminator (shared) weights { name: "wae_d1fc1linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -651,7 +669,9 @@ model { } weights { name: "wae_d1fc2linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -659,7 +679,9 @@ model { } weights { name: "wae_d1fc3linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -667,7 +689,9 @@ model { } weights { 
name: "wae_d1fc4linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { @@ -675,7 +699,9 @@ model { } weights { name: "wae_d1fc5linearity" - he_normal_initializer { + initializer { + he_normal_initializer { + } } } weights { diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext index 8b6bb672ceb..3791b5c4e0c 100644 --- a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext @@ -128,7 +128,9 @@ model { # decode2 weights { name: "decode2_fc_matrix" - glorot_uniform_initializer {} + initializer { + glorot_uniform_initializer {} + } } layer { parents: "encode2" diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext index 2c503c625ab..d3827897409 100644 --- a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext @@ -158,7 +158,9 @@ model { # decode3 weights { name: "decode3_fc_matrix" - glorot_uniform_initializer {} + initializer { + glorot_uniform_initializer {} + } } layer { parents: "encode3" diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext index 91810258bd0..e519fe44b87 100644 --- a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext @@ -76,7 +76,9 @@ model { weights { name: "decode1_fc_matrix" - glorot_uniform_initializer {} + initializer { + glorot_uniform_initializer {} + } } layer { parents: "encode1" diff --git a/model_zoo/models/siamese/finetune-cub/model_cub.prototext b/model_zoo/models/siamese/finetune-cub/model_cub.prototext index 6d9e9761e1f..0f139ba4ec2 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub.prototext @@ -64,9 +64,11 @@ model { weights { name: "conv1_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -80,8 +82,10 @@ model { weights { name: "conv1_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { @@ -95,9 +99,11 @@ model { weights { name: "conv2_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -111,8 +117,10 @@ model { weights { name: "conv2_bias" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer { sgd { @@ -126,9 +134,11 @@ model { weights { name: "conv3_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd 
{ @@ -142,8 +152,10 @@ model { weights { name: "conv3_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { @@ -157,9 +169,11 @@ model { weights { name: "conv4_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -173,8 +187,10 @@ model { weights { name: "conv4_bias" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer { sgd { @@ -188,9 +204,11 @@ model { weights { name: "conv5_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -204,8 +222,10 @@ model { weights { name: "conv5_bias" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer { sgd { @@ -219,9 +239,11 @@ model { weights { name: "fc6_new_linearity" - normal_initializer { - mean: 0.0 - standard_deviation: 0.005 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.005 + } } optimizer { sgd { @@ -235,8 +257,10 @@ model { weights { name: "fc6_new_bias" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer { sgd { @@ -250,9 +274,11 @@ model { weights { name: "fc7_new_linearity" - normal_initializer { - mean: 0.0 - standard_deviation: 0.005 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.005 + } } optimizer { sgd { @@ -266,8 +292,10 @@ model { weights { name: "fc7_new_bias" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer { sgd { @@ -281,9 +309,11 @@ model { weights { name: "fc8_new_linearity" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -297,8 +327,10 @@ model { weights { name: "fc8_new_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { diff --git a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext index 091ed4b5acd..c7ab48dd13e 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext @@ -64,9 +64,11 @@ model { weights { name: "conv1_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -80,8 +82,10 @@ model { weights { name: "conv1_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { @@ -95,9 +99,11 @@ model { weights { name: "conv2_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -111,8 +117,10 @@ model { weights { name: "conv2_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -126,9 +134,11 @@ model { weights { name: "conv3_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -142,8 
+152,10 @@ model { weights { name: "conv3_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { @@ -157,9 +169,11 @@ model { weights { name: "conv4_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -173,8 +187,10 @@ model { weights { name: "conv4_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -188,9 +204,11 @@ model { weights { name: "conv5_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -204,8 +222,10 @@ model { weights { name: "conv5_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -219,9 +239,11 @@ model { weights { name: "conv6_new_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -235,8 +257,10 @@ model { weights { name: "conv6_new_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -250,9 +274,11 @@ model { weights { name: "conv6b_new_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -266,8 +292,10 @@ model { weights { name: "conv6b_new_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -281,9 +309,11 @@ model { weights { name: "fc7_new_linearity" - normal_initializer { - mean: 0.0 - standard_deviation: 0.005 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.005 + } } optimizer { sgd { @@ -297,8 +327,10 @@ model { weights { name: "fc7_new_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -312,9 +344,11 @@ model { weights { name: "fc8_new_linearity" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -328,8 +362,10 @@ model { weights { name: "fc8_new_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { diff --git a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext index 1c8b5a4ffae..8b135bbcac1 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext @@ -64,9 +64,11 @@ model { weights { name: "conv1_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -80,8 +82,10 @@ model { weights { name: "conv1_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { @@ -95,9 +99,11 @@ model { weights { name: "conv2_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + 
normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -111,8 +117,10 @@ model { weights { name: "conv2_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -126,9 +134,11 @@ model { weights { name: "conv3_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -142,8 +152,10 @@ model { weights { name: "conv3_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { @@ -157,9 +169,11 @@ model { weights { name: "conv4_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -173,8 +187,10 @@ model { weights { name: "conv4_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -188,9 +204,11 @@ model { weights { name: "conv5_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -204,8 +222,10 @@ model { weights { name: "conv5_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -219,9 +239,11 @@ model { weights { name: "conv6_new_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -235,8 +257,10 @@ model { weights { name: "conv6_new_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -250,9 +274,11 @@ model { weights { name: "conv6b_new_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -266,8 +292,10 @@ model { weights { name: "conv6b_new_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -281,9 +309,11 @@ model { weights { name: "fc7_new_linearity" - normal_initializer { - mean: 0.0 - standard_deviation: 0.005 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.005 + } } optimizer { sgd { @@ -297,8 +327,10 @@ model { weights { name: "fc7_new_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -312,9 +344,11 @@ model { weights { name: "fc8_new_linearity" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -328,8 +362,10 @@ model { weights { name: "fc8_new_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { @@ -347,8 +383,10 @@ model { weights { name: "bn_conv1_scale" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -356,8 +394,10 @@ model { weights { name: "bn_conv1_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -365,8 +405,10 @@ model { weights { name: "bn_conv1_running_mean" - constant_initializer { - value: 0.0 + initializer { + 
constant_initializer { + value: 0.0 + } } optimizer {} } @@ -374,8 +416,10 @@ model { weights { name: "bn_conv1_running_variance" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -383,8 +427,10 @@ model { weights { name: "bn_conv2_scale" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -392,8 +438,10 @@ model { weights { name: "bn_conv2_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -401,8 +449,10 @@ model { weights { name: "bn_conv2_running_mean" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -410,8 +460,10 @@ model { weights { name: "bn_conv2_running_variance" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -419,8 +471,10 @@ model { weights { name: "bn_conv3_scale" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -428,8 +482,10 @@ model { weights { name: "bn_conv3_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -437,8 +493,10 @@ model { weights { name: "bn_conv3_running_mean" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -446,8 +504,10 @@ model { weights { name: "bn_conv3_running_variance" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -455,8 +515,10 @@ model { weights { name: "bn_conv4_scale" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -464,8 +526,10 @@ model { weights { name: "bn_conv4_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -473,8 +537,10 @@ model { weights { name: "bn_conv4_running_mean" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -482,8 +548,10 @@ model { weights { name: "bn_conv4_running_variance" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -491,8 +559,10 @@ model { weights { name: "bn_conv5_scale" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -500,8 +570,10 @@ model { weights { name: "bn_conv5_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -509,8 +581,10 @@ model { weights { name: "bn_conv5_running_mean" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -518,8 +592,10 @@ model { weights { name: "bn_conv5_running_variance" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } diff --git a/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext index f85cd86db11..83ace5138af 100644 --- a/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext +++ b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext @@ -68,9 +68,11 @@ 
model { weights { name: "conv1_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -83,8 +85,10 @@ model { weights { name: "conv1_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { @@ -97,9 +101,11 @@ model { weights { name: "conv2_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -112,8 +118,10 @@ model { weights { name: "conv2_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -126,9 +134,11 @@ model { weights { name: "conv3_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -141,8 +151,10 @@ model { weights { name: "conv3_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { @@ -155,9 +167,11 @@ model { weights { name: "conv4_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -170,8 +184,10 @@ model { weights { name: "conv4_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -184,9 +200,11 @@ model { weights { name: "conv5_kernel" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -199,8 +217,10 @@ model { weights { name: "conv5_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -213,9 +233,11 @@ model { weights { name: "fc6_linearity" - normal_initializer { - mean: 0.0 - standard_deviation: 0.005 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.005 + } } optimizer { sgd { @@ -228,8 +250,10 @@ model { weights { name: "fc6_bias" - constant_initializer { - value: 0.1 + initializer { + constant_initializer { + value: 0.1 + } } optimizer { sgd { @@ -242,9 +266,11 @@ model { weights { name: "fc7_linearity" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -257,8 +283,10 @@ model { weights { name: "fc7_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { @@ -271,9 +299,11 @@ model { weights { name: "fc8_linearity" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -286,8 +316,10 @@ model { weights { name: "fc8_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd { @@ -300,9 +332,11 @@ model { weights { name: "fc9_linearity" - normal_initializer { - mean: 0.0 - standard_deviation: 0.01 + initializer { + normal_initializer { + mean: 0.0 + standard_deviation: 0.01 + } } optimizer { sgd { @@ -315,8 +349,10 @@ model { weights { name: "fc9_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer { sgd 
{ @@ -333,8 +369,10 @@ model { weights { name: "bn_conv1_scale" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -342,8 +380,10 @@ model { weights { name: "bn_conv1_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -351,8 +391,10 @@ model { weights { name: "bn_conv1_running_mean" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -360,8 +402,10 @@ model { weights { name: "bn_conv1_running_variance" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -369,8 +413,10 @@ model { weights { name: "bn_conv2_scale" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -378,8 +424,10 @@ model { weights { name: "bn_conv2_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -387,8 +435,10 @@ model { weights { name: "bn_conv2_running_mean" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -396,8 +446,10 @@ model { weights { name: "bn_conv2_running_variance" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -405,8 +457,10 @@ model { weights { name: "bn_conv3_scale" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -414,8 +468,10 @@ model { weights { name: "bn_conv3_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -423,8 +479,10 @@ model { weights { name: "bn_conv3_running_mean" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -432,8 +490,10 @@ model { weights { name: "bn_conv3_running_variance" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -441,8 +501,10 @@ model { weights { name: "bn_conv4_scale" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -450,8 +512,10 @@ model { weights { name: "bn_conv4_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -459,8 +523,10 @@ model { weights { name: "bn_conv4_running_mean" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -468,8 +534,10 @@ model { weights { name: "bn_conv4_running_variance" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -477,8 +545,10 @@ model { weights { name: "bn_conv5_scale" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + value: 1.0 + } } optimizer {} } @@ -486,8 +556,10 @@ model { weights { name: "bn_conv5_bias" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -495,8 +567,10 @@ model { weights { name: "bn_conv5_running_mean" - constant_initializer { - value: 0.0 + initializer { + constant_initializer { + value: 0.0 + } } optimizer {} } @@ -504,8 +578,10 @@ model { weights { name: "bn_conv5_running_variance" - constant_initializer { - value: 1.0 + initializer { + constant_initializer { + 
value: 1.0
+    }
   }
   optimizer {}
 }
@@ -513,8 +589,10 @@ model {
 weights {
   name: "bn_fc6_scale"
-  constant_initializer {
-    value: 1.0
+  initializer {
+    constant_initializer {
+      value: 1.0
+    }
   }
   optimizer {}
 }
@@ -522,8 +600,10 @@ model {
 weights {
   name: "bn_fc6_bias"
-  constant_initializer {
-    value: 0.0
+  initializer {
+    constant_initializer {
+      value: 0.0
+    }
   }
   optimizer {}
 }
@@ -531,8 +611,10 @@ model {
 weights {
   name: "bn_fc6_running_mean"
-  constant_initializer {
-    value: 0.0
+  initializer {
+    constant_initializer {
+      value: 0.0
+    }
   }
   optimizer {}
 }
@@ -540,8 +622,10 @@ model {
 weights {
   name: "bn_fc6_running_variance"
-  constant_initializer {
-    value: 1.0
+  initializer {
+    constant_initializer {
+      value: 1.0
+    }
   }
   optimizer {}
 }

From 0113e573aeb7f7b9186bc577c33666960fb130d5 Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson"
Date: Thu, 1 Aug 2019 12:53:00 -0700
Subject: [PATCH 187/634] add comment to class_generator.py

---
 python/lbann/util/class_generator.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/lbann/util/class_generator.py b/python/lbann/util/class_generator.py
index f1c32e0b5bc..7a72dfe204b 100644
--- a/python/lbann/util/class_generator.py
+++ b/python/lbann/util/class_generator.py
@@ -96,6 +96,13 @@ def export_proto(self):
             message = getattr(proto, base_field_name)
             message.SetInParent()
         else:
+            # TODO (trb 08/01/2019): This list would have to be
+            # updated any time another _pb2 file is created. It might
+            # be better to have this as a global `frozenset`
+            # (ndryden's suggestion) that gets maintained
+            # elsewhere. But this code either works or doesn't get
+            # executed now, so I vote delaying this fix until a need
+            # arises.
             proto_modules = set([callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, weights_pb2])
             proto_type = None
             while proto_type is None:
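The TODO added above suggests hoisting the module list into a single global
frozenset. A minimal sketch of that suggestion, assuming the *_pb2 modules
are importable as in class_generator.py (the import path and the helper name
are illustrative, not actual LBANN API):

    # Sketch of the global-frozenset idea from the TODO; the containing
    # module and the import path are assumed.
    from lbann import (callbacks_pb2, layers_pb2, metrics_pb2, model_pb2,
                       objective_functions_pb2, optimizers_pb2, weights_pb2)

    # Maintained in one place; extend when a new _pb2 file is generated.
    _PROTO_MODULES = frozenset([callbacks_pb2, layers_pb2, metrics_pb2,
                                model_pb2, objective_functions_pb2,
                                optimizers_pb2, weights_pb2])

    def find_message_type(name):
        """Return the generated protobuf class with the given name."""
        for module in _PROTO_MODULES:
            proto_type = getattr(module, name, None)
            if proto_type is not None:
                return proto_type
        raise ValueError('unknown protobuf message type: ' + name)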
Benson" Date: Thu, 1 Aug 2019 13:46:16 -0700 Subject: [PATCH 188/634] fix protobuf class names --- src/weights/initializer.cpp | 8 ++++---- src/weights/variance_scaling_initializers.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/weights/initializer.cpp b/src/weights/initializer.cpp index f902d814c4a..239d41b7fc9 100644 --- a/src/weights/initializer.cpp +++ b/src/weights/initializer.cpp @@ -129,21 +129,21 @@ void normal_initializer::fill(AbsDistMat& matrix) { std::unique_ptr build_constant_initializer_from_pbuf(google::protobuf::Message const& msg) { const auto& params = - dynamic_cast(msg); + dynamic_cast(msg); return make_unique(params.value()); } std::unique_ptr build_value_initializer_from_pbuf(google::protobuf::Message const& msg) { const auto& params = - dynamic_cast(msg); + dynamic_cast(msg); return make_unique(parse_list(params.values())); } std::unique_ptr build_uniform_initializer_from_pbuf(google::protobuf::Message const& msg) { const auto& params = - dynamic_cast(msg); + dynamic_cast(msg); const auto& min = params.min(); const auto& max = params.max(); if (min != 0.0 || max != 0.0) { @@ -156,7 +156,7 @@ build_uniform_initializer_from_pbuf(google::protobuf::Message const& msg) { std::unique_ptr build_normal_initializer_from_pbuf(google::protobuf::Message const& msg) { const auto& params = - dynamic_cast(msg); + dynamic_cast(msg); const auto& mean = params.mean(); const auto& standard_deviation = params.standard_deviation(); if (mean != 0.0 || standard_deviation != 0.0) { diff --git a/src/weights/variance_scaling_initializers.cpp b/src/weights/variance_scaling_initializers.cpp index 36e41261b01..3b484825b88 100644 --- a/src/weights/variance_scaling_initializers.cpp +++ b/src/weights/variance_scaling_initializers.cpp @@ -118,9 +118,9 @@ DataType lecun_initializer::get_variance(El::Int fan_in, El::Int fan_out) { // are only 2 probability distributions std::unique_ptr build_glorot_initializer_from_pbuf(google::protobuf::Message const& msg) { - if (dynamic_cast(&msg)) + if (dynamic_cast(&msg)) return make_unique(probability_distribution::gaussian); - else if (dynamic_cast(&msg)) + else if (dynamic_cast(&msg)) return make_unique(probability_distribution::uniform); else { LBANN_ERROR("build_glorot_initializer_from_pbuf: Bad message."); @@ -130,9 +130,9 @@ build_glorot_initializer_from_pbuf(google::protobuf::Message const& msg) { std::unique_ptr build_he_initializer_from_pbuf(google::protobuf::Message const& msg) { - if (dynamic_cast(&msg)) + if (dynamic_cast(&msg)) return make_unique(probability_distribution::gaussian); - else if (dynamic_cast(&msg)) + else if (dynamic_cast(&msg)) return make_unique(probability_distribution::uniform); else { LBANN_ERROR("build_he_initializer_from_pbuf: Bad message."); @@ -142,9 +142,9 @@ build_he_initializer_from_pbuf(google::protobuf::Message const& msg) { std::unique_ptr build_lecun_initializer_from_pbuf(google::protobuf::Message const& msg) { - if (dynamic_cast(&msg)) + if (dynamic_cast(&msg)) return make_unique(probability_distribution::gaussian); - else if (dynamic_cast(&msg)) + else if (dynamic_cast(&msg)) return make_unique(probability_distribution::uniform); else { LBANN_ERROR("build_lecun_initializer_from_pbuf: Bad message."); From 2a7ea3ee2376a795af3ca4de578f2b0edc067eda Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Thu, 1 Aug 2019 14:15:08 -0700 Subject: [PATCH 189/634] whitespace change --- src/proto/factories/optimizer_factory.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/proto/factories/optimizer_factory.cpp b/src/proto/factories/optimizer_factory.cpp index 17badda71bc..c0f460263b6 100644 --- a/src/proto/factories/optimizer_factory.cpp +++ b/src/proto/factories/optimizer_factory.cpp @@ -64,11 +64,11 @@ void register_default_builders(factory_type& factory) // Manage a global factory struct factory_manager { - factory_type factory_; + factory_type factory_; - factory_manager() { - register_default_builders(factory_); - } + factory_manager() { + register_default_builders(factory_); + } }; factory_manager factory_mgr_; From 0a16602db3227eb7db628abba557c9751a8c2375 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Thu, 1 Aug 2019 15:46:51 -0700 Subject: [PATCH 190/634] straightforward refactor of the transform factory --- include/lbann/transforms/normalize.hpp | 11 +- include/lbann/transforms/sample_normalize.hpp | 6 + include/lbann/transforms/scale.hpp | 8 + .../lbann/transforms/transform_pipeline.hpp | 2 +- .../transforms/vision/adjust_brightness.hpp | 8 +- .../transforms/vision/adjust_contrast.hpp | 10 +- .../transforms/vision/adjust_saturation.hpp | 8 +- .../lbann/transforms/vision/center_crop.hpp | 5 + .../lbann/transforms/vision/color_jitter.hpp | 7 +- include/lbann/transforms/vision/colorize.hpp | 5 + include/lbann/transforms/vision/cutout.hpp | 9 +- include/lbann/transforms/vision/grayscale.hpp | 5 + .../transforms/vision/horizontal_flip.hpp | 5 + .../vision/normalize_to_lbann_layout.hpp | 6 + .../lbann/transforms/vision/random_affine.hpp | 5 + .../lbann/transforms/vision/random_crop.hpp | 5 + .../transforms/vision/random_resized_crop.hpp | 5 + ...m_resized_crop_with_fixed_aspect_ratio.hpp | 6 + include/lbann/transforms/vision/resize.hpp | 5 + .../transforms/vision/resized_center_crop.hpp | 5 + .../transforms/vision/to_lbann_layout.hpp | 5 + .../lbann/transforms/vision/vertical_flip.hpp | 5 + src/proto/factories/transform_factory.cpp | 146 +++++++----------- src/transforms/normalize.cpp | 13 ++ src/transforms/sample_normalize.cpp | 7 + src/transforms/scale.cpp | 9 ++ src/transforms/vision/adjust_brightness.cpp | 9 ++ src/transforms/vision/adjust_contrast.cpp | 12 +- src/transforms/vision/adjust_saturation.cpp | 12 +- src/transforms/vision/center_crop.cpp | 12 +- src/transforms/vision/color_jitter.cpp | 15 +- src/transforms/vision/colorize.cpp | 6 + src/transforms/vision/cutout.cpp | 9 ++ src/transforms/vision/grayscale.cpp | 6 + src/transforms/vision/horizontal_flip.cpp | 9 ++ .../vision/normalize_to_lbann_layout.cpp | 14 ++ src/transforms/vision/random_affine.cpp | 16 +- src/transforms/vision/random_crop.cpp | 10 ++ src/transforms/vision/random_resized_crop.cpp | 21 ++- ...m_resized_crop_with_fixed_aspect_ratio.cpp | 17 +- src/transforms/vision/resize.cpp | 12 +- src/transforms/vision/resized_center_crop.cpp | 14 +- src/transforms/vision/to_lbann_layout.cpp | 6 + src/transforms/vision/vertical_flip.cpp | 9 ++ 44 files changed, 413 insertions(+), 107 deletions(-) diff --git a/include/lbann/transforms/normalize.hpp b/include/lbann/transforms/normalize.hpp index 3c21a86afbd..77bfa649489 100644 --- a/include/lbann/transforms/normalize.hpp +++ b/include/lbann/transforms/normalize.hpp @@ -27,9 +27,12 @@ #ifndef LBANN_TRANSFORMS_NORMALIZE_HPP_INCLUDED #define LBANN_TRANSFORMS_NORMALIZE_HPP_INCLUDED -#include -#include "lbann/utils/exception.hpp" 
#include "lbann/transforms/transform.hpp" +#include "lbann/utils/exception.hpp" + +#include + +#include namespace lbann { namespace transform { @@ -66,6 +69,10 @@ class normalize : public transform { std::vector m_stds; }; +// Builder function +std::unique_ptr +build_normalize_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/sample_normalize.hpp b/include/lbann/transforms/sample_normalize.hpp index 6bd3203ec69..b6766d16915 100644 --- a/include/lbann/transforms/sample_normalize.hpp +++ b/include/lbann/transforms/sample_normalize.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -45,6 +47,10 @@ class sample_normalize : public transform { void apply(utils::type_erased_matrix& data, std::vector& dims) override; }; +// Builder function +std::unique_ptr +build_sample_normalize_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/scale.hpp b/include/lbann/transforms/scale.hpp index 0d8218852c4..36ff3bad6ad 100644 --- a/include/lbann/transforms/scale.hpp +++ b/include/lbann/transforms/scale.hpp @@ -29,6 +29,10 @@ #include "lbann/transforms/transform.hpp" +#include + +#include + namespace lbann { namespace transform { @@ -48,6 +52,10 @@ class scale : public transform { float m_scale; }; +// Builder function +std::unique_ptr +build_scale_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/transform_pipeline.hpp b/include/lbann/transforms/transform_pipeline.hpp index aae1b8a545b..50ffb91b799 100644 --- a/include/lbann/transforms/transform_pipeline.hpp +++ b/include/lbann/transforms/transform_pipeline.hpp @@ -51,7 +51,7 @@ class transform_pipeline { /** * Add trans as the next transform to apply. */ - void add_transform(std::unique_ptr trans) { + void add_transform(std::unique_ptr&& trans) { m_transforms.push_back(std::move(trans)); } diff --git a/include/lbann/transforms/vision/adjust_brightness.hpp b/include/lbann/transforms/vision/adjust_brightness.hpp index 6482793f8a7..649c24c8feb 100644 --- a/include/lbann/transforms/vision/adjust_brightness.hpp +++ b/include/lbann/transforms/vision/adjust_brightness.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -44,7 +46,7 @@ class adjust_brightness : public transform { LBANN_ERROR("Brightness factor must be non-negative."); } } - + transform* copy() const override { return new adjust_brightness(*this); } std::string get_type() const override { return "adjust_brightness"; } @@ -56,6 +58,10 @@ class adjust_brightness : public transform { float m_factor; }; +// Builder function +std::unique_ptr +build_adjust_brightness_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/adjust_contrast.hpp b/include/lbann/transforms/vision/adjust_contrast.hpp index 6a6a7528dc4..3c33a747289 100644 --- a/include/lbann/transforms/vision/adjust_contrast.hpp +++ b/include/lbann/transforms/vision/adjust_contrast.hpp @@ -29,12 +29,14 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { /** * Adjust the contrast of an image. - * + * * This operates similarly to the contrast control on a television. 
*/ class adjust_contrast : public transform { @@ -49,7 +51,7 @@ class adjust_contrast : public transform { LBANN_ERROR("Contrast factor must be non-negative."); } } - + transform* copy() const override { return new adjust_contrast(*this); } std::string get_type() const override { return "adjust_contrast"; } @@ -61,6 +63,10 @@ class adjust_contrast : public transform { float m_factor; }; +// Builder function +std::unique_ptr +build_adjust_contrast_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/adjust_saturation.hpp b/include/lbann/transforms/vision/adjust_saturation.hpp index 605043b9b4b..65fb2f9e636 100644 --- a/include/lbann/transforms/vision/adjust_saturation.hpp +++ b/include/lbann/transforms/vision/adjust_saturation.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -51,7 +53,7 @@ class adjust_saturation : public transform { LBANN_ERROR("Saturation factor must be non-negative."); } } - + transform* copy() const override { return new adjust_saturation(*this); } std::string get_type() const override { return "adjust_saturation"; } @@ -63,6 +65,10 @@ class adjust_saturation : public transform { float m_factor; }; + +std::unique_ptr +build_adjust_saturation_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/center_crop.hpp b/include/lbann/transforms/vision/center_crop.hpp index e7d512c6f3e..9d4b2026a7e 100644 --- a/include/lbann/transforms/vision/center_crop.hpp +++ b/include/lbann/transforms/vision/center_crop.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -48,6 +50,9 @@ class center_crop : public transform { size_t m_h, m_w; }; +std::unique_ptr +build_center_crop_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/color_jitter.hpp b/include/lbann/transforms/vision/color_jitter.hpp index 7d952e35cf3..cd0ac8805a0 100644 --- a/include/lbann/transforms/vision/color_jitter.hpp +++ b/include/lbann/transforms/vision/color_jitter.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -52,7 +54,7 @@ class color_jitter : public transform { color_jitter(float min_brightness_factor, float max_brightness_factor, float min_contrast_factor, float max_contrast_factor, float min_saturation_factor, float max_saturation_factor); - + transform* copy() const override { return new color_jitter(*this); } std::string get_type() const override { return "color_jitter"; } @@ -74,6 +76,9 @@ class color_jitter : public transform { float m_max_saturation_factor; }; +std::unique_ptr +build_color_jitter_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/colorize.hpp b/include/lbann/transforms/vision/colorize.hpp index f5f444eacec..48864b0869f 100644 --- a/include/lbann/transforms/vision/colorize.hpp +++ b/include/lbann/transforms/vision/colorize.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -42,6 +44,9 @@ class colorize : public transform { void apply(utils::type_erased_matrix& data, std::vector& dims) override; }; +std::unique_ptr 
+build_colorize_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/cutout.hpp b/include/lbann/transforms/vision/cutout.hpp index 861877e0d3e..b41c71f2800 100644 --- a/include/lbann/transforms/vision/cutout.hpp +++ b/include/lbann/transforms/vision/cutout.hpp @@ -29,14 +29,16 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { /** * Cutout data augmentation which randomly masks out square regions of input. - * + * * See: - * + * * DeVries and Taylor. "Improved Regularization of Convolutional Neural * Networks with Cutout". arXiv preprint arXiv:1708.04552 (2017). * @@ -76,6 +78,9 @@ class cutout : public transform { size_t m_length; }; +std::unique_ptr +build_cutout_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/grayscale.hpp b/include/lbann/transforms/vision/grayscale.hpp index b185344eff6..a03b2b940cd 100644 --- a/include/lbann/transforms/vision/grayscale.hpp +++ b/include/lbann/transforms/vision/grayscale.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -42,6 +44,9 @@ class grayscale : public transform { void apply(utils::type_erased_matrix& data, std::vector& dims) override; }; +std::unique_ptr +build_grayscale_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/horizontal_flip.hpp b/include/lbann/transforms/vision/horizontal_flip.hpp index 63ea133d25d..0d7a640f698 100644 --- a/include/lbann/transforms/vision/horizontal_flip.hpp +++ b/include/lbann/transforms/vision/horizontal_flip.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -49,6 +51,9 @@ class horizontal_flip : public transform { float m_p; }; +std::unique_ptr +build_horizontal_flip_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp b/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp index 385dd1dc446..ef91c7fedaa 100644 --- a/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp +++ b/include/lbann/transforms/vision/normalize_to_lbann_layout.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -67,6 +69,10 @@ class normalize_to_lbann_layout : public transform { std::vector m_stds; }; +std::unique_ptr +build_normalize_to_lbann_layout_transform_from_pbuf( + google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/random_affine.hpp b/include/lbann/transforms/vision/random_affine.hpp index a54a392cd4c..4ef0c587a24 100644 --- a/include/lbann/transforms/vision/random_affine.hpp +++ b/include/lbann/transforms/vision/random_affine.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -71,6 +73,9 @@ class random_affine : public transform { float m_shear_min, m_shear_max; }; +std::unique_ptr +build_random_affine_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/random_crop.hpp b/include/lbann/transforms/vision/random_crop.hpp index 
43f3c003a91..dce14b98111 100644 --- a/include/lbann/transforms/vision/random_crop.hpp +++ b/include/lbann/transforms/vision/random_crop.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -48,6 +50,9 @@ class random_crop : public transform { size_t m_h, m_w; }; +std::unique_ptr +build_random_crop_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/random_resized_crop.hpp b/include/lbann/transforms/vision/random_resized_crop.hpp index 261adda9a31..8f957106303 100644 --- a/include/lbann/transforms/vision/random_resized_crop.hpp +++ b/include/lbann/transforms/vision/random_resized_crop.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -69,6 +71,9 @@ class random_resized_crop : public transform { float m_ar_min, m_ar_max; }; +std::unique_ptr +build_random_resized_crop_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp b/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp index 95de08165e9..8290254aa82 100644 --- a/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp +++ b/include/lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -56,6 +58,10 @@ class random_resized_crop_with_fixed_aspect_ratio : public transform { size_t m_crop_h, m_crop_w; }; +std::unique_ptr +build_random_resized_crop_with_fixed_aspect_ratio_transform_from_pbuf( + google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/resize.hpp b/include/lbann/transforms/vision/resize.hpp index 244c3547df5..668b925c9b9 100644 --- a/include/lbann/transforms/vision/resize.hpp +++ b/include/lbann/transforms/vision/resize.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -48,6 +50,9 @@ class resize : public transform { size_t m_h, m_w; }; +std::unique_ptr +build_resize_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/resized_center_crop.hpp b/include/lbann/transforms/vision/resized_center_crop.hpp index 81eead713b0..0ccb0ef93e6 100644 --- a/include/lbann/transforms/vision/resized_center_crop.hpp +++ b/include/lbann/transforms/vision/resized_center_crop.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -51,6 +53,9 @@ class resized_center_crop : public transform { size_t m_crop_h, m_crop_w; }; +std::unique_ptr +build_resized_center_crop_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/to_lbann_layout.hpp b/include/lbann/transforms/vision/to_lbann_layout.hpp index 27610f4e094..5cbb81f699a 100644 --- a/include/lbann/transforms/vision/to_lbann_layout.hpp +++ b/include/lbann/transforms/vision/to_lbann_layout.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -51,6 +53,9 @@ class to_lbann_layout : public transform { 
std::vector& dims) override; }; +std::unique_ptr +build_to_lbann_layout_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/include/lbann/transforms/vision/vertical_flip.hpp b/include/lbann/transforms/vision/vertical_flip.hpp index 9e02a7ea14b..712547c733a 100644 --- a/include/lbann/transforms/vision/vertical_flip.hpp +++ b/include/lbann/transforms/vision/vertical_flip.hpp @@ -29,6 +29,8 @@ #include "lbann/transforms/transform.hpp" +#include + namespace lbann { namespace transform { @@ -49,6 +51,9 @@ class vertical_flip : public transform { float m_p; }; +std::unique_ptr +build_vertical_flip_transform_from_pbuf(google::protobuf::Message const&); + } // namespace transform } // namespace lbann diff --git a/src/proto/factories/transform_factory.cpp b/src/proto/factories/transform_factory.cpp index d7ac0811d5d..6a8fce802bd 100644 --- a/src/proto/factories/transform_factory.cpp +++ b/src/proto/factories/transform_factory.cpp @@ -48,6 +48,8 @@ #include "lbann/proto/factories.hpp" #include "lbann/proto/proto_common.hpp" +#include "lbann/proto/helpers.hpp" +#include "lbann/utils/factory.hpp" #include "lbann/utils/memory.hpp" #include @@ -55,98 +57,66 @@ namespace lbann { namespace proto { +namespace { + +using factory_type = lbann::generic_factory< + transform::transform, + std::string, + generate_builder_type, + default_key_error_policy>; + +void register_default_builders(factory_type& factory) +{ + using namespace transform; + factory.register_builder("AdjustBrightness", build_adjust_brightness_transform_from_pbuf); + factory.register_builder("AdjustContrast", build_adjust_contrast_transform_from_pbuf); + factory.register_builder("AdjustSaturation", build_adjust_saturation_transform_from_pbuf); + factory.register_builder("CenterCrop", build_center_crop_transform_from_pbuf); + factory.register_builder("ColorJitter", build_color_jitter_transform_from_pbuf); + factory.register_builder("Colorize", build_colorize_transform_from_pbuf); + factory.register_builder("Cutout", build_cutout_transform_from_pbuf); + factory.register_builder("Grayscale", build_grayscale_transform_from_pbuf); + factory.register_builder("HorizontalFlip", build_horizontal_flip_transform_from_pbuf); + factory.register_builder("Normalize", build_normalize_transform_from_pbuf); + factory.register_builder("NormalizeToLBANNLayout", build_normalize_to_lbann_layout_transform_from_pbuf); + factory.register_builder("RandomAffine", build_random_affine_transform_from_pbuf); + factory.register_builder("RandomCrop", build_random_crop_transform_from_pbuf); + factory.register_builder("RandomResizedCrop", build_random_resized_crop_transform_from_pbuf); + factory.register_builder("RandomResizedCropWithFixedAspectRatio", build_random_resized_crop_with_fixed_aspect_ratio_transform_from_pbuf); + factory.register_builder("Resize", build_resize_transform_from_pbuf); + factory.register_builder("ResizedCenterCrop", build_resized_center_crop_transform_from_pbuf); + factory.register_builder("SampleNormalize", build_sample_normalize_transform_from_pbuf); + factory.register_builder("Scale", build_scale_transform_from_pbuf); + factory.register_builder("ToLBANNLayout", build_to_lbann_layout_transform_from_pbuf); + factory.register_builder("VerticalFlip", build_vertical_flip_transform_from_pbuf); +} + +// Manage a global factory +struct factory_manager +{ + factory_type factory_; + + factory_manager() { + register_default_builders(factory_); + } +}; + +factory_manager factory_mgr_; +factory_type 
const& get_transform_factory() noexcept +{ + return factory_mgr_.factory_; +} + +}// namespace std::unique_ptr construct_transform( const lbann_data::Transform& trans) { - if (trans.has_normalize()) { - auto& pb_trans = trans.normalize(); - return make_unique( - parse_list(pb_trans.means()), - parse_list(pb_trans.stddevs())); - } else if (trans.has_sample_normalize()) { - return make_unique(); - } else if (trans.has_scale()) { - return make_unique(trans.scale().scale()); - } else if (trans.has_center_crop()) { - auto& pb_trans = trans.center_crop(); - return make_unique( - pb_trans.height(), pb_trans.width()); - } else if (trans.has_colorize()) { - return make_unique(); - } else if (trans.has_grayscale()) { - return make_unique(); - } else if (trans.has_horizontal_flip()) { - return make_unique( - trans.horizontal_flip().p()); - } else if (trans.has_normalize_to_lbann_layout()) { - auto& pb_trans = trans.normalize_to_lbann_layout(); - return make_unique( - parse_list(pb_trans.means()), - parse_list(pb_trans.stddevs())); - } else if (trans.has_random_affine()) { - auto& pb_trans = trans.random_affine(); - return make_unique( - pb_trans.rotate_min(), pb_trans.rotate_max(), - pb_trans.translate_h(), pb_trans.translate_w(), - pb_trans.scale_min(), pb_trans.scale_max(), - pb_trans.shear_min(), pb_trans.shear_max()); - } else if (trans.has_random_crop()) { - auto& pb_trans = trans.random_crop(); - return make_unique( - pb_trans.height(), pb_trans.width()); - } else if (trans.has_random_resized_crop()) { - auto& pb_trans = trans.random_resized_crop(); - // Handle defaults: If one specified, all must be. - if (pb_trans.scale_min() != 0.0f) { - return make_unique( - pb_trans.height(), pb_trans.width(), - pb_trans.scale_min(), pb_trans.scale_max(), - pb_trans.ar_min(), pb_trans.ar_max()); - } else { - return make_unique( - pb_trans.height(), pb_trans.width()); - } - } else if (trans.has_random_resized_crop_with_fixed_aspect_ratio()) { - auto& pb_trans = trans.random_resized_crop_with_fixed_aspect_ratio(); - return make_unique( - pb_trans.height(), pb_trans.width(), - pb_trans.crop_height(), pb_trans.crop_width()); - } else if (trans.has_resize()) { - auto& pb_trans = trans.resize(); - return make_unique(pb_trans.height(), pb_trans.width()); - } else if (trans.has_resized_center_crop()) { - auto& pb_trans = trans.resized_center_crop(); - return make_unique( - pb_trans.height(), pb_trans.width(), - pb_trans.crop_height(), pb_trans.crop_width()); - } else if (trans.has_to_lbann_layout()) { - return make_unique(); - } else if (trans.has_vertical_flip()) { - return make_unique( - trans.vertical_flip().p()); - } else if (trans.has_adjust_brightness()) { - return make_unique( - trans.adjust_brightness().factor()); - } else if (trans.has_adjust_contrast()) { - return make_unique( - trans.adjust_contrast().factor()); - } else if (trans.has_adjust_saturation()) { - return make_unique( - trans.adjust_saturation().factor()); - } else if (trans.has_color_jitter()) { - auto& pb_trans = trans.color_jitter(); - return make_unique( - pb_trans.min_brightness_factor(), pb_trans.max_brightness_factor(), - pb_trans.min_contrast_factor(), pb_trans.max_contrast_factor(), - pb_trans.min_saturation_factor(), pb_trans.max_saturation_factor()); - } else if (trans.has_cutout()) { - auto& pb_trans = trans.cutout(); - return make_unique( - pb_trans.num_holes(), pb_trans.length()); - } - LBANN_ERROR("Unknown transform"); - return nullptr; + auto const& factory = get_transform_factory(); + auto const& msg = + 
helpers::get_oneof_message(trans, "transform_type"); + return factory.create_object(msg.GetDescriptor()->name(), msg); } transform::transform_pipeline construct_transform_pipeline( diff --git a/src/transforms/normalize.cpp b/src/transforms/normalize.cpp index d4803185d30..28ff39bb18b 100644 --- a/src/transforms/normalize.cpp +++ b/src/transforms/normalize.cpp @@ -26,6 +26,10 @@ #include "lbann/transforms/normalize.hpp" +#include "lbann/proto/proto_common.hpp" + +#include + namespace lbann { namespace transform { @@ -101,5 +105,14 @@ void normalize::apply(utils::type_erased_matrix& data, CPUMat& out, } } +std::unique_ptr +build_normalize_transform_from_pbuf(google::protobuf::Message const& msg) +{ + auto& pb_trans = dynamic_cast(msg); + return make_unique( + parse_list(pb_trans.means()), + parse_list(pb_trans.stddevs())); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/sample_normalize.cpp b/src/transforms/sample_normalize.cpp index c52d78c3ecf..2f8eab63ea3 100644 --- a/src/transforms/sample_normalize.cpp +++ b/src/transforms/sample_normalize.cpp @@ -25,6 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/transforms/sample_normalize.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/statistics.hpp" namespace lbann { @@ -45,5 +46,11 @@ void sample_normalize::apply(utils::type_erased_matrix& data, std::vector +build_sample_normalize_transform_from_pbuf(google::protobuf::Message const&) +{ + return make_unique(); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/scale.cpp b/src/transforms/scale.cpp index 9c2ceb45a38..f1b5950e4c1 100644 --- a/src/transforms/scale.cpp +++ b/src/transforms/scale.cpp @@ -25,6 +25,8 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/transforms/scale.hpp" +#include "lbann/utils/memory.hpp" +#include namespace lbann { namespace transform { @@ -44,5 +46,12 @@ void scale::apply(utils::type_erased_matrix& data, std::vector&) { } } +std::unique_ptr +build_scale_transform_from_pbuf(google::protobuf::Message const& msg) +{ + auto const& params = dynamic_cast(msg); + return make_unique(params.scale()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/adjust_brightness.cpp b/src/transforms/vision/adjust_brightness.cpp index ce8d0b98c85..014cbc00444 100644 --- a/src/transforms/vision/adjust_brightness.cpp +++ b/src/transforms/vision/adjust_brightness.cpp @@ -25,8 +25,11 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/transforms/vision/adjust_brightness.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + namespace lbann { namespace transform { @@ -45,5 +48,11 @@ void adjust_brightness::apply(utils::type_erased_matrix& data, std::vector +build_adjust_brightness_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = dynamic_cast(msg); + return make_unique(params.factor()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/adjust_contrast.cpp b/src/transforms/vision/adjust_contrast.cpp index 5af3fce227b..eca8a33c68a 100644 --- a/src/transforms/vision/adjust_contrast.cpp +++ b/src/transforms/vision/adjust_contrast.cpp @@ -24,10 +24,14 @@ // permissions and limitations under the license. 
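The new construct_transform dispatches on whichever member of the transform_type oneof is set, using that message's descriptor name as the factory key, which is why register_default_builders keys builders by proto message name ("AdjustBrightness", "CenterCrop", and so on). helpers::get_oneof_message itself is not shown in this patch, so the following is only an assumed equivalent built from the standard protobuf reflection API:

    #include <google/protobuf/descriptor.h>
    #include <google/protobuf/message.h>
    #include <stdexcept>
    #include <string>

    // Return the sub-message currently stored in the named oneof of msg.
    google::protobuf::Message const&
    get_oneof_message(google::protobuf::Message const& msg,
                      std::string const& oneof_name) {
      auto const* desc = msg.GetDescriptor();
      auto const* oneof = desc->FindOneofByName(oneof_name);
      auto const* refl = msg.GetReflection();
      // Field descriptor of the member that is set, or nullptr if none is.
      auto const* field = refl->GetOneofFieldDescriptor(msg, oneof);
      if (field == nullptr ||
          field->cpp_type() != google::protobuf::FieldDescriptor::CPPTYPE_MESSAGE)
        throw std::runtime_error("oneof not set or not a message field");
      return refl->GetMessage(msg, field);
    }

    // Usage mirrors the patch:
    //   auto const& m = get_oneof_message(trans, "transform_type");
    //   factory.create_object(m.GetDescriptor()->name(), m);
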
//////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/transforms/vision/adjust_contrast.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + +#include + namespace lbann { namespace transform { @@ -80,5 +84,11 @@ void adjust_contrast::apply(utils::type_erased_matrix& data, std::vector } } +std::unique_ptr +build_adjust_contrast_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = dynamic_cast(msg); + return make_unique(params.factor()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/adjust_saturation.cpp b/src/transforms/vision/adjust_saturation.cpp index e5224422059..2532a637856 100644 --- a/src/transforms/vision/adjust_saturation.cpp +++ b/src/transforms/vision/adjust_saturation.cpp @@ -24,10 +24,14 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/transforms/vision/adjust_saturation.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + +#include + namespace lbann { namespace transform { @@ -68,5 +72,11 @@ void adjust_saturation::apply(utils::type_erased_matrix& data, std::vector +build_adjust_saturation_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = dynamic_cast(msg); + return make_unique(params.factor()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/center_crop.cpp b/src/transforms/vision/center_crop.cpp index 9f16ccb4e78..fbc30e41fbe 100644 --- a/src/transforms/vision/center_crop.cpp +++ b/src/transforms/vision/center_crop.cpp @@ -24,10 +24,14 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/transforms/vision/center_crop.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + +#include + namespace lbann { namespace transform { @@ -61,5 +65,11 @@ void center_crop::apply(utils::type_erased_matrix& data, std::vector& di dims = new_dims; } +std::unique_ptr +build_center_crop_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = dynamic_cast(msg); + return make_unique(params.height(), params.width()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/color_jitter.cpp b/src/transforms/vision/color_jitter.cpp index e5d3b2c5294..c4833e9e9d2 100644 --- a/src/transforms/vision/color_jitter.cpp +++ b/src/transforms/vision/color_jitter.cpp @@ -24,14 +24,18 @@ // permissions and limitations under the license. 
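Several of the builders in this patch, such as build_normalize_transform_from_pbuf above, turn a list-valued string field into a numeric vector via parse_list (its element type was stripped in capture; float is a reasonable assumption). A self-contained sketch of what such a helper can look like:

    #include <sstream>
    #include <string>
    #include <vector>

    // Assumed equivalent of lbann::parse_list: split a whitespace-separated
    // string into a vector of values of type T.
    template <typename T>
    std::vector<T> parse_list(std::string const& str) {
      std::vector<T> out;
      std::istringstream iss(str);
      for (T v; iss >> v;) out.push_back(v);
      return out;
    }

    // e.g. parse_list<float>("0.485 0.456 0.406") -> {0.485f, 0.456f, 0.406f}
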
//////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/transforms/vision/color_jitter.hpp" #include "lbann/transforms/vision/adjust_brightness.hpp" #include "lbann/transforms/vision/adjust_contrast.hpp" #include "lbann/transforms/vision/adjust_saturation.hpp" #include "lbann/utils/random.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + +#include + namespace lbann { namespace transform { @@ -111,5 +115,14 @@ void color_jitter::apply(utils::type_erased_matrix& data, std::vector& d } } +std::unique_ptr +build_color_jitter_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = dynamic_cast(msg); + return make_unique( + params.min_brightness_factor(), params.max_brightness_factor(), + params.min_contrast_factor(), params.max_contrast_factor(), + params.min_saturation_factor(), params.max_saturation_factor()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/colorize.cpp b/src/transforms/vision/colorize.cpp index 4ece618b727..f32f87741d1 100644 --- a/src/transforms/vision/colorize.cpp +++ b/src/transforms/vision/colorize.cpp @@ -26,6 +26,7 @@ #include #include "lbann/transforms/vision/colorize.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" namespace lbann { @@ -44,5 +45,10 @@ void colorize::apply(utils::type_erased_matrix& data, std::vector& dims) dims = new_dims; } +std::unique_ptr +build_colorize_transform_from_pbuf(google::protobuf::Message const&) { + return make_unique(); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/cutout.cpp b/src/transforms/vision/cutout.cpp index 419a78b5e28..0a4ed9eeccd 100644 --- a/src/transforms/vision/cutout.cpp +++ b/src/transforms/vision/cutout.cpp @@ -25,8 +25,11 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/transforms/vision/cutout.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + namespace lbann { namespace transform { @@ -61,5 +64,11 @@ void cutout::apply(utils::type_erased_matrix& data, std::vector& dims) { } } +std::unique_ptr +build_cutout_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = dynamic_cast(msg); + return make_unique(params.num_holes(), params.length()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/grayscale.cpp b/src/transforms/vision/grayscale.cpp index e5b4a54fd6d..28fef85422b 100644 --- a/src/transforms/vision/grayscale.cpp +++ b/src/transforms/vision/grayscale.cpp @@ -26,6 +26,7 @@ #include #include "lbann/transforms/vision/grayscale.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" namespace lbann { @@ -44,5 +45,10 @@ void grayscale::apply(utils::type_erased_matrix& data, std::vector& dims dims = new_dims; } +std::unique_ptr +build_grayscale_transform_from_pbuf(google::protobuf::Message const&) { + return make_unique(); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/horizontal_flip.cpp b/src/transforms/vision/horizontal_flip.cpp index 3bdc190178f..fa6306ed2c3 100644 --- a/src/transforms/vision/horizontal_flip.cpp +++ b/src/transforms/vision/horizontal_flip.cpp @@ -25,8 +25,11 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/transforms/vision/horizontal_flip.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + 
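The builders for parameter-free transforms (sample_normalize, colorize, grayscale, and to_lbann_layout below) all ignore their message argument, as shown in the sketch after this note. One possible consolidation, which this patch deliberately does not make, is a single templated builder; the patch instead writes one small builder per transform, which keeps each .cpp file self-contained:

    #include <memory>

    // Forward declaration is enough for a by-reference parameter.
    namespace google { namespace protobuf { class Message; } }

    // Hypothetical alternative, not part of the patch: one builder for
    // every transform whose constructor takes no arguments.
    template <typename TransformT, typename BaseT>
    std::unique_ptr<BaseT>
    build_trivial_transform_from_pbuf(google::protobuf::Message const&) {
      return std::make_unique<TransformT>();
    }
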
namespace lbann { namespace transform { @@ -40,5 +43,11 @@ void horizontal_flip::apply(utils::type_erased_matrix& data, std::vector } } +std::unique_ptr +build_horizontal_flip_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = dynamic_cast(msg); + return make_unique(params.p()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/normalize_to_lbann_layout.cpp b/src/transforms/vision/normalize_to_lbann_layout.cpp index c65eef9052b..67a47844b08 100644 --- a/src/transforms/vision/normalize_to_lbann_layout.cpp +++ b/src/transforms/vision/normalize_to_lbann_layout.cpp @@ -25,8 +25,12 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/transforms/vision/normalize_to_lbann_layout.hpp" +#include "lbann/proto/proto_common.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + namespace lbann { namespace transform { @@ -90,5 +94,15 @@ void normalize_to_lbann_layout::apply(utils::type_erased_matrix& data, } } +std::unique_ptr +build_normalize_to_lbann_layout_transform_from_pbuf( + google::protobuf::Message const& msg) { + auto const& params = + dynamic_cast(msg); + return make_unique( + parse_list(params.means()), + parse_list(params.stddevs())); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/random_affine.cpp b/src/transforms/vision/random_affine.cpp index 90d6d7f55b8..aa43d381186 100644 --- a/src/transforms/vision/random_affine.cpp +++ b/src/transforms/vision/random_affine.cpp @@ -24,10 +24,14 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/transforms/vision/random_affine.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + +#include + namespace lbann { namespace transform { @@ -99,5 +103,15 @@ void random_affine::apply(utils::type_erased_matrix& data, std::vector& data.emplace(std::move(dst_real)); } +std::unique_ptr +build_random_affine_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = dynamic_cast(msg); + return make_unique( + params.rotate_min(), params.rotate_max(), + params.translate_h(), params.translate_w(), + params.scale_min(), params.scale_max(), + params.shear_min(), params.shear_max()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/random_crop.cpp b/src/transforms/vision/random_crop.cpp index 416bae6abfe..7c4a7ed8897 100644 --- a/src/transforms/vision/random_crop.cpp +++ b/src/transforms/vision/random_crop.cpp @@ -25,8 +25,11 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/transforms/vision/random_crop.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + namespace lbann { namespace transform { @@ -60,5 +63,12 @@ void random_crop::apply(utils::type_erased_matrix& data, std::vector& di dims = new_dims; } +std::unique_ptr +build_random_crop_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = + dynamic_cast(msg); + return make_unique(params.height(), params.width()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/random_resized_crop.cpp b/src/transforms/vision/random_resized_crop.cpp index 0fa9420e538..2f99b63ad02 100644 --- a/src/transforms/vision/random_resized_crop.cpp +++ b/src/transforms/vision/random_resized_crop.cpp @@ -24,10 +24,14 @@ 
// permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/transforms/vision/random_resized_crop.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + +#include + namespace lbann { namespace transform { @@ -89,5 +93,20 @@ void random_resized_crop::apply(utils::type_erased_matrix& data, dims = new_dims; } +std::unique_ptr +build_random_resized_crop_transform_from_pbuf( + google::protobuf::Message const& msg) { + auto const& params = + dynamic_cast(msg); + if (params.scale_min() != 0.0f) { + return make_unique( + params.height(), params.width(), + params.scale_min(), params.scale_max(), + params.ar_min(), params.ar_max()); + } else { + return make_unique(params.height(), params.width()); + } +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.cpp b/src/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.cpp index e66afd3acfa..5245cdbce1b 100644 --- a/src/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.cpp +++ b/src/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.cpp @@ -24,10 +24,14 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/transforms/vision/random_resized_crop_with_fixed_aspect_ratio.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + +#include + namespace lbann { namespace transform { @@ -67,5 +71,16 @@ void random_resized_crop_with_fixed_aspect_ratio::apply( dims = new_dims; } +std::unique_ptr +build_random_resized_crop_with_fixed_aspect_ratio_transform_from_pbuf( + google::protobuf::Message const& msg) { + using namespace lbann_data; + auto const& params = + dynamic_cast(msg); + return make_unique( + params.height(), params.width(), + params.crop_height(), params.crop_width()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/resize.cpp b/src/transforms/vision/resize.cpp index 69bf3facdac..230b1bf2c50 100644 --- a/src/transforms/vision/resize.cpp +++ b/src/transforms/vision/resize.cpp @@ -24,10 +24,14 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/transforms/vision/resize.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + +#include + namespace lbann { namespace transform { @@ -41,5 +45,11 @@ void resize::apply(utils::type_erased_matrix& data, std::vector& dims) { dims = new_dims; } +std::unique_ptr +build_resize_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = dynamic_cast(msg); + return make_unique(params.height(), params.width()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/resized_center_crop.cpp b/src/transforms/vision/resized_center_crop.cpp index 3aa370c9361..5721fc7511a 100644 --- a/src/transforms/vision/resized_center_crop.cpp +++ b/src/transforms/vision/resized_center_crop.cpp @@ -24,10 +24,14 @@ // permissions and limitations under the license. 
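build_random_resized_crop_transform_from_pbuf above keeps the old sentinel convention from the if-else chain: proto3 cannot distinguish "field unset" from "field equal to zero", so scale_min == 0 is read as "use the transform's defaults", and specifying one scale parameter implies specifying them all. A self-contained illustration of the pattern and its caveat:

    #include <iostream>

    struct CropParams {            // mimics a proto3 message: unset == 0
      float scale_min = 0.0f;
      float scale_max = 0.0f;
    };

    void build(CropParams const& p) {
      if (p.scale_min != 0.0f) {   // "if one is specified, all must be"
        std::cout << "explicit scale range\n";
      } else {
        // Caveat: a caller who genuinely wants scale_min == 0 cannot
        // express it; zero always means "library defaults".
        std::cout << "library defaults\n";
      }
    }

    int main() {
      CropParams defaults;               // both fields zero
      CropParams explicit_range;
      explicit_range.scale_min = 0.08f;
      explicit_range.scale_max = 1.0f;
      build(defaults);
      build(explicit_range);
    }
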
//////////////////////////////////////////////////////////////////////////////// -#include #include "lbann/transforms/vision/resized_center_crop.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + +#include + namespace lbann { namespace transform { @@ -63,5 +67,13 @@ void resized_center_crop::apply(utils::type_erased_matrix& data, std::vector +build_resized_center_crop_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = dynamic_cast(msg); + return make_unique( + params.height(), params.width(), + params.crop_height(), params.crop_width()); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/to_lbann_layout.cpp b/src/transforms/vision/to_lbann_layout.cpp index 49f17a7e7d5..6dc741989c4 100644 --- a/src/transforms/vision/to_lbann_layout.cpp +++ b/src/transforms/vision/to_lbann_layout.cpp @@ -25,6 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/transforms/vision/to_lbann_layout.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" namespace lbann { @@ -76,5 +77,10 @@ void to_lbann_layout::apply(utils::type_erased_matrix& data, CPUMat& out, } } +std::unique_ptr +build_to_lbann_layout_transform_from_pbuf(google::protobuf::Message const&) { + return make_unique(); +} + } // namespace transform } // namespace lbann diff --git a/src/transforms/vision/vertical_flip.cpp b/src/transforms/vision/vertical_flip.cpp index f84a789a25c..bd48657a815 100644 --- a/src/transforms/vision/vertical_flip.cpp +++ b/src/transforms/vision/vertical_flip.cpp @@ -25,8 +25,11 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/transforms/vision/vertical_flip.hpp" +#include "lbann/utils/memory.hpp" #include "lbann/utils/opencv.hpp" +#include + namespace lbann { namespace transform { @@ -40,5 +43,11 @@ void vertical_flip::apply(utils::type_erased_matrix& data, std::vector& } } +std::unique_ptr +build_vertical_flip_transform_from_pbuf(google::protobuf::Message const& msg) { + auto const& params = dynamic_cast(msg); + return make_unique(params.p()); +} + } // namespace transform } // namespace lbann From ae68c5aff255672ac1c7ded058e1bf985d035475 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 2 Aug 2019 09:08:24 -0700 Subject: [PATCH 191/634] update protobuf to build at master in superbuild --- superbuild/protobuf/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbuild/protobuf/CMakeLists.txt b/superbuild/protobuf/CMakeLists.txt index b004d000949..9b9656b87b9 100644 --- a/superbuild/protobuf/CMakeLists.txt +++ b/superbuild/protobuf/CMakeLists.txt @@ -31,7 +31,7 @@ else () endif () # ... then the tag. -set(PROTOBUF_TAG "v3.6.1" +set(PROTOBUF_TAG "master" CACHE STRING "The git tag or hash to checkout for PROTOBUF") # Where to install PROTOBUF From 64c85d6b34d829a9d2c8e81b60b2d5d43e98508c Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Fri, 2 Aug 2019 11:13:46 -0700 Subject: [PATCH 192/634] remove sync_selected callback and all references to it --- include/lbann/callbacks/CMakeLists.txt | 1 - include/lbann/callbacks/sync_selected.hpp | 145 ---------- include/lbann/layers/layer.hpp | 2 - include/lbann/lbann.hpp | 1 - src/callbacks/CMakeLists.txt | 1 - src/callbacks/sync_selected.cpp | 324 ---------------------- src/proto/callbacks.proto | 29 -- src/proto/factories/callback_factory.cpp | 3 - 8 files changed, 506 deletions(-) delete mode 100644 include/lbann/callbacks/sync_selected.hpp delete mode 100644 src/callbacks/sync_selected.cpp diff --git a/include/lbann/callbacks/CMakeLists.txt b/include/lbann/callbacks/CMakeLists.txt index bbd67d8dda5..ff9c269f350 100644 --- a/include/lbann/callbacks/CMakeLists.txt +++ b/include/lbann/callbacks/CMakeLists.txt @@ -34,7 +34,6 @@ set_full_path(THIS_DIR_HEADERS save_topk_models.hpp summary.hpp sync_layers.hpp - sync_selected.hpp timeline.hpp timer.hpp variable_minibatch.hpp diff --git a/include/lbann/callbacks/sync_selected.hpp b/include/lbann/callbacks/sync_selected.hpp deleted file mode 100644 index b9c101f294d..00000000000 --- a/include/lbann/callbacks/sync_selected.hpp +++ /dev/null @@ -1,145 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// callback_sync_selected.hpp - Callback to synchronize selected layers -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED -#define LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED - -#include "lbann/callbacks/sync_layers.hpp" -#include -#include - -namespace lbann { -namespace callback { - -/** - * Synchronize at the beginning and the end of the propagation operation(s) of - * a selected layer, which can be both/either of the forward prop and/or the - * backward prop of the layer. Additionally updates layer timing information to - * account for the synchronization at the end of propagation(s). - * When nvprof is enabled, cudaProfilerStart() follows the synchronization - * inserted at the beginning of the selected prop step(s), and cudaProfilerEnd() - * comes after the local GPU sychronization and before the global MPI barrier - * inserted at the end of the selected prop step(s). - * Note that this callback should come before the summarizer callback - * as the base callback sync_layers requires. 
- */ -class sync_selected : public sync_layers { - public: - ///type of propagation toch synchronize - enum prop_t {Both = 0, Forward = 1, Backward = 2}; - static const std::map m_prop_str; - - using layers_t = std::unordered_map; - using layer_ptrs_t = std::unordered_set; - - /** - * @param layers specifies the layers to synchronize - * @param async_gpus sets not to synchronize gpus. The default is false. - * @param async_mpi sets not to synchronize mpi. The default is false. - */ - sync_selected(const layers_t& layers, - bool async_gpus = false, bool async_mpi = false); - - sync_selected(const sync_selected&) = default; - - sync_selected& operator=( - const sync_selected&) = default; - - sync_selected* copy() const override { - return new sync_selected(*this); - } - - ~sync_selected() override; - - std::string name() const override { return "sync_selected"; } - std::string get_description() const; - - /// To protect in case that cudaProfilerInitialized() has already been called - static void turn_off_init_cuda_profiler(); - - /// Tells if cuda_profiler has been initialized - static bool check_if_cuda_profiler_initialized(); - - void init_cuda_profiler(const std::string cfg_file, const std::string out_dir, - int out_mode, lbann_comm* comm) const; - - /** Called once to set up the callback (after all layers are set up). - * Then, populate the layer pointers */ - void setup(model *m) override; - - using callback_base::on_forward_prop_begin; - using callback_base::on_backward_prop_begin; - using sync_layers::on_forward_prop_end; - using sync_layers::on_backward_prop_end; - - /// Synchronize at the beginning of the forward prop of layer l - void on_forward_prop_begin(model* m, Layer* l) override; - /// Synchronize at the end of the forward prop of layer l - void on_forward_prop_end(model* m, Layer* l) override; - /// Synchronize at the beginning of the backward prop of layer l - void on_backward_prop_begin(model* m, Layer* l) override; - /// Synchronize at the end of the backward prop of layer l - void on_backward_prop_end(model* m, Layer* l) override; - - protected: - bool check_if_all_accounted_for() const; - - layer_ptrs_t::iterator populate_layer_ptrs(Layer* l, const prop_t current_prop); - - /// Synchronize and enable cuda profiler - void do_pre_sync(Layer* l); - /// Synchronize and disble cuda profiler - void do_sync(Layer* l) override; - - /// The layers to synchronize. - layers_t m_layers; - - /** The pointers of layers to synchronize for forward prop. - * This set includes those of layers to synchronize for both props. */ - layer_ptrs_t m_fwd_ptrs; - /** The pointers of layers to synchronize for backward prop. - * This set includes those of layers to synchronize for both props. */ - layer_ptrs_t m_bwd_ptrs; - /// The pointers of layers to synchronize for both props. - layer_ptrs_t m_both_ptrs; - - bool m_all_set; ///< whether all the layer pointers are collected - - /// Tells if cudaProfilerInitialized() has already been called. 
- static bool m_cuda_profiler_initialized; -}; - -// Builder function -std::unique_ptr -build_sync_selected_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); - -} // namespace callback -} // namespace lbann - -#endif // LBANN_CALLBACKS_CALLBACK_SYNC_SELECTED_HPP_INCLUDED diff --git a/include/lbann/layers/layer.hpp b/include/lbann/layers/layer.hpp index 8ae20b69b57..f56650a9694 100644 --- a/include/lbann/layers/layer.hpp +++ b/include/lbann/layers/layer.hpp @@ -46,7 +46,6 @@ class model; class weights; namespace callback { class sync_layers; -class sync_selected; } // namespace callback /** @@ -68,7 +67,6 @@ class sync_selected; */ class Layer { friend class callback::sync_layers; - friend class callback::sync_selected; public: diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 3f3d1e2d5aa..d41fe933c90 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -166,7 +166,6 @@ #include "lbann/callbacks/save_topk_models.hpp" #include "lbann/callbacks/summary.hpp" #include "lbann/callbacks/sync_layers.hpp" -#include "lbann/callbacks/sync_selected.hpp" #include "lbann/callbacks/timeline.hpp" #include "lbann/callbacks/timer.hpp" #include "lbann/callbacks/variable_minibatch.hpp" diff --git a/src/callbacks/CMakeLists.txt b/src/callbacks/CMakeLists.txt index 1068a4955ff..27e08c5d5c9 100644 --- a/src/callbacks/CMakeLists.txt +++ b/src/callbacks/CMakeLists.txt @@ -33,7 +33,6 @@ set_full_path(THIS_DIR_SOURCES save_topk_models.cpp summary.cpp sync_layers.cpp - sync_selected.cpp timeline.cpp timer.cpp variable_minibatch.cpp diff --git a/src/callbacks/sync_selected.cpp b/src/callbacks/sync_selected.cpp deleted file mode 100644 index efe0c2a24e1..00000000000 --- a/src/callbacks/sync_selected.cpp +++ /dev/null @@ -1,324 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-//
-// callback_sync_selected.cpp - Callback to synchronize selected layers
-///////////////////////////////////////////////////////////////////////////////
-
-#include "lbann/callbacks/sync_selected.hpp"
-#include "lbann/utils/timer.hpp"
-#ifdef LBANN_NVPROF
-#include
-#include "lbann/utils/file_utils.hpp"
-#include
-#endif // LBANN_NVPROF
-
-namespace lbann {
-namespace callback {
-
-bool sync_selected::m_cuda_profiler_initialized = false;
-const std::map<sync_selected::prop_t, std::string>
-  sync_selected::m_prop_str
-  = {std::make_pair(sync_selected::prop_t::Both, "Both"),
-     std::make_pair(sync_selected::prop_t::Forward, "Forward"),
-     std::make_pair(sync_selected::prop_t::Backward, "Backward")};
-
-sync_selected::sync_selected(
-  const sync_selected::layers_t& layers, bool async_gpus, bool async_mpi)
-  : sync_layers(!async_gpus, !async_mpi, false),
-    m_layers(layers), m_all_set(false) {
-  #ifdef LBANN_NVPROF
-  cudaProfilerStop(); // make sure to flush out profile data
-  #endif
-
-  size_t cnt_fwd = 0u;
-  size_t cnt_bwd = 0u;
-  for(const auto& l: m_layers) {
-    switch (l.second) {
-    case Forward: cnt_fwd ++; break;
-    case Backward: cnt_bwd ++; break;
-    case Both: cnt_fwd ++; cnt_bwd ++; break;
-    }
-  }
-  m_fwd_ptrs.reserve(cnt_fwd);
-  m_bwd_ptrs.reserve(cnt_bwd);
-}
-
-sync_selected::~sync_selected() {
-  #ifdef LBANN_NVPROF
-  cudaProfilerStop(); // make sure to flush out profile data
-  #endif
-}
-
-std::string sync_selected::get_description() const {
-  std::string selection;
-  for (const auto& l: m_layers) {
-    std::map<prop_t, std::string>::const_iterator it = m_prop_str.find(l.second);
-    selection += l.first + '.' + it->second + ' ';
-  }
-  return "sync_selected : { " + selection + '}';
-}
-
-void sync_selected::turn_off_init_cuda_profiler() {
-  m_cuda_profiler_initialized = true;
-}
-
-bool sync_selected::check_if_cuda_profiler_initialized() {
-  return m_cuda_profiler_initialized;
-}
-
-/**
- * Allow users to pass parameters to cudaProfilerInitialize() via prototext.
- * @param cfg_file configuration file for cuda profiler.
- *        (cuda_profiler_setup::config_file in the prototext)
- * @param out_dir output directory for cuda profiler.
- *        (cuda_profiler_setup::output_dir in the prototext)
- * @param out_mode output mode for cuda profiler.
- *        (cuda_profiler_setup::output_mode in the prototext)
- * @param comm global world communicator.
- * The profile output will be written to out_dir/layer_name.prop.rank.prof
- */
-void sync_selected::init_cuda_profiler(
-  const std::string cfg_file, const std::string out_dir, int out_mode, lbann_comm* comm) const {
-#ifdef LBANN_NVPROF
-  if (check_if_cuda_profiler_initialized()) {
-    return;
-  }
-  turn_off_init_cuda_profiler();
-
-  std::string o_dir = out_dir;
-  if (comm->am_world_master()) {
-    if (!lbann::create_dir(o_dir)) {
-      throw lbann_exception("sync_selected failed to create output directory: " + out_dir);
-    }
-  }
-  o_dir = add_delimiter(o_dir);
-
-  El::GPUManager::SynchronizeDevice();
-  comm->global_barrier();
-
-  std::string selection;
-  for (const auto& l: m_layers) {
-    std::map<prop_t, std::string>::const_iterator it = m_prop_str.find(l.second);
-    selection += l.first + '.' + it->second + '.';
-  }
-  const std::string o_prefix = o_dir + selection;
-  const int my_rank = comm->get_rank_in_world();
-  const std::string o_file = o_prefix + std::to_string(my_rank) + ".prof";
-  const cudaOutputMode_t o_mode = (out_mode == 0)?
cudaKeyValuePair : cudaCSV;
-
-  const auto ret = cudaProfilerInitialize(cfg_file.c_str(), o_file.c_str(), o_mode);
-
-  if (ret == cudaErrorInvalidValue) {
-    throw lbann_exception("sync_selected is unable to initialize cuda profiler: invalid inputs.");
-  } else if (ret == cudaErrorProfilerDisabled) {
-    std::stringstream err;
-    err << "sync_selected is unable to initialize cuda profiler: " << std::endl
-        << "  An external profiling tool (nvprof/nvvp) may already be running." << std::endl
-        << "  To use this callback with such a tool, set 'cuda_profiler::no_init'." << std::endl;
-    throw lbann_exception(err.str());
-  } else {
-    cudaProfilerStop(); // suppress profiling until reaching the region of interest
-
-    if (comm->am_world_master()) {
-      std::string msg = "Preparing callback sync_selected";
-      if (!o_prefix.empty()) {
-        msg += " with cudaProfiler writing to " + o_prefix + ".rank.prof";
-      }
-      std::cout << msg << std::endl;
-    }
-  }
-#endif
-}
-
-void sync_selected::setup(model *m) {
-  const std::vector<Layer*>& layers = m->get_layers();
-  for (auto l: layers) {
-    populate_layer_ptrs(l, Forward);
-    populate_layer_ptrs(l, Backward);
-  }
-  if (!m_all_set) {
-    throw lbann_exception("sync_selected cannot recognize all the layer names");
-  }
-}
-
-
-void sync_selected::on_forward_prop_begin(model *m, Layer *l) {
-  const layer_ptrs_t::const_iterator it = m_fwd_ptrs.find(l);
-
-  if (it == m_fwd_ptrs.cend()) {
-    return;
-  }
-  // We do not measure the time to synchronize here, and thus do not
-  // contribute it back to the cost of the preceding layer, as we are only
-  // interested in the selected layer.
-  do_pre_sync(l);
-}
-
-void sync_selected::on_forward_prop_end(model *m, Layer *l) {
-  const layer_ptrs_t::const_iterator it = m_fwd_ptrs.find(l);
-  if (it == m_fwd_ptrs.cend()) {
-    return;
-  }
-  const double start = get_time();
-  do_sync(l);
-  l->m_fp_time += get_time() - start;
-}
-
-void sync_selected::on_backward_prop_begin(model *m, Layer *l) {
-  const layer_ptrs_t::const_iterator it = m_bwd_ptrs.find(l);
-
-  if (it == m_bwd_ptrs.cend()) {
-    return;
-  }
-  do_pre_sync(l);
-}
-
-void sync_selected::on_backward_prop_end(model *m, Layer *l) {
-  const layer_ptrs_t::const_iterator it = m_bwd_ptrs.find(l);
-  if (it == m_bwd_ptrs.cend()) {
-    return;
-  }
-  const double start = get_time();
-  do_sync(l);
-  l->m_bp_time += get_time() - start;
-}
-
-bool sync_selected::check_if_all_accounted_for() const {
-  return (m_fwd_ptrs.size() + m_bwd_ptrs.size()
-          == m_layers.size() + m_both_ptrs.size());
-}
-
-/**
- * When the pointer of a selected layer is not known, rely on the layer name
- * to match. The first time a match is found, save the pointer of the
- * selected layer and use it for the subsequent matching instead of the name.
- */
-sync_selected::layer_ptrs_t::iterator
-sync_selected::populate_layer_ptrs(
-  Layer* l, const sync_selected::prop_t current_prop) {
-
-  std::pair<layer_ptrs_t::iterator, bool> ret
-    = std::make_pair(((current_prop == Forward)?
m_fwd_ptrs.end() : m_bwd_ptrs.end()), false);
-
-  const layers_t::const_iterator it = m_layers.find(l->get_name());
-
-  if (it != m_layers.cend()) { // A matching layer is found
-    const prop_t selected_prop = it->second;
-
-    if ((selected_prop != Both) && (selected_prop != current_prop)) {
-      return ret.first; // Prop direction does not match
-    }
-
-    if (selected_prop == Forward) {
-      ret = m_fwd_ptrs.emplace(l);
-    } else if (selected_prop == Backward) {
-      ret = m_bwd_ptrs.emplace(l);
-    } else { // Both
-      m_both_ptrs.emplace(l);
-
-      if (current_prop == Forward) {
-        ret = m_fwd_ptrs.emplace(l);
-        m_bwd_ptrs.emplace(l);
-      } else {
-        m_fwd_ptrs.emplace(l);
-        ret = m_bwd_ptrs.emplace(l);
-      }
-    }
-    if (check_if_all_accounted_for()) {
-      m_all_set = true;
-    }
-  }
-  return ret.first;
-}
-
-
-void sync_selected::do_pre_sync(Layer *l) {
-  sync_layers::do_sync(l);
-  #ifdef LBANN_NVPROF
-  cudaProfilerStart();
-  #endif
-}
-
-void sync_selected::do_sync(Layer *l) {
-#ifdef LBANN_NVPROF // (also defined LBANN_HAS_GPU)
-  if (m_sync_gpus) {
-    El::GPUManager::SynchronizeDevice();
-    cudaProfilerStop();
-  }
-  if (m_sync_mpi) {
-    l->get_comm()->global_barrier();
-  }
-  if (!m_sync_gpus) {
-    cudaProfilerStop();
-  }
-#else
-  sync_layers::do_sync(l);
-#endif
-}
-
-std::unique_ptr<callback_base>
-build_sync_selected_callback_from_pbuf(
-  const google::protobuf::Message& proto_msg, lbann_summary*) {
-  const auto& params =
-    dynamic_cast<const lbann_data::Callback::CallbackSyncSelected&>(proto_msg);
-  const int num_layers = params.layer_to_sync_size();
-  if (num_layers == 0) {
-    throw lbann_exception("sync_selected requires at least a layer "
-                          "to synchronize.");
-  }
-
-  using layers_t = sync_selected::layers_t;
-  using prop_t = sync_selected::prop_t;
-
-  layers_t selected_layers;
-  selected_layers.reserve(num_layers);
-
-  for (int i = 0; i < num_layers; ++i) {
-    const auto& layer_to_sync = params.layer_to_sync(i);
-    selected_layers.emplace(layer_to_sync.name(),
-                            static_cast<prop_t>(layer_to_sync.prop()));
-  }
-
-  auto cb_ptr
-    = make_unique<sync_selected>(selected_layers,
-                                 params.async_gpus(),
-                                 params.async_mpi());
-
-#ifdef LBANN_NVPROF
-  const auto& cp_setup = params.cuda_profiler_setup();
-  if (cp_setup.no_init()) {
-    sync_selected::turn_off_init_cuda_profiler();
-  } else {
-    cb_ptr->init_cuda_profiler(cp_setup.config_file(),
-                               cp_setup.output_dir(),
-                               cp_setup.output_mode(),
-                               comm);
-  }
-#endif // LBANN_NVPROF
-  return cb_ptr;
-}
-
-} // namespace callback
-} // namespace lbann
diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto
index e72773e864f..647d852e09a 100644
--- a/src/proto/callbacks.proto
+++ b/src/proto/callbacks.proto
@@ -65,7 +65,6 @@ message Callback {
     CallbackReplaceWeights replace_weights = 31;
     CallbackGPUMemoryUsage gpu_memory_usage = 32;
     CallbackSyncLayers sync_layers = 33;
-    CallbackSyncSelected sync_selected = 34;
     CallbackConfusionMatrix confusion_matrix = 36;
     CallbackCheckMetric check_metric = 37;
     CallbackPerturbAdam perturb_adam = 38;
@@ -273,34 +272,6 @@ message Callback {
     bool only_input = 3;
   }

-  message CallbackSyncSelected {
-    message LayerToSync {
-      enum PropDirection {
-        Both = 0;
-        Forward = 1;
-        Backward = 2;
-      }
-      string name = 1; // name of the layer to synchronize
-      PropDirection prop = 2; // propagation step to synchronize
-    }
-
-    message CudaProfilerSetup {
-      enum OutputMode {
-        KeyValuePair = 0;
-        CSV = 1;
-      }
-      bool no_init = 1;
-      string config_file = 2;
-      string output_dir = 3;
-      OutputMode output_mode = 4;
-    }
-
-    bool async_gpus = 1;
-    bool async_mpi = 2;
-    repeated LayerToSync layer_to_sync = 3;
-    CudaProfilerSetup cuda_profiler_setup = 4;
-
} - message CallbackConfusionMatrix { string prediction = 1; // Prediction layer string label = 2; // Label layer diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index d8b24f6e183..ff142018276 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -58,7 +58,6 @@ #include "lbann/callbacks/save_topk_models.hpp" #include "lbann/callbacks/summary.hpp" #include "lbann/callbacks/sync_layers.hpp" -#include "lbann/callbacks/sync_selected.hpp" #include "lbann/callbacks/timeline.hpp" #include "lbann/callbacks/timer.hpp" #include "lbann/callbacks/variable_minibatch.hpp" @@ -173,8 +172,6 @@ void register_default_builders(factory_type& factory) build_summary_callback_from_pbuf); factory.register_builder("CallbackSyncLayers", build_sync_layers_callback_from_pbuf); - factory.register_builder("CallbackSyncSelected", - build_sync_selected_callback_from_pbuf); factory.register_builder("CallbackTimeline", build_timeline_callback_from_pbuf); factory.register_builder("CallbackTimer", From a6f797fa00ea68a75514fb1b379d36f6c76fcd72 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 2 Aug 2019 13:52:40 -0700 Subject: [PATCH 193/634] address PR comments --- python/lbann/util/class_generator.py | 2 +- src/proto/layers.proto | 14 +------------- src/proto/metrics.proto | 2 +- src/proto/optimizers.proto | 2 +- src/proto/weights.proto | 2 +- 5 files changed, 5 insertions(+), 17 deletions(-) diff --git a/python/lbann/util/class_generator.py b/python/lbann/util/class_generator.py index 7a72dfe204b..62a5f5b9321 100644 --- a/python/lbann/util/class_generator.py +++ b/python/lbann/util/class_generator.py @@ -103,7 +103,7 @@ def export_proto(self): # elsewhere. But this code either works or doesn't get # executed now, so I vote delaying this fix until a need # arises. - proto_modules = set([callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, weights_pb2]) + proto_modules = [callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, weights_pb2] proto_type = None while proto_type is None: proto_type = getattr(proto_modules.pop(), message_name, None) diff --git a/src/proto/layers.proto b/src/proto/layers.proto index f365c2f47de..417dd30ec91 100644 --- a/src/proto/layers.proto +++ b/src/proto/layers.proto @@ -44,19 +44,7 @@ message Layer { string bottom = 155; string type = 156; - // a Layer should contain exactly one of the following - // (this may or may not be properly checked for in proto_common.cpp) - // - // @todo: this should be done better using oneof: - // oneof a_layer { - // Reshape reshape = 306 - // Pooling pooling = 12; - // ... 
- // } - // - // - - oneof layer_type { + oneof layer_type { // Input layers Input input = 2; diff --git a/src/proto/metrics.proto b/src/proto/metrics.proto index 2c59ebdc66e..5da7c671f4d 100644 --- a/src/proto/metrics.proto +++ b/src/proto/metrics.proto @@ -37,4 +37,4 @@ message Metric { } LayerMetric layer_metric = 11; -} \ No newline at end of file +} diff --git a/src/proto/optimizers.proto b/src/proto/optimizers.proto index c914efd26d5..48df859c926 100644 --- a/src/proto/optimizers.proto +++ b/src/proto/optimizers.proto @@ -68,4 +68,4 @@ message Optimizer { double momentum = 2; // Set to zero for vanilla SGD bool nesterov = 4; } -} \ No newline at end of file +} diff --git a/src/proto/weights.proto b/src/proto/weights.proto index 05617731318..bd09244e825 100644 --- a/src/proto/weights.proto +++ b/src/proto/weights.proto @@ -71,4 +71,4 @@ message Initializer { message HeUniformInitializer {} message LeCunNormalInitializer {} message LeCunUniformInitializer {} -} \ No newline at end of file +} From 39f9f46582e139ebb3479012c0242c02e016ae42 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Fri, 2 Aug 2019 14:24:52 -0700 Subject: [PATCH 194/634] Changed lbann_summary* to a shared_ptr --- include/lbann/callbacks/callback.hpp | 10 +++++----- include/lbann/callbacks/debug.hpp | 2 +- include/lbann/callbacks/debug_io.hpp | 2 +- include/lbann/callbacks/imcomm.hpp | 4 ++-- include/lbann/callbacks/ltfb.hpp | 2 +- include/lbann/callbacks/summary.hpp | 3 +-- include/lbann/callbacks/timer.hpp | 2 +- src/callbacks/check_gradients.cpp | 2 +- src/callbacks/check_metric.cpp | 2 +- src/callbacks/checkpoint.cpp | 2 +- src/callbacks/confusion_matrix.cpp | 2 +- src/callbacks/debug.cpp | 2 +- src/callbacks/debug_io.cpp | 2 +- src/callbacks/dump_error_signals.cpp | 2 +- src/callbacks/dump_gradients.cpp | 2 +- .../dump_minibatch_sample_indices.cpp | 2 +- src/callbacks/dump_outputs.cpp | 2 +- src/callbacks/dump_weights.cpp | 2 +- src/callbacks/early_stopping.cpp | 2 +- src/callbacks/hang.cpp | 2 +- src/callbacks/imcomm.cpp | 6 +++--- src/callbacks/learning_rate.cpp | 12 +++++------ src/callbacks/ltfb.cpp | 14 ++++++------- src/callbacks/mixup.cpp | 2 +- src/callbacks/monitor_io.cpp | 2 +- src/callbacks/perturb_adam.cpp | 2 +- src/callbacks/perturb_dropout.cpp | 2 +- src/callbacks/print_statistics.cpp | 2 +- src/callbacks/profiler.cpp | 2 +- src/callbacks/replace_weights.cpp | 2 +- src/callbacks/save_images.cpp | 2 +- src/callbacks/save_model.cpp | 2 +- src/callbacks/save_topk_models.cpp | 2 +- src/callbacks/summary.cpp | 8 ++------ src/callbacks/sync_layers.cpp | 2 +- src/callbacks/sync_selected.cpp | 2 +- src/callbacks/timeline.cpp | 2 +- src/callbacks/timer.cpp | 2 +- src/callbacks/variable_minibatch.cpp | 4 ++-- src/proto/factories/callback_factory.cpp | 20 +++++++++++++++---- 40 files changed, 75 insertions(+), 68 deletions(-) diff --git a/include/lbann/callbacks/callback.hpp b/include/lbann/callbacks/callback.hpp index 5a3abe68e7e..1337a3d3b8e 100644 --- a/include/lbann/callbacks/callback.hpp +++ b/include/lbann/callbacks/callback.hpp @@ -38,7 +38,7 @@ // builders. #define LBANN_ADD_DEFAULT_CALLBACK_BUILDER(Class, FunctionName) \ inline std::unique_ptr FunctionName( \ - const google::protobuf::Message&, lbann_summary*) { \ + const google::protobuf::Message&, std::shared_ptr const&) { \ return lbann::make_unique(); \ } @@ -62,10 +62,10 @@ class callback_base { * summarizer. 
*/ callback_base(int batch_interval = 1, - lbann_summary *summarizer = nullptr) : + const std::shared_ptr& summarizer = nullptr) : m_batch_interval(std::max(batch_interval, 1)), m_summarizer(summarizer) {} callback_base(const callback_base&) = default; - virtual ~callback_base() {} + virtual ~callback_base() = default; ///@} /** @name Polymorphic copy */ @@ -77,7 +77,7 @@ class callback_base { /** @name Modifiers */ ///@{ - void set_summarizer(lbann_summary *summarizer) { + void set_summarizer(const std::shared_ptr& summarizer) { m_summarizer = summarizer; } @@ -190,7 +190,7 @@ class callback_base { /** @brief Batch methods should once every this many steps. */ int m_batch_interval; /** @brief Optional summarizer for the callbacks to use. */ - lbann_summary *m_summarizer; + std::shared_ptr m_summarizer; }; } // namespace lbann diff --git a/include/lbann/callbacks/debug.hpp b/include/lbann/callbacks/debug.hpp index 455e8b16c4b..c5a1382e543 100644 --- a/include/lbann/callbacks/debug.hpp +++ b/include/lbann/callbacks/debug.hpp @@ -51,7 +51,7 @@ class debug : public callback_base { * execution modes. */ debug(std::set modes, - lbann_summary *summarizer = nullptr) : + const std::shared_ptr& summarizer = nullptr) : callback_base(1, summarizer), m_modes(std::move(modes)) {} debug(const debug&) = default; debug& operator=(const debug&) = default; diff --git a/include/lbann/callbacks/debug_io.hpp b/include/lbann/callbacks/debug_io.hpp index 6d2b11866bb..5aaf5fe2e17 100644 --- a/include/lbann/callbacks/debug_io.hpp +++ b/include/lbann/callbacks/debug_io.hpp @@ -54,7 +54,7 @@ class debug_io : public callback_base { */ debug_io(execution_mode phase = execution_mode::invalid, int debug_lvl = 0, - lbann_summary *summarizer = nullptr) : + const std::shared_ptr& summarizer = nullptr) : callback_base(1, summarizer), m_debug_phase(phase), m_debug_lvl(debug_lvl) {} debug_io(const debug_io&) = default; debug_io& operator=( diff --git a/include/lbann/callbacks/imcomm.hpp b/include/lbann/callbacks/imcomm.hpp index e906d68a0ea..77d27bab183 100644 --- a/include/lbann/callbacks/imcomm.hpp +++ b/include/lbann/callbacks/imcomm.hpp @@ -54,7 +54,7 @@ class imcomm : public callback_base { * Initialize with ct being used for all weights. */ imcomm(comm_type ct = NORMAL, - lbann_summary *summarizer = nullptr); + const std::shared_ptr& summarizer = nullptr); imcomm(const imcomm&) = default; imcomm& operator=(const imcomm&) = default; imcomm* copy() const override { @@ -65,7 +65,7 @@ class imcomm : public callback_base { * Implies no inter-model updates for other weights. */ imcomm(comm_type ct, std::unordered_set weights_list, - lbann_summary *summarizer = nullptr); + const std::shared_ptr& summarizer = nullptr); /** Choose comm type ct for weights. 
*/ void set_weights_comm(weights *w, comm_type ct); diff --git a/include/lbann/callbacks/ltfb.hpp b/include/lbann/callbacks/ltfb.hpp index f6a4adbd06e..8115b30b094 100644 --- a/include/lbann/callbacks/ltfb.hpp +++ b/include/lbann/callbacks/ltfb.hpp @@ -122,7 +122,7 @@ class ltfb : public callback_base { bool low_score_wins = false, communication_algorithm comm_algo = communication_algorithm::sendrecv_weights, bool exchange_hyperparameters = false, - lbann_summary *summarizer = nullptr); + const std::shared_ptr& summarizer = nullptr); ltfb(const ltfb& other); ltfb& operator=(const ltfb& other); ltfb* copy() const override { return new ltfb(*this); } diff --git a/include/lbann/callbacks/summary.hpp b/include/lbann/callbacks/summary.hpp index c9c2512c3ef..f40a2ddcd47 100644 --- a/include/lbann/callbacks/summary.hpp +++ b/include/lbann/callbacks/summary.hpp @@ -47,9 +47,8 @@ class summary : public callback_base { * @param mat_interval FIXME * @todo Document mat_interval parameter. */ - summary(lbann_summary *summarizer, int batch_interval = 1, + summary(const std::shared_ptr& summarizer, int batch_interval = 1, int mat_interval = 25); - ~summary() override; summary(const summary&) = default; summary& operator=(const summary&) = default; summary* copy() const override { diff --git a/include/lbann/callbacks/timer.hpp b/include/lbann/callbacks/timer.hpp index 2864b075325..f38dbb8cb99 100644 --- a/include/lbann/callbacks/timer.hpp +++ b/include/lbann/callbacks/timer.hpp @@ -43,7 +43,7 @@ namespace callback { class timer : public callback_base { public: - timer(lbann_summary *summarizer = nullptr) + timer(const std::shared_ptr& summarizer = nullptr) : callback_base(1, summarizer) {} timer(const timer&) = default; timer& operator=(const timer&) = default; diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp index f77b3d87764..bbd635fdb49 100644 --- a/src/callbacks/check_gradients.cpp +++ b/src/callbacks/check_gradients.cpp @@ -232,7 +232,7 @@ void check_gradients::on_test_end(model *m) { // Builder function std::unique_ptr build_check_gradients_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.step_size(), diff --git a/src/callbacks/check_metric.cpp b/src/callbacks/check_metric.cpp index 5a139b17416..e92d3098bcc 100644 --- a/src/callbacks/check_metric.cpp +++ b/src/callbacks/check_metric.cpp @@ -91,7 +91,7 @@ void check_metric::do_check_metric(const model& m) const { std::unique_ptr build_check_metric_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, std::shared_ptr const&) { const auto& params = dynamic_cast(proto_msg); const auto& modes = diff --git a/src/callbacks/checkpoint.cpp b/src/callbacks/checkpoint.cpp index fd10eb718ff..2b4904ee688 100644 --- a/src/callbacks/checkpoint.cpp +++ b/src/callbacks/checkpoint.cpp @@ -325,7 +325,7 @@ bool checkpoint::restart(model *m) { std::unique_ptr build_checkpoint_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.checkpoint_dir(), diff --git a/src/callbacks/confusion_matrix.cpp b/src/callbacks/confusion_matrix.cpp index c8215f41c74..6d0eeb17b52 100644 --- a/src/callbacks/confusion_matrix.cpp +++ 
b/src/callbacks/confusion_matrix.cpp @@ -235,7 +235,7 @@ void confusion_matrix::save_confusion_matrix(const model& m) { std::unique_ptr build_confusion_matrix_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.prediction(), diff --git a/src/callbacks/debug.cpp b/src/callbacks/debug.cpp index f7c36534f5b..d8e44f00d1a 100644 --- a/src/callbacks/debug.cpp +++ b/src/callbacks/debug.cpp @@ -160,7 +160,7 @@ void debug::on_optimize_end(model *m, weights *w) { std::unique_ptr build_debug_callback_from_pbuf(const google::protobuf::Message& proto_msg, - lbann_summary* summarizer) { + const std::shared_ptr& summarizer) { const auto& params = dynamic_cast(proto_msg); const auto& modes = diff --git a/src/callbacks/debug_io.cpp b/src/callbacks/debug_io.cpp index 8ddcb35c2b1..15b1d3657ab 100644 --- a/src/callbacks/debug_io.cpp +++ b/src/callbacks/debug_io.cpp @@ -155,7 +155,7 @@ void debug_io::on_test_begin(model *m) { std::unique_ptr build_debug_io_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); const auto& phase = exe_mode_from_string(params.phase()); diff --git a/src/callbacks/dump_error_signals.cpp b/src/callbacks/dump_error_signals.cpp index 99445873bfa..eee0be5bfcf 100644 --- a/src/callbacks/dump_error_signals.cpp +++ b/src/callbacks/dump_error_signals.cpp @@ -55,7 +55,7 @@ void dump_error_signals::on_backward_prop_end(model *m, Layer *l) { std::unique_ptr build_dump_error_signals_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.basename()); diff --git a/src/callbacks/dump_gradients.cpp b/src/callbacks/dump_gradients.cpp index 5b57482f2d4..f8ed1d23679 100644 --- a/src/callbacks/dump_gradients.cpp +++ b/src/callbacks/dump_gradients.cpp @@ -53,7 +53,7 @@ void dump_gradients::on_backward_prop_end(model *m) { std::unique_ptr build_dump_gradients_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.basename(), diff --git a/src/callbacks/dump_minibatch_sample_indices.cpp b/src/callbacks/dump_minibatch_sample_indices.cpp index 58657dbc309..d4b29d06b1c 100644 --- a/src/callbacks/dump_minibatch_sample_indices.cpp +++ b/src/callbacks/dump_minibatch_sample_indices.cpp @@ -80,7 +80,7 @@ void dump_minibatch_sample_indices::on_evaluate_forward_prop_end(model *m, Layer std::unique_ptr build_dump_mb_indices_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique( diff --git a/src/callbacks/dump_outputs.cpp b/src/callbacks/dump_outputs.cpp index 926715e87ba..38ffd3f7149 100644 --- a/src/callbacks/dump_outputs.cpp +++ b/src/callbacks/dump_outputs.cpp @@ -181,7 +181,7 @@ void dump_outputs::do_dump_outputs(const model& m, const Layer& l) { std::unique_ptr build_dump_outputs_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const 
google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); const auto& layer_names = parse_set(params.layers()); diff --git a/src/callbacks/dump_weights.cpp b/src/callbacks/dump_weights.cpp index 5ad35f2a742..c047d5b998f 100644 --- a/src/callbacks/dump_weights.cpp +++ b/src/callbacks/dump_weights.cpp @@ -58,7 +58,7 @@ void dump_weights::do_dump_weights(model *m, std::string s) { std::unique_ptr build_dump_weights_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.basename()); diff --git a/src/callbacks/early_stopping.cpp b/src/callbacks/early_stopping.cpp index 4232b71ac6b..7387b8339cf 100644 --- a/src/callbacks/early_stopping.cpp +++ b/src/callbacks/early_stopping.cpp @@ -63,7 +63,7 @@ void early_stopping::on_validation_end(model *m) { std::unique_ptr build_early_stopping_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.patience()); diff --git a/src/callbacks/hang.cpp b/src/callbacks/hang.cpp index 4fb7d151166..70b53f5de8e 100644 --- a/src/callbacks/hang.cpp +++ b/src/callbacks/hang.cpp @@ -46,7 +46,7 @@ void hang::setup(model* m) std::unique_ptr build_hang_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, std::shared_ptr const&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.rank()); diff --git a/src/callbacks/imcomm.cpp b/src/callbacks/imcomm.cpp index 0c94b58325b..c02c08dcdfc 100644 --- a/src/callbacks/imcomm.cpp +++ b/src/callbacks/imcomm.cpp @@ -36,12 +36,12 @@ namespace lbann { namespace callback { imcomm::imcomm(imcomm::comm_type ct, - lbann_summary *summarizer) : + const std::shared_ptr& summarizer) : callback_base(1, summarizer), m_default_ct(ct) {} imcomm::imcomm(imcomm::comm_type ct, std::unordered_set weights_list, - lbann_summary *summarizer) : + const std::shared_ptr& summarizer) : imcomm(ct, summarizer) { for (weights *w : weights_list) { m_weights_params[w] = {}; @@ -162,7 +162,7 @@ std::string get_comm_type_name(imcomm::comm_type m) { std::unique_ptr build_imcomm_callback_from_pbuf( const google::protobuf::Message& proto_msg, - lbann_summary* summarizer) { + const std::shared_ptr& summarizer) { const auto& params = dynamic_cast(proto_msg); const auto& type_str = params.intertrainer_comm_method(); imcomm::comm_type type = imcomm::comm_type::NONE; diff --git a/src/callbacks/learning_rate.cpp b/src/callbacks/learning_rate.cpp index e41aed36f20..099ba11aad6 100644 --- a/src/callbacks/learning_rate.cpp +++ b/src/callbacks/learning_rate.cpp @@ -307,7 +307,7 @@ float optimizerwise_adaptive_learning_rate::optimizer_schedule( std::unique_ptr build_step_learning_rate_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique( @@ -318,7 +318,7 @@ build_step_learning_rate_callback_from_pbuf( std::unique_ptr build_adaptive_learning_rate_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = 
dynamic_cast(proto_msg); return make_unique( @@ -329,7 +329,7 @@ build_adaptive_learning_rate_callback_from_pbuf( std::unique_ptr build_drop_fixed_learning_rate_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); std::vector drop_epochs; @@ -344,7 +344,7 @@ build_drop_fixed_learning_rate_callback_from_pbuf( std::unique_ptr build_linear_growth_learning_rate_callback_from_pbuf( - const google::protobuf::Message& proto_msg,lbann_summary*) { + const google::protobuf::Message& proto_msg,const std::shared_ptr&) { using MsgType = lbann_data::Callback::CallbackLinearGrowthLearningRate; using CallbackType = linear_growth_learning_rate; const auto& params = @@ -357,7 +357,7 @@ build_linear_growth_learning_rate_callback_from_pbuf( std::unique_ptr build_optimizerwise_adaptive_learning_rate_callback_from_pbuf( - const google::protobuf::Message& proto_msg,lbann_summary*) { + const google::protobuf::Message& proto_msg,const std::shared_ptr&) { using MsgType = lbann_data::Callback::CallbackOptimizerwiseAdaptiveLearningRate; using CallbackType = optimizerwise_adaptive_learning_rate; const auto& params = dynamic_cast(proto_msg); @@ -367,7 +367,7 @@ build_optimizerwise_adaptive_learning_rate_callback_from_pbuf( std::unique_ptr build_poly_learning_rate_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique( diff --git a/src/callbacks/ltfb.cpp b/src/callbacks/ltfb.cpp index 511a160f841..8104d7587bb 100644 --- a/src/callbacks/ltfb.cpp +++ b/src/callbacks/ltfb.cpp @@ -323,12 +323,12 @@ EvalType evaluate(model& m, const std::string& metric_name) { } // namespace ltfb::ltfb(El::Int batch_interval, - std::string metric_name, - std::set weights_names, - bool low_score_wins, - communication_algorithm comm_algo, - bool exchange_hyperparameters, - lbann_summary *summarizer) + std::string metric_name, + std::set weights_names, + bool low_score_wins, + communication_algorithm comm_algo, + bool exchange_hyperparameters, + const std::shared_ptr& summarizer) : callback_base(batch_interval, summarizer), m_metric_name(std::move(metric_name)), m_weights_names(std::move(weights_names)), @@ -528,7 +528,7 @@ ltfb::string_to_comm_algo(const std::string& str) { std::unique_ptr build_ltfb_callback_from_pbuf( const google::protobuf::Message& proto_msg, - lbann_summary* summarizer) { + std::shared_ptr const& summarizer) { const auto& params = dynamic_cast(proto_msg); return make_unique( diff --git a/src/callbacks/mixup.cpp b/src/callbacks/mixup.cpp index 5e49fae4085..1e83cd7c709 100644 --- a/src/callbacks/mixup.cpp +++ b/src/callbacks/mixup.cpp @@ -100,7 +100,7 @@ void mixup::on_forward_prop_end(model *m, Layer *l) { std::unique_ptr build_mixup_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); const auto& layers_list = parse_list(params.layers()); diff --git a/src/callbacks/monitor_io.cpp b/src/callbacks/monitor_io.cpp index 19be961ade1..3ceca25ad12 100644 --- a/src/callbacks/monitor_io.cpp +++ b/src/callbacks/monitor_io.cpp @@ -72,7 +72,7 @@ void monitor_io::on_test_end(model *m) { std::unique_ptr build_monitor_io_callback_from_pbuf( - const google::protobuf::Message& 
proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique( diff --git a/src/callbacks/perturb_adam.cpp b/src/callbacks/perturb_adam.cpp index 6f63060d771..8494fec43af 100644 --- a/src/callbacks/perturb_adam.cpp +++ b/src/callbacks/perturb_adam.cpp @@ -164,7 +164,7 @@ void perturb_adam::perturb(lbann_comm& comm, adam& opt) const { std::unique_ptr build_perturb_adam_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique( diff --git a/src/callbacks/perturb_dropout.cpp b/src/callbacks/perturb_dropout.cpp index c656b8b1d00..fec371f4353 100644 --- a/src/callbacks/perturb_dropout.cpp +++ b/src/callbacks/perturb_dropout.cpp @@ -120,7 +120,7 @@ void perturb_dropout::perturb(model& m) { std::unique_ptr build_perturb_dropout_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique( diff --git a/src/callbacks/print_statistics.cpp b/src/callbacks/print_statistics.cpp index 058241fcf6a..5e9e098a31e 100644 --- a/src/callbacks/print_statistics.cpp +++ b/src/callbacks/print_statistics.cpp @@ -249,7 +249,7 @@ void print_statistics::report_results(model *m) { std::unique_ptr build_print_statistics_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.interval(), diff --git a/src/callbacks/profiler.cpp b/src/callbacks/profiler.cpp index 3808fe90b07..363224ad5f4 100644 --- a/src/callbacks/profiler.cpp +++ b/src/callbacks/profiler.cpp @@ -196,7 +196,7 @@ void profiler::on_optimize_end(model *m, weights *w) { std::unique_ptr build_profiler_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.sync(), diff --git a/src/callbacks/replace_weights.cpp b/src/callbacks/replace_weights.cpp index 8ffa70e2d1d..ce091c16c2a 100644 --- a/src/callbacks/replace_weights.cpp +++ b/src/callbacks/replace_weights.cpp @@ -53,7 +53,7 @@ void replace_weights::on_batch_end(model *m) { std::unique_ptr build_replace_weights_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique( diff --git a/src/callbacks/save_images.cpp b/src/callbacks/save_images.cpp index 114665f39ed..62e72f32f52 100644 --- a/src/callbacks/save_images.cpp +++ b/src/callbacks/save_images.cpp @@ -159,7 +159,7 @@ void save_images::on_test_end(model *m) { std::unique_ptr build_save_images_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique( diff --git a/src/callbacks/save_model.cpp b/src/callbacks/save_model.cpp index 055231c9c28..8339633e9e2 100644 --- a/src/callbacks/save_model.cpp +++ b/src/callbacks/save_model.cpp @@ -177,7 +177,7 @@ bool 
save_model::load_model_weights(std::string ckpt_dir, model * m, bool ckptdi std::unique_ptr build_save_model_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); if(params.extension().size() != 0) { diff --git a/src/callbacks/save_topk_models.cpp b/src/callbacks/save_topk_models.cpp index 0619e9d2992..19286b9deb9 100644 --- a/src/callbacks/save_topk_models.cpp +++ b/src/callbacks/save_topk_models.cpp @@ -91,7 +91,7 @@ bool save_topk_models::am_in_topk(model *m) { std::unique_ptr build_save_topk_models_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique( diff --git a/src/callbacks/summary.cpp b/src/callbacks/summary.cpp index 658e540d5e9..c9d09351de7 100644 --- a/src/callbacks/summary.cpp +++ b/src/callbacks/summary.cpp @@ -32,16 +32,12 @@ namespace lbann { namespace callback { -summary::summary(lbann_summary *summarizer, +summary::summary(const std::shared_ptr& summarizer, int batch_interval, int mat_interval) : callback_base(batch_interval, summarizer), m_mat_interval(mat_interval) {} -summary::~summary() { - delete m_summarizer; -} - void summary::on_train_begin(model *m) { save_histograms(m); } @@ -136,7 +132,7 @@ void summary::save_histograms(model *m) { std::unique_ptr build_summary_callback_from_pbuf( const google::protobuf::Message& proto_msg, - lbann_summary* summarizer) { + const std::shared_ptr& summarizer) { const auto& params = dynamic_cast(proto_msg); return make_unique(summarizer, diff --git a/src/callbacks/sync_layers.cpp b/src/callbacks/sync_layers.cpp index 598b4c4ba8a..a6cc6ff23d3 100644 --- a/src/callbacks/sync_layers.cpp +++ b/src/callbacks/sync_layers.cpp @@ -64,7 +64,7 @@ void sync_layers::do_sync(Layer *l) { std::unique_ptr build_sync_layers_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.sync_gpus(), diff --git a/src/callbacks/sync_selected.cpp b/src/callbacks/sync_selected.cpp index efe0c2a24e1..3f271dc9a91 100644 --- a/src/callbacks/sync_selected.cpp +++ b/src/callbacks/sync_selected.cpp @@ -280,7 +280,7 @@ void sync_selected::do_sync(Layer *l) { std::unique_ptr build_sync_selected_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); const int num_layers = params.layer_to_sync_size(); diff --git a/src/callbacks/timeline.cpp b/src/callbacks/timeline.cpp index 7f6d59516c1..a0183e14c2f 100644 --- a/src/callbacks/timeline.cpp +++ b/src/callbacks/timeline.cpp @@ -101,7 +101,7 @@ void timeline::on_optimize_end(model *m, weights *w) { std::unique_ptr build_timeline_callback_from_pbuf( - const google::protobuf::Message& proto_msg, lbann_summary*) { + const google::protobuf::Message& proto_msg, std::shared_ptr const&) { const auto& params = dynamic_cast(proto_msg); return make_unique(params.directory()); diff --git a/src/callbacks/timer.cpp b/src/callbacks/timer.cpp index 9449782c718..ddf42fdba39 100644 --- a/src/callbacks/timer.cpp +++ b/src/callbacks/timer.cpp @@ -170,7 +170,7 @@ void timer::timing_end(model& m) { std::unique_ptr 
build_timer_callback_from_pbuf(
-    const google::protobuf::Message&, lbann_summary* summarizer) {
+    const google::protobuf::Message&, std::shared_ptr<lbann_summary> const& summarizer) {
   return make_unique<timer>(summarizer);
 }

diff --git a/src/callbacks/variable_minibatch.cpp b/src/callbacks/variable_minibatch.cpp
index a16a7f2ddf1..b52d405164f 100644
--- a/src/callbacks/variable_minibatch.cpp
+++ b/src/callbacks/variable_minibatch.cpp
@@ -186,7 +186,7 @@ bool minibatch_schedule::schedule(
 std::unique_ptr<callback_base>
 build_step_minibatch_callback_from_pbuf(
-    const google::protobuf::Message& proto_msg, lbann_summary*) {
+    const google::protobuf::Message& proto_msg, std::shared_ptr<lbann_summary> const&) {
   const auto& params =
     dynamic_cast(proto_msg);
   return make_unique(params.starting_mbsize(),
@@ -196,7 +196,7 @@ build_step_minibatch_callback_from_pbuf(
 std::unique_ptr<callback_base>
 build_minibatch_schedule_callback_from_pbuf(
-    const google::protobuf::Message& proto_msg, lbann_summary*) {
+    const google::protobuf::Message& proto_msg, std::shared_ptr<lbann_summary> const&) {
   const auto& params =
     dynamic_cast(proto_msg);
   std::vector steps;
diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp
index d8b24f6e183..a4ba60ea63e 100644
--- a/src/proto/factories/callback_factory.cpp
+++ b/src/proto/factories/callback_factory.cpp
@@ -88,6 +88,18 @@ using factory_type = lbann::generic_factory<
   lbann_summary*>,
   default_key_error_policy>;

+namespace
+{
+template <typename... Ts>
+std::string BuildErrorMessage(Ts... args)
+{
+  std::ostringstream oss;
+  int dummy[] = { (oss << args, 0)... };
+  (void) dummy;
+  return oss.str();
+}
+}
+
 void register_default_builders(factory_type& factory)
 {
   using namespace callback;
@@ -201,7 +213,7 @@ factory_type const& get_callback_factory() noexcept

 std::unique_ptr<callback_base>
 construct_callback(
-  const google::protobuf::Message& proto_msg, lbann_summary* summarizer) {
+  const google::protobuf::Message& proto_msg, std::shared_ptr<lbann_summary> const& summarizer) {

   auto const& factory = get_callback_factory();
   auto const& msg =
@@ -226,9 +238,9 @@ lbann_summary* construct_summarizer(lbann_comm* comm,
   struct stat sb;
   if (! ( stat(c.dir().c_str(), &sb) == 0 && S_ISDIR(sb.st_mode) )) {
     if (master) {
-      throw lbann_exception(
-        std::string {} + __FILE__ + " " + std::to_string(__LINE__) + " :: " +
-        "summary directory " + c.dir() + " does not exist");
+      LBANN_ERROR(BuildErrorMessage(std::string {}, __FILE__, " ",
+                  std::to_string(__LINE__), " :: summary directory ",
+                  c.dir(), " does not exist."));
     }
   }
   summary = new lbann_summary(c.dir(), comm);

From 983324f54d4cbd61949a65de6087502b7d648476 Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson"
Date: Sat, 3 Aug 2019 16:25:15 -0700
Subject: [PATCH 195/634] fix an issue on OSX in which nonportable pthread features are not available

Addresses issue #1117.
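For background: pthread_setaffinity_np/pthread_getaffinity_np are glibc
extensions (the _np suffix marks them nonportable), and macOS ships neither,
so the CMake probes below compile small test programs and define
LBANN_HAS_PTHREAD_AFFINITY_SUPPORT only when both are available. A minimal
sketch of the resulting usage pattern (illustrative only; pin_to_core is a
hypothetical helper, not code from this patch):

    #include "lbann_config.hpp"
    #include <pthread.h> // on glibc, cpu_set_t also requires _GNU_SOURCE

    // Pin the calling thread to one core where the platform supports it;
    // fall back to a no-op (an unpinned thread) everywhere else.
    inline bool pin_to_core(int core) {
    #ifdef LBANN_HAS_PTHREAD_AFFINITY_SUPPORT
      cpu_set_t cpuset;
      CPU_ZERO(&cpuset);
      CPU_SET(core, &cpuset);
      return pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t),
                                    &cpuset) == 0;
    #else
      (void) core; // no affinity API detected; run unpinned
      return false;
    #endif
    }

This mirrors what launch_pinned_threads() does below: the same code path on
Linux, and graceful degradation to launch_threads() on macOS.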
--- CMakeLists.txt | 3 ++ cmake/configure_files/lbann_config.hpp.in | 3 ++ include/lbann/utils/threads/thread_pool.hpp | 21 ++++++++------ src/utils/threads/CMakeLists.txt | 29 +++++++++++++++++++ src/utils/threads/thread_pool.cpp | 32 +++++++++++++++++---- 5 files changed, 75 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45fccfb2403..1a8577a6c3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,6 +161,8 @@ include(SetupCXX) ################################################################ # Required dependencies +find_package(Threads REQUIRED) + find_package(CEREAL NO_MODULE HINTS ${CEREAL_DIR} $ENV{CEREAL_DIR} PATH_SUFFIXES share/cmake/cereal @@ -492,6 +494,7 @@ endif () # Use the IMPORTED targets when possible. target_link_libraries(lbann PUBLIC LbannProto) +target_link_libraries(lbann PUBLIC Threads::Threads) target_link_libraries(lbann PUBLIC cereal) target_link_libraries(lbann PUBLIC OpenMP::OpenMP_CXX) target_link_libraries(lbann PUBLIC MPI::MPI_CXX) diff --git a/cmake/configure_files/lbann_config.hpp.in b/cmake/configure_files/lbann_config.hpp.in index 08306a8d10f..e25ca5ea590 100644 --- a/cmake/configure_files/lbann_config.hpp.in +++ b/cmake/configure_files/lbann_config.hpp.in @@ -45,6 +45,9 @@ #cmakedefine LBANN_HAS_STD_ANY #cmakedefine LBANN_HAS_STD_MAKE_UNIQUE +// API support for non-portable pthread functionality. +#cmakedefine LBANN_HAS_PTHREAD_AFFINITY_SUPPORT + // Define the LBANN datatype namespace lbann { diff --git a/include/lbann/utils/threads/thread_pool.hpp b/include/lbann/utils/threads/thread_pool.hpp index 81cfcf4c7a0..fb5b4f11640 100644 --- a/include/lbann/utils/threads/thread_pool.hpp +++ b/include/lbann/utils/threads/thread_pool.hpp @@ -1,15 +1,19 @@ -#ifndef __LBANN_THREAD_POOL_HPP__ -#define __LBANN_THREAD_POOL_HPP__ +#ifndef LBANN_UTILS_THREADS_THREAD_POOL_HPP_INCLUDED +#define LBANN_UTILS_THREADS_THREAD_POOL_HPP_INCLUDED -#include -#include -#include -#include +#include "lbann_config.hpp" #include "thread_safe_queue.hpp" #include "type_erased_function.hpp" #include "lbann/utils/exception.hpp" +#include + +#include +#include +#include +#include + namespace lbann { class thread_pool { @@ -111,8 +115,9 @@ class thread_pool { private: /** @brief The task executed by each thread */ void do_thread_work_(); +#ifdef LBANN_HAS_PTHREAD_AFFINITY_SUPPORT void do_thread_work_pinned_thread_(int tid, cpu_set_t cpu_set); - +#endif // LBANN_HAS_PTHREAD_AFFINITY_SUPPORT private: /** @brief Container holding the threads */ @@ -138,4 +143,4 @@ class thread_pool { };// class thread_pool }// namespace lbann -#endif /* __LBANN_THREAD_POOL_HPP__ */ +#endif /* LBANN_UTILS_THREADS_THREAD_POOL_HPP_INCLUDED */ diff --git a/src/utils/threads/CMakeLists.txt b/src/utils/threads/CMakeLists.txt index ca1ab783b27..dbd8ab558f0 100644 --- a/src/utils/threads/CMakeLists.txt +++ b/src/utils/threads/CMakeLists.txt @@ -1,3 +1,32 @@ +# Test for non-portable POSIX features +include(CheckCXXSourceCompiles) +set(CMAKE_REQUIRED_LIBRARIES Threads::Threads) +set(_PTHREAD_SETAFFINITY_NP_TEST_CODE + "#include +int main(int, char* argv[]) { + pthread_t thd = pthread_self(); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + return pthread_setaffinity_np(thd, sizeof(cpu_set_t), &cpuset); +}") +check_cxx_source_compiles( + "${_PTHREAD_SETAFFINITY_NP_TEST_CODE}" LBANN_HAS_PTHREAD_SETAFFINITY_NP) +set(_PTHREAD_GETAFFINITY_NP_TEST_CODE + "#include +int main(int, char* argv[]) { + pthread_t thd = pthread_self(); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + return 
pthread_getaffinity_np(thd, sizeof(cpu_set_t), &cpuset); +}") +check_cxx_source_compiles( + "${_PTHREAD_GETAFFINITY_NP_TEST_CODE}" LBANN_HAS_PTHREAD_GETAFFINITY_NP) +set(CMAKE_REQUIRED_LIBRARIES) + +if (LBANN_HAS_PTHREAD_SETAFFINITY_NP AND LBANN_HAS_PTHREAD_GETAFFINITY_NP) + set(LBANN_HAS_PTHREAD_AFFINITY_SUPPORT TRUE) +endif () + # Add the source files for this directory set_full_path(THIS_DIR_SOURCES thread_pool.cpp diff --git a/src/utils/threads/thread_pool.cpp b/src/utils/threads/thread_pool.cpp index 0c79eef54c8..e210ed7657a 100644 --- a/src/utils/threads/thread_pool.cpp +++ b/src/utils/threads/thread_pool.cpp @@ -37,7 +37,20 @@ void thread_pool::launch_threads(size_type num_threads) } } -void thread_pool::launch_pinned_threads(size_type num_threads, int cpu_offset) { +// FIXME (trb 08/03/2019): Setting thread affinity is not a portable +// pthread operation (hence the _np suffix); indeed, OSX does not +// support it. Unfortunately the case on OSX is even more dire -- they +// seem to want to prevent you from messing with their scheduler at +// all. The MACH kernel API for doing this is marked "deprecated" and +// its use is not advised for code that is not tied to a specific OSX +// version (see here for more information: +// http://web.mit.edu/darwin/src/modules/xnu/osfmk/man/). +// +// As a result of the above, this will, in fact, *not* launch pinned +// threads when the locally-supported pthread API does not support it. +void thread_pool::launch_pinned_threads( + size_type num_threads, int cpu_offset) { +#ifdef LBANN_HAS_PTHREAD_AFFINITY_SUPPORT threads_.reserve(num_threads); m_work_group.reserve(num_threads); m_thread_id_to_local_id_map.reserve(num_threads); @@ -48,7 +61,8 @@ void thread_pool::launch_pinned_threads(size_type num_threads, int cpu_offset) { cpu_set_t cpuset, ht_cpuset; CPU_ZERO(&cpuset); - auto error = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + auto error = pthread_getaffinity_np(pthread_self(), + sizeof(cpu_set_t), &cpuset); if (error != 0) { std::cerr << "error in pthread_getaffinity_np, error=" << error << std::endl; @@ -66,7 +80,8 @@ void thread_pool::launch_pinned_threads(size_type num_threads, int cpu_offset) { } } - threads_.emplace_back(&thread_pool::do_thread_work_pinned_thread_,this, cnt, ht_cpuset); + threads_.emplace_back(&thread_pool::do_thread_work_pinned_thread_, + this, cnt, ht_cpuset); } } catch(...) 
@@ -74,6 +89,9 @@ void thread_pool::launch_pinned_threads(size_type num_threads, int cpu_offset) { all_work_done_ = true; throw; } +#else + launch_threads(num_threads); +#endif// LBANN_HAS_PTHREAD_AFFINITY_SUPPORT } void thread_pool::reap_threads() { @@ -110,12 +128,15 @@ void thread_pool::do_thread_work_() } } +#ifdef LBANN_HAS_PTHREAD_AFFINITY_SUPPORT void thread_pool::do_thread_work_pinned_thread_(int tid, cpu_set_t cpu_set) { // Set the CPU affinity for the thread - auto error = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpu_set); + auto error = pthread_setaffinity_np(pthread_self(), + sizeof(cpu_set_t), &cpu_set); if (error != 0) { - std::cerr << "error in pthread_setaffinity_np, error=" << error << std::endl; + std::cerr << "error in pthread_setaffinity_np, error=" + << error << std::endl; } { @@ -132,6 +153,7 @@ void thread_pool::do_thread_work_pinned_thread_(int tid, cpu_set_t cpu_set) } } } +#endif // LBANN_HAS_PTHREAD_AFFINITY_SUPPORT int thread_pool::get_local_thread_id() { std::thread::id this_id = std::this_thread::get_id(); From 603ce868aa10c437e8ade23f8005a234b6fb9ffd Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Sat, 3 Aug 2019 16:27:11 -0700 Subject: [PATCH 196/634] clean up includes and header guards in threads directory --- include/lbann/utils/threads/thread_safe_queue.hpp | 11 ++++++----- include/lbann/utils/threads/thread_utils.hpp | 6 +++--- include/lbann/utils/threads/type_erased_function.hpp | 12 +++++++----- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/include/lbann/utils/threads/thread_safe_queue.hpp b/include/lbann/utils/threads/thread_safe_queue.hpp index 29e729aa8fe..882da33da67 100644 --- a/include/lbann/utils/threads/thread_safe_queue.hpp +++ b/include/lbann/utils/threads/thread_safe_queue.hpp @@ -1,11 +1,12 @@ -#ifndef __LBANN_THREAD_SAFE_QUEUE_HPP__ -#define __LBANN_THREAD_SAFE_QUEUE_HPP__ +#ifndef LBANN_UTILS_THREADS_THREAD_SAFE_QUEUE_HPP_INCLUDED +#define LBANN_UTILS_THREADS_THREAD_SAFE_QUEUE_HPP_INCLUDED + +#include #include +#include #include -#include - namespace lbann { /** @class thread_safe_queue @@ -146,4 +147,4 @@ class thread_safe_queue { };// class thread_safe_queue }// namespace lbann -#endif /* __LBANN_THREAD_SAFE_QUEUE_HPP__ */ +#endif /* LBANN_UTILS_THREADS_THREAD_SAFE_QUEUE_HPP_INCLUDED */ diff --git a/include/lbann/utils/threads/thread_utils.hpp b/include/lbann/utils/threads/thread_utils.hpp index 514ff1e4549..7e19d9a21ab 100644 --- a/include/lbann/utils/threads/thread_utils.hpp +++ b/include/lbann/utils/threads/thread_utils.hpp @@ -24,8 +24,8 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// -#ifndef LBANN_THREAD_UTILS_HPP -#define LBANN_THREAD_UTILS_HPP +#ifndef LBANN_UTILS_THREADS_THREAD_UTILS_HPP_INCLUDED +#define LBANN_UTILS_THREADS_THREAD_UTILS_HPP_INCLUDED #include "lbann/comm.hpp" @@ -36,4 +36,4 @@ int free_core_offset(const lbann_comm *comm); } // namespace lbann -#endif // LBANN_THREAD_UTILS_HPP +#endif // LBANN_UTILS_THREADS_THREAD_UTILS_HPP_INCLUDED diff --git a/include/lbann/utils/threads/type_erased_function.hpp b/include/lbann/utils/threads/type_erased_function.hpp index 1b46f082f4c..3c7339a5d6d 100644 --- a/include/lbann/utils/threads/type_erased_function.hpp +++ b/include/lbann/utils/threads/type_erased_function.hpp @@ -1,10 +1,12 @@ -#ifndef __LBANN_TYPE_ERASED_FUNCTION_HPP__ -#define __LBANN_TYPE_ERASED_FUNCTION_HPP__ - -#include +#ifndef LBANN_UTILS_THREADS_TYPE_ERASED_FUNCTION_HPP_INCLUDED +#define LBANN_UTILS_THREADS_TYPE_ERASED_FUNCTION_HPP_INCLUDED #include +#include +#include +#include + namespace lbann { /** @class type_erased_function @@ -88,4 +90,4 @@ class type_erased_function { };// class type_erased_function }// namespace lbann -#endif /* __LBANN_TYPE_ERASED_FUNCTION_HPP__ */ +#endif /* LBANN_UTILS_THREADS_TYPE_ERASED_FUNCTION_HPP_INCLUDED */ From c86d0d43e66b302ce4339c461a6189e33d7c7a6e Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 5 Aug 2019 10:58:56 -0700 Subject: [PATCH 197/634] small updates to model factory and objective function factory to use managed memory --- include/lbann/proto/factories.hpp | 18 ++++--- src/proto/factories/model_factory.cpp | 51 +++++++++++-------- .../factories/objective_function_factory.cpp | 5 +- src/utils/lbann_library.cpp | 8 +-- 4 files changed, 48 insertions(+), 34 deletions(-) diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index a73563eb168..32a6b6890e1 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -28,11 +28,15 @@ #define LBANN_PROTO_FACTORIES_HPP_INCLUDED #include "lbann/callbacks/callback.hpp" -#include "lbann/proto/proto_common.hpp" #include "lbann/data_readers/data_reader.hpp" +#include "lbann/models/model.hpp" +#include "lbann/proto/proto_common.hpp" #include "lbann/transforms/transform.hpp" #include "lbann/transforms/transform_pipeline.hpp" +#include +#include + namespace lbann_data { class Model; class ObjectiveFunction; @@ -46,10 +50,11 @@ namespace lbann { namespace proto { /** Construct a model specified with a prototext. */ -model* construct_model(lbann_comm* comm, - const std::map& data_readers, - const lbann_data::Optimizer& proto_opt, - const lbann_data::Model& proto_model); +std::unique_ptr construct_model( + lbann_comm* comm, + const std::map& data_readers, + const lbann_data::Optimizer& proto_opt, + const lbann_data::Model& proto_model); /** Construct a layer graph specified with a prototext. */ std::vector> construct_layer_graph( @@ -89,7 +94,8 @@ std::unique_ptr construct_optimizer( const lbann_data::Optimizer& proto_opt); /** Construct an objective function specified with prototext. */ -objective_function* construct_objective_function(const lbann_data::ObjectiveFunction& proto_obj); +std::unique_ptr +construct_objective_function(const lbann_data::ObjectiveFunction& proto_obj); /** Construct a transform given a prototext. 
*/ std::unique_ptr construct_transform( diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp index 3e774562da0..21f054adf4e 100644 --- a/src/proto/factories/model_factory.cpp +++ b/src/proto/factories/model_factory.cpp @@ -26,27 +26,35 @@ #include "lbann/proto/factories.hpp" +#include "lbann/metrics/layer_metric.hpp" #include "lbann/models/model.hpp" #include "lbann/models/directed_acyclic_graph.hpp" - -#include "lbann/metrics/layer_metric.hpp" #include "lbann/objective_functions/layer_term.hpp" #include "lbann/objective_functions/weight_regularization/l2.hpp" +#include "lbann/utils/memory.hpp" #include #include +#include +#include +#include +#include +#include +#include +#include + namespace lbann { namespace proto { namespace { /** Instantiate a model based on prototext. */ -model* instantiate_model(lbann_comm* comm, - objective_function* obj, - const lbann_data::Optimizer& proto_opt, - const lbann_data::Model& proto_model) { - std::stringstream err; +std::unique_ptr +instantiate_model(lbann_comm* comm, + std::unique_ptr obj, + const lbann_data::Optimizer& proto_opt, + const lbann_data::Model& proto_model) { // Default optimizer auto opt = construct_optimizer(comm, proto_opt); @@ -55,24 +63,26 @@ model* instantiate_model(lbann_comm* comm, const auto& type = proto_model.type(); const auto& mini_batch_size = proto_model.mini_batch_size(); if (type.empty() || type == "directed_acyclic_graph_model") { - return new directed_acyclic_graph_model( - comm, mini_batch_size, obj, opt.release()); + return make_unique( + comm, mini_batch_size, obj.release(), opt.release()); } // Throw error if model type is not supported + std::stringstream err; err << "unknown model type (" << type << ")"; LBANN_ERROR(err.str()); return nullptr; - } /** Setup pointers from objective function to layers. * * Layer terms require pointers to layers. 
*/ -void assign_layers_to_objective_function(std::vector& layer_list, - objective_function& obj, - const lbann_data::ObjectiveFunction& proto_obj) { +void assign_layers_to_objective_function( + std::vector& layer_list, + objective_function& obj, + const lbann_data::ObjectiveFunction& proto_obj) { + std::stringstream err; // Construct map from layer names to layers @@ -238,10 +248,11 @@ void assign_weights_to_objective_function(std::vector& weights_list, } // namespace -model* construct_model(lbann_comm* comm, - const std::map& data_readers, - const lbann_data::Optimizer& proto_opt, - const lbann_data::Model& proto_model) { +std::unique_ptr construct_model( + lbann_comm* comm, + const std::map& data_readers, + const lbann_data::Optimizer& proto_opt, + const lbann_data::Model& proto_model) { // Construct layer graph auto&& layer_list = construct_layer_graph(comm, @@ -255,7 +266,7 @@ model* construct_model(lbann_comm* comm, // Construct objective function const auto& proto_obj = proto_model.objective_function(); - auto&& obj = construct_objective_function(proto_obj); + auto obj = construct_objective_function(proto_obj); assign_layers_to_objective_function(layer_pointers, *obj, proto_obj); // Construct weights @@ -288,7 +299,7 @@ model* construct_model(lbann_comm* comm, } // Instantiate model - auto&& m = instantiate_model(comm, obj, proto_opt, proto_model); + auto m = instantiate_model(comm, std::move(obj), proto_opt, proto_model); for (auto&& l : layer_list ) { m->add_layer(std::move(l)); } for (auto&& w : weights_list ) { m->add_weights(w); } for (auto&& met : metric_list ) { m->add_metric(met); } @@ -298,7 +309,7 @@ model* construct_model(lbann_comm* comm, m->set_name(name); } for (auto t : data_readers) { - t.second->set_model(m); + t.second->set_model(m.get()); } return m; diff --git a/src/proto/factories/objective_function_factory.cpp b/src/proto/factories/objective_function_factory.cpp index 334c1fa2d0a..5e3465f09c0 100644 --- a/src/proto/factories/objective_function_factory.cpp +++ b/src/proto/factories/objective_function_factory.cpp @@ -34,10 +34,11 @@ namespace lbann { namespace proto { -objective_function* construct_objective_function(const lbann_data::ObjectiveFunction& proto_obj) { +std::unique_ptr +construct_objective_function(const lbann_data::ObjectiveFunction& proto_obj) { // Instantiate objective function - objective_function* obj = new objective_function(); + auto obj = make_unique(); // Weight regularization terms for (int i=0; i build_model_from_prototext( print_parameters(*comm, pb); // Initalize model - std::unique_ptr ret_model{ - proto::construct_model(comm, - data_readers, - pb.optimizer(), - pb.model()) - }; + auto ret_model = + proto::construct_model(comm, data_readers, pb.optimizer(), pb.model()); ret_model->setup(std::move(io_thread_pool)); if(opts->get_bool("disable_background_io_activity")) { From 8204e7021a3d02bfc1105e8f050f455d9678de6e Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 5 Aug 2019 16:37:21 -0700 Subject: [PATCH 198/634] Fix exception-safety related issues in model_factory. 
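The hazard being fixed, in condensed form (hypothetical names, not code from
this repository): locals that own their elements through raw pointers leak
everything built so far if a later step throws, whereas unique_ptr containers
release on stack unwinding.

    #include <memory>
    #include <vector>

    struct weights { /* ... */ };

    // Stand-in for a factory call; in real code any of these may throw.
    std::unique_ptr<weights> build_weights(int) {
      return std::make_unique<weights>();
    }

    void leaky() {
      std::vector<weights*> list; // raw owning pointers
      for (int i = 0; i < 3; ++i)
        list.push_back(build_weights(i).release());
      // any throw from here on leaks every element of 'list'
    }

    void exception_safe() {
      std::vector<std::unique_ptr<weights>> list;
      for (int i = 0; i < 3; ++i)
        list.push_back(build_weights(i)); // owned; freed on unwind
    }

Hence the change below from vectors of raw weights/metric pointers to vectors
of unique_ptr, with release() deferred to the final handoff into the model.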
--- src/proto/factories/model_factory.cpp | 36 ++++++++++++++------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp index 21f054adf4e..a2eea721962 100644 --- a/src/proto/factories/model_factory.cpp +++ b/src/proto/factories/model_factory.cpp @@ -127,7 +127,7 @@ void assign_layers_to_objective_function( } void assign_layers_to_metrics(std::vector& layer_list, - std::vector& metric_list, + std::vector>& metric_list, const lbann_data::Model& proto_model) { // Construct map from layer names to layers @@ -144,7 +144,7 @@ void assign_layers_to_metrics(std::vector& layer_list, // Assign layers to layer metrics for (int i=0; i(metric_list[i]); + auto&& m = dynamic_cast(metric_list[i].get()); if (m != nullptr) { const auto& params = proto_model.metric(i).layer_metric(); auto* l = names_to_layers[params.layer()]; @@ -163,7 +163,7 @@ void assign_layers_to_metrics(std::vector& layer_list, /** Setup pointers from layers to weights. */ void assign_weights_to_layers(std::vector& layer_list, - std::vector& weights_list, + std::vector>& weights_list, const lbann_data::Model& proto_model) { std::stringstream err; @@ -175,7 +175,7 @@ void assign_weights_to_layers(std::vector& layer_list, err << "weights name \"" << name << "\" is not unique"; LBANN_ERROR(err.str()); } - names_to_weights[name] = w; + names_to_weights[name] = w.get(); } // Find weights assigned to each layer @@ -185,7 +185,7 @@ void assign_weights_to_layers(std::vector& layer_list, const bool is_frozen = layer_list[i]->is_frozen(); for (auto&& name : parse_list(proto_layer.weights())) { auto&& w = names_to_weights[name]; - if (w == nullptr) { + if (!w) { err << "could not find weights named \"" << name << "\", " << "which are expected by layer " << layer_list[i]->get_name(); LBANN_ERROR(err.str()); @@ -205,9 +205,11 @@ void assign_weights_to_layers(std::vector& layer_list, * * L2 weight regularization requires pointers to weights. 
*/ -void assign_weights_to_objective_function(std::vector& weights_list, - objective_function& obj, - const lbann_data::ObjectiveFunction& proto_obj) { +void assign_weights_to_objective_function( + std::vector>& weights_list, + objective_function& obj, + const lbann_data::ObjectiveFunction& proto_obj) { + std::stringstream err; // Construct map from weights names to weights @@ -218,7 +220,7 @@ void assign_weights_to_objective_function(std::vector& weights_list, err << "weights name \"" << name << "\" is not unique"; LBANN_ERROR(err.str()); } - names_to_weights[name] = w; + names_to_weights[name] = w.get(); } // Setup weights with L2 regularization @@ -270,23 +272,23 @@ std::unique_ptr construct_model( assign_layers_to_objective_function(layer_pointers, *obj, proto_obj); // Construct weights - std::vector weights_list; + std::vector> weights_list; for (int i=0; i metric_list; + std::vector> metric_list; for (int i=0; i(comm, + params.name(), + params.unit())); } assign_layers_to_metrics(layer_pointers, metric_list, proto_model); @@ -301,8 +303,8 @@ std::unique_ptr construct_model( // Instantiate model auto m = instantiate_model(comm, std::move(obj), proto_opt, proto_model); for (auto&& l : layer_list ) { m->add_layer(std::move(l)); } - for (auto&& w : weights_list ) { m->add_weights(w); } - for (auto&& met : metric_list ) { m->add_metric(met); } + for (auto&& w : weights_list ) { m->add_weights(w.release()); } + for (auto&& met : metric_list ) { m->add_metric(met.release()); } for (auto&& cb : callback_list) { m->add_callback(cb.release()); } const auto& name = proto_model.name(); if (!name.empty()) { From 9d49cede9a743e16380d2cea065048f4cac6516c Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Mon, 5 Aug 2019 16:50:53 -0700 Subject: [PATCH 199/634] Refactored callbacks to use shared_ptr for lbann_summary --- include/lbann/callbacks/callback.hpp | 22 +++------- include/lbann/callbacks/check_gradients.hpp | 2 +- include/lbann/callbacks/check_metric.hpp | 2 +- include/lbann/callbacks/checkpoint.hpp | 2 +- include/lbann/callbacks/confusion_matrix.hpp | 2 +- include/lbann/callbacks/debug.hpp | 7 ++- include/lbann/callbacks/debug_io.hpp | 9 ++-- .../lbann/callbacks/dump_error_signals.hpp | 2 +- include/lbann/callbacks/dump_gradients.hpp | 2 +- .../dump_minibatch_sample_indices.hpp | 2 +- include/lbann/callbacks/dump_outputs.hpp | 2 +- include/lbann/callbacks/dump_weights.hpp | 2 +- include/lbann/callbacks/early_stopping.hpp | 2 +- include/lbann/callbacks/hang.hpp | 2 +- include/lbann/callbacks/imcomm.hpp | 5 ++- include/lbann/callbacks/learning_rate.hpp | 12 ++--- include/lbann/callbacks/ltfb.hpp | 6 +-- include/lbann/callbacks/mixup.hpp | 2 +- include/lbann/callbacks/monitor_io.hpp | 2 +- include/lbann/callbacks/perturb_adam.hpp | 2 +- include/lbann/callbacks/perturb_dropout.hpp | 2 +- include/lbann/callbacks/print_statistics.hpp | 2 +- include/lbann/callbacks/profiler.hpp | 2 +- include/lbann/callbacks/replace_weights.hpp | 2 +- include/lbann/callbacks/save_images.hpp | 2 +- include/lbann/callbacks/save_model.hpp | 2 +- include/lbann/callbacks/save_topk_models.hpp | 2 +- include/lbann/callbacks/summary.hpp | 12 +++-- include/lbann/callbacks/sync_layers.hpp | 2 +- include/lbann/callbacks/timeline.hpp | 2 +- include/lbann/callbacks/timer.hpp | 7 ++- .../lbann/callbacks/variable_minibatch.hpp | 4 +- include/lbann/proto/factories.hpp | 6 +-- src/callbacks/confusion_matrix.cpp | 2 +- src/callbacks/debug.cpp | 4 +- src/callbacks/imcomm.cpp | 2 +- src/callbacks/ltfb.cpp | 10 ++--- 
src/callbacks/summary.cpp | 39 +++++++++++++--- src/callbacks/timer.cpp | 2 +- src/proto/callbacks.proto | 1 - src/proto/factories/callback_factory.cpp | 44 ++++++++----------- src/proto/factories/model_factory.cpp | 2 +- src/proto/model.proto | 6 +++ 43 files changed, 136 insertions(+), 112 deletions(-) diff --git a/include/lbann/callbacks/callback.hpp b/include/lbann/callbacks/callback.hpp index 5225dabc4bf..224b0d04399 100644 --- a/include/lbann/callbacks/callback.hpp +++ b/include/lbann/callbacks/callback.hpp @@ -39,8 +39,8 @@ #include #include -// A utility macro for easily adding default-constructed sub-class -// builders. +/** @brief A utility macro for easily adding default-constructed sub-class + * builders.*/ #define LBANN_ADD_DEFAULT_CALLBACK_BUILDER(Class, FunctionName) \ inline std::unique_ptr FunctionName( \ const google::protobuf::Message&, std::shared_ptr const&) { \ @@ -63,12 +63,10 @@ class callback_base { /** @name Constructors and destructor */ ///@{ - /** @brief Initialize a callback with an optional batch interval and - * summarizer. + /** @brief Initialize a callback with an optional batch interval */ - callback_base(int batch_interval = 1, - const std::shared_ptr& summarizer = nullptr) : - m_batch_interval(std::max(batch_interval, 1)), m_summarizer(summarizer) {} + callback_base(int batch_interval = 1) : + m_batch_interval(std::max(batch_interval, 1)) {} callback_base(const callback_base&) = default; virtual ~callback_base() = default; @@ -78,14 +76,6 @@ class callback_base { virtual callback_base* copy() const = 0; - ///@} - /** @name Modifiers */ - ///@{ - - void set_summarizer(const std::shared_ptr& summarizer) { - m_summarizer = summarizer; - } - /** @brief Called once to set up the callback (after all layers are * set up). */ @@ -194,8 +184,6 @@ class callback_base { /** @brief Batch methods should once every this many steps. */ int m_batch_interval; - /** @brief Optional summarizer for the callbacks to use. 
*/ - std::shared_ptr m_summarizer; }; } // namespace lbann diff --git a/include/lbann/callbacks/check_gradients.hpp b/include/lbann/callbacks/check_gradients.hpp index bea3364e2cb..0cbe4f8938e 100644 --- a/include/lbann/callbacks/check_gradients.hpp +++ b/include/lbann/callbacks/check_gradients.hpp @@ -79,7 +79,7 @@ class check_gradients : public callback_base { // Builder function std::unique_ptr build_check_gradients_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/check_metric.hpp b/include/lbann/callbacks/check_metric.hpp index 23b1ebfd317..d965f6d6ad5 100644 --- a/include/lbann/callbacks/check_metric.hpp +++ b/include/lbann/callbacks/check_metric.hpp @@ -79,7 +79,7 @@ class check_metric : public callback_base { // Builder function std::unique_ptr build_check_metric_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/checkpoint.hpp b/include/lbann/callbacks/checkpoint.hpp index b96d85d1d00..5107966dd79 100644 --- a/include/lbann/callbacks/checkpoint.hpp +++ b/include/lbann/callbacks/checkpoint.hpp @@ -207,7 +207,7 @@ static inline bool read_latest(std::string filename, int *epochLast, int *trainL // Builder function std::unique_ptr build_checkpoint_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/confusion_matrix.hpp b/include/lbann/callbacks/confusion_matrix.hpp index a0a723084cc..3bed8808a6f 100644 --- a/include/lbann/callbacks/confusion_matrix.hpp +++ b/include/lbann/callbacks/confusion_matrix.hpp @@ -114,7 +114,7 @@ class confusion_matrix : public callback_base { // Builder function std::unique_ptr build_confusion_matrix_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/debug.hpp b/include/lbann/callbacks/debug.hpp index c5a1382e543..d67857eeff7 100644 --- a/include/lbann/callbacks/debug.hpp +++ b/include/lbann/callbacks/debug.hpp @@ -50,9 +50,8 @@ class debug : public callback_base { * If modes is empty, status updates will be printed for all * execution modes. */ - debug(std::set modes, - const std::shared_ptr& summarizer = nullptr) : - callback_base(1, summarizer), m_modes(std::move(modes)) {} + debug(std::set modes) : + m_modes(std::move(modes)) {} debug(const debug&) = default; debug& operator=(const debug&) = default; debug* copy() const override { return new debug(*this); } @@ -107,7 +106,7 @@ class debug : public callback_base { // Builder function std::unique_ptr build_debug_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/debug_io.hpp b/include/lbann/callbacks/debug_io.hpp index 5aaf5fe2e17..4ce036c1929 100644 --- a/include/lbann/callbacks/debug_io.hpp +++ b/include/lbann/callbacks/debug_io.hpp @@ -53,9 +53,10 @@ class debug_io : public callback_base { * Debug a particular phase; use invalid to debug every phase. 
*/ debug_io(execution_mode phase = execution_mode::invalid, - int debug_lvl = 0, - const std::shared_ptr& summarizer = nullptr) : - callback_base(1, summarizer), m_debug_phase(phase), m_debug_lvl(debug_lvl) {} + int debug_lvl = 0) : + callback_base(1), + m_debug_phase(phase), + m_debug_lvl(debug_lvl) {} debug_io(const debug_io&) = default; debug_io& operator=( const debug_io&) = default; @@ -88,7 +89,7 @@ class debug_io : public callback_base { // Builder function std::unique_ptr build_debug_io_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/dump_error_signals.hpp b/include/lbann/callbacks/dump_error_signals.hpp index 38644bdac52..9d704d9560a 100644 --- a/include/lbann/callbacks/dump_error_signals.hpp +++ b/include/lbann/callbacks/dump_error_signals.hpp @@ -62,7 +62,7 @@ class dump_error_signals : public callback_base { // Builder function std::unique_ptr build_dump_error_signals_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/dump_gradients.hpp b/include/lbann/callbacks/dump_gradients.hpp index ddbad213c14..005a0195955 100644 --- a/include/lbann/callbacks/dump_gradients.hpp +++ b/include/lbann/callbacks/dump_gradients.hpp @@ -72,7 +72,7 @@ class dump_gradients : public callback_base { // Builder function std::unique_ptr build_dump_gradients_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/dump_minibatch_sample_indices.hpp b/include/lbann/callbacks/dump_minibatch_sample_indices.hpp index f3f6ddd1e48..1aca8c40a0e 100644 --- a/include/lbann/callbacks/dump_minibatch_sample_indices.hpp +++ b/include/lbann/callbacks/dump_minibatch_sample_indices.hpp @@ -77,7 +77,7 @@ class dump_minibatch_sample_indices : public callback_base { // Builder function std::unique_ptr build_dump_mb_indices_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/dump_outputs.hpp b/include/lbann/callbacks/dump_outputs.hpp index babc685ffad..939de823c10 100644 --- a/include/lbann/callbacks/dump_outputs.hpp +++ b/include/lbann/callbacks/dump_outputs.hpp @@ -118,7 +118,7 @@ class dump_outputs : public callback_base { // Builder function std::unique_ptr build_dump_outputs_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/dump_weights.hpp b/include/lbann/callbacks/dump_weights.hpp index c28b7e4c461..85bf7d1b2af 100644 --- a/include/lbann/callbacks/dump_weights.hpp +++ b/include/lbann/callbacks/dump_weights.hpp @@ -69,7 +69,7 @@ class dump_weights : public callback_base { // Builder function std::unique_ptr build_dump_weights_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/early_stopping.hpp b/include/lbann/callbacks/early_stopping.hpp index 
8445d20c790..f74611900f5 100644 --- a/include/lbann/callbacks/early_stopping.hpp +++ b/include/lbann/callbacks/early_stopping.hpp @@ -66,7 +66,7 @@ class early_stopping : public callback_base { // Builder function std::unique_ptr build_early_stopping_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/hang.hpp b/include/lbann/callbacks/hang.hpp index d408a110a61..246d72ca51b 100644 --- a/include/lbann/callbacks/hang.hpp +++ b/include/lbann/callbacks/hang.hpp @@ -71,7 +71,7 @@ class hang : public callback_base { // Builder function std::unique_ptr build_hang_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/imcomm.hpp b/include/lbann/callbacks/imcomm.hpp index 77d27bab183..5620749e0d1 100644 --- a/include/lbann/callbacks/imcomm.hpp +++ b/include/lbann/callbacks/imcomm.hpp @@ -92,6 +92,9 @@ class imcomm : public callback_base { /** Summarize relevant statistics. */ void do_summary(model *m, weights *w, EvalType im_time); + + /**@brief lbann_summary */ + std::shared_ptr m_summarizer = nullptr; }; @@ -101,7 +104,7 @@ std::string get_comm_type_name(imcomm::comm_type m); // Builder function std::unique_ptr build_imcomm_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/learning_rate.hpp b/include/lbann/callbacks/learning_rate.hpp index 77ff3d486ae..20b0de08c17 100644 --- a/include/lbann/callbacks/learning_rate.hpp +++ b/include/lbann/callbacks/learning_rate.hpp @@ -136,7 +136,7 @@ class step_learning_rate : public learning_rate { // Builder function std::unique_ptr build_step_learning_rate_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); /** * Decrease the learning rate by a fixed proportion when validation error stops @@ -179,7 +179,7 @@ class adaptive_learning_rate : public learning_rate { // Builder function std::unique_ptr build_adaptive_learning_rate_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); /** * Decrease learning rate by a fixed amount at fixed times. 
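For context on the schedule described above, since only the builder signature changes in these hunks: drop_fixed_learning_rate scales the rate by a constant factor each time training passes one of a fixed set of epochs. A standalone sketch with hypothetical names, not the callback's actual implementation:

#include <set>

// Hypothetical helper: apply a fixed multiplicative drop at each listed epoch.
double drop_fixed_lr(double base_lr, double factor,
                     const std::set<int>& drop_epochs, int epoch) {
  double lr = base_lr;
  for (int e : drop_epochs) {
    if (epoch >= e) { lr *= factor; }  // e.g. factor = 0.1 at epochs {30, 60}
  }
  return lr;
}
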
@@ -219,7 +219,7 @@ class drop_fixed_learning_rate : // Builder function std::unique_ptr build_drop_fixed_learning_rate_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); /** * Linearly increase the learning rate to reach a target value over a @@ -267,7 +267,7 @@ class linear_growth_learning_rate : // Builder function std::unique_ptr build_linear_growth_learning_rate_callback_from_pbuf( - const google::protobuf::Message&,lbann_summary*); + const google::protobuf::Message&,std::shared_ptr const&); /** * Decrease the learning rate by polynomial policy @@ -310,7 +310,7 @@ class poly_learning_rate : public learning_rate { // Builder function std::unique_ptr build_poly_learning_rate_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); /** * This implements an adaptive scheme for adjust each optimizer's @@ -340,7 +340,7 @@ class optimizerwise_adaptive_learning_rate : public learning_rate { // Builder function std::unique_ptr build_optimizerwise_adaptive_learning_rate_callback_from_pbuf( - const google::protobuf::Message&,lbann_summary*); + const google::protobuf::Message&,std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/ltfb.hpp b/include/lbann/callbacks/ltfb.hpp index 8115b30b094..cd65d4e7da9 100644 --- a/include/lbann/callbacks/ltfb.hpp +++ b/include/lbann/callbacks/ltfb.hpp @@ -121,8 +121,7 @@ class ltfb : public callback_base { std::set weights_names = std::set(), bool low_score_wins = false, communication_algorithm comm_algo = communication_algorithm::sendrecv_weights, - bool exchange_hyperparameters = false, - const std::shared_ptr& summarizer = nullptr); + bool exchange_hyperparameters = false); ltfb(const ltfb& other); ltfb& operator=(const ltfb& other); ltfb* copy() const override { return new ltfb(*this); } @@ -166,13 +165,12 @@ class ltfb : public callback_base { * Used to temporarily store local weights during a tournament. 
*/ std::vector> m_workspace_weights; - }; // Builder function std::unique_ptr build_ltfb_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/mixup.hpp b/include/lbann/callbacks/mixup.hpp index f31d129bb44..b4b5873f3a6 100644 --- a/include/lbann/callbacks/mixup.hpp +++ b/include/lbann/callbacks/mixup.hpp @@ -80,7 +80,7 @@ class mixup : public callback_base { // Builder function std::unique_ptr build_mixup_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/monitor_io.hpp b/include/lbann/callbacks/monitor_io.hpp index fa5f2832b2d..8f665c928d4 100644 --- a/include/lbann/callbacks/monitor_io.hpp +++ b/include/lbann/callbacks/monitor_io.hpp @@ -66,7 +66,7 @@ class monitor_io : public callback_base { // Builder function std::unique_ptr build_monitor_io_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/perturb_adam.hpp b/include/lbann/callbacks/perturb_adam.hpp index 0fe08c6f423..7538cd526ad 100644 --- a/include/lbann/callbacks/perturb_adam.hpp +++ b/include/lbann/callbacks/perturb_adam.hpp @@ -127,7 +127,7 @@ class perturb_adam : public callback_base { // Builder function std::unique_ptr build_perturb_adam_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/perturb_dropout.hpp b/include/lbann/callbacks/perturb_dropout.hpp index 13754dd9dd1..1fec9084ed8 100644 --- a/include/lbann/callbacks/perturb_dropout.hpp +++ b/include/lbann/callbacks/perturb_dropout.hpp @@ -80,7 +80,7 @@ class perturb_dropout : public callback_base { // Builder function std::unique_ptr build_perturb_dropout_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/print_statistics.hpp b/include/lbann/callbacks/print_statistics.hpp index 75943c661fa..70fbc42c2ea 100644 --- a/include/lbann/callbacks/print_statistics.hpp +++ b/include/lbann/callbacks/print_statistics.hpp @@ -63,7 +63,7 @@ class print_statistics : public callback_base { // Builder function std::unique_ptr build_print_statistics_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/profiler.hpp b/include/lbann/callbacks/profiler.hpp index f118b8d54c9..2a1c77a21dd 100644 --- a/include/lbann/callbacks/profiler.hpp +++ b/include/lbann/callbacks/profiler.hpp @@ -83,7 +83,7 @@ class profiler : public callback_base { // Builder function std::unique_ptr build_profiler_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/replace_weights.hpp b/include/lbann/callbacks/replace_weights.hpp index 2b324ef7b55..d42ed2573be 100644 --- 
a/include/lbann/callbacks/replace_weights.hpp +++ b/include/lbann/callbacks/replace_weights.hpp @@ -73,7 +73,7 @@ class replace_weights : public callback_base { // Builder function std::unique_ptr build_replace_weights_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/save_images.hpp b/include/lbann/callbacks/save_images.hpp index b772516f630..3ce9efc71f9 100644 --- a/include/lbann/callbacks/save_images.hpp +++ b/include/lbann/callbacks/save_images.hpp @@ -75,7 +75,7 @@ class save_images : public callback_base { // Builder function std::unique_ptr build_save_images_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/save_model.hpp b/include/lbann/callbacks/save_model.hpp index 560b3db19f6..67a8bb9603f 100644 --- a/include/lbann/callbacks/save_model.hpp +++ b/include/lbann/callbacks/save_model.hpp @@ -97,7 +97,7 @@ class save_model : public callback_base { // Builder function std::unique_ptr build_save_model_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/save_topk_models.hpp b/include/lbann/callbacks/save_topk_models.hpp index 42ef8baa42a..4a5c3800602 100644 --- a/include/lbann/callbacks/save_topk_models.hpp +++ b/include/lbann/callbacks/save_topk_models.hpp @@ -63,7 +63,7 @@ class save_topk_models : public save_model { // Builder function std::unique_ptr build_save_topk_models_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/summary.hpp b/include/lbann/callbacks/summary.hpp index f40a2ddcd47..53f29e605bb 100644 --- a/include/lbann/callbacks/summary.hpp +++ b/include/lbann/callbacks/summary.hpp @@ -59,17 +59,23 @@ class summary : public callback_base { void on_epoch_end(model *m) override; void on_test_end(model *m) override; std::string name() const override { return "summary"; } - protected: + +private: + /**@brief lbann_summary */ + std::shared_ptr m_summarizer = nullptr; + +protected: /** Write out histograms from the model's layers. */ void save_histograms(model *m); - /** Interval for doing matrix summarization. */ + +/** Interval for doing matrix summarization. 
*/ int m_mat_interval; }; // Builder function std::unique_ptr build_summary_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/sync_layers.hpp b/include/lbann/callbacks/sync_layers.hpp index ebe377833c9..6fa78b5ebb9 100644 --- a/include/lbann/callbacks/sync_layers.hpp +++ b/include/lbann/callbacks/sync_layers.hpp @@ -79,7 +79,7 @@ class sync_layers : public callback_base { // Builder function std::unique_ptr build_sync_layers_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/timeline.hpp b/include/lbann/callbacks/timeline.hpp index ff7e99828f2..5b247070fab 100644 --- a/include/lbann/callbacks/timeline.hpp +++ b/include/lbann/callbacks/timeline.hpp @@ -91,7 +91,7 @@ class timeline : public callback_base { // Builder function std::unique_ptr build_timeline_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/timer.hpp b/include/lbann/callbacks/timer.hpp index f38dbb8cb99..ba8ee1e6735 100644 --- a/include/lbann/callbacks/timer.hpp +++ b/include/lbann/callbacks/timer.hpp @@ -44,7 +44,7 @@ class timer : public callback_base { public: timer(const std::shared_ptr& summarizer = nullptr) - : callback_base(1, summarizer) {} + : callback_base(1) {} timer(const timer&) = default; timer& operator=(const timer&) = default; timer* copy() const override { @@ -97,12 +97,15 @@ class timer : public callback_base { */ void batch_timing_end(const model& m); + /**@brief lbann_summary */ + std::shared_ptr m_summarizer = nullptr; + }; // Builder function std::unique_ptr build_timer_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/variable_minibatch.hpp b/include/lbann/callbacks/variable_minibatch.hpp index d8a6a09c8ff..ce187cfb039 100644 --- a/include/lbann/callbacks/variable_minibatch.hpp +++ b/include/lbann/callbacks/variable_minibatch.hpp @@ -114,7 +114,7 @@ class step_minibatch : public variable_minibatch { // Builder function std::unique_ptr build_step_minibatch_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); class minibatch_schedule : public variable_minibatch { public: @@ -152,7 +152,7 @@ class minibatch_schedule : public variable_minibatch { // Builder function std::unique_ptr build_minibatch_schedule_callback_from_pbuf( - const google::protobuf::Message&, lbann_summary*); + const google::protobuf::Message&, std::shared_ptr const&); } // namespace callback } // namespace lbann diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index 4aa32a651d4..6d2d0c32e9b 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -73,14 +73,14 @@ weights* construct_weights(lbann_comm* comm, /** Construct a callback specified with prototext. 
*/ std::unique_ptr construct_callback(const google::protobuf::Message& proto_cb, - lbann_summary* summarizer); + std::shared_ptr const& summarizer); /** Construct a summarizer specified with prototext. * The summarizer is only constructed if the summarizer callback is * enabled. */ -lbann_summary* construct_summarizer(lbann_comm* comm, - const lbann_data::Model& m); +std::unique_ptr construct_summarizer(lbann_comm* comm, + const lbann_data::Model& m); /** Construct an optimizer specified with prototext. */ optimizer* construct_optimizer(lbann_comm* comm, diff --git a/src/callbacks/confusion_matrix.cpp b/src/callbacks/confusion_matrix.cpp index 6c464af1522..3406860e4d4 100644 --- a/src/callbacks/confusion_matrix.cpp +++ b/src/callbacks/confusion_matrix.cpp @@ -44,7 +44,7 @@ namespace callback { confusion_matrix::confusion_matrix(std::string prediction_layer, std::string label_layer, std::string prefix) - : callback_base(1, nullptr), + : callback_base(1), m_prediction_layer(std::move(prediction_layer)), m_label_layer(std::move(label_layer)), m_prefix(std::move(prefix)) {} diff --git a/src/callbacks/debug.cpp b/src/callbacks/debug.cpp index d8e44f00d1a..117a0e97a90 100644 --- a/src/callbacks/debug.cpp +++ b/src/callbacks/debug.cpp @@ -160,12 +160,12 @@ void debug::on_optimize_end(model *m, weights *w) { std::unique_ptr build_debug_callback_from_pbuf(const google::protobuf::Message& proto_msg, - const std::shared_ptr& summarizer) { + const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); const auto& modes = parse_set(params.phase()); - return make_unique(modes, summarizer); + return make_unique(modes); } } // namespace callback diff --git a/src/callbacks/imcomm.cpp b/src/callbacks/imcomm.cpp index a6e9d7a1f17..779fcc08842 100644 --- a/src/callbacks/imcomm.cpp +++ b/src/callbacks/imcomm.cpp @@ -41,7 +41,7 @@ namespace callback { imcomm::imcomm(imcomm::comm_type ct, const std::shared_ptr& summarizer) : - callback_base(1, summarizer), m_default_ct(ct) {} + m_default_ct(ct), m_summarizer(summarizer) {} imcomm::imcomm(imcomm::comm_type ct, std::unordered_set weights_list, diff --git a/src/callbacks/ltfb.cpp b/src/callbacks/ltfb.cpp index 85013e2606b..45a793700dd 100644 --- a/src/callbacks/ltfb.cpp +++ b/src/callbacks/ltfb.cpp @@ -332,9 +332,8 @@ ltfb::ltfb(El::Int batch_interval, std::set weights_names, bool low_score_wins, communication_algorithm comm_algo, - bool exchange_hyperparameters, - const std::shared_ptr& summarizer) - : callback_base(batch_interval, summarizer), + bool exchange_hyperparameters) + : callback_base(batch_interval), m_metric_name(std::move(metric_name)), m_weights_names(std::move(weights_names)), m_low_score_wins(low_score_wins), @@ -533,7 +532,7 @@ ltfb::string_to_comm_algo(const std::string& str) { std::unique_ptr build_ltfb_callback_from_pbuf( const google::protobuf::Message& proto_msg, - std::shared_ptr const& summarizer) { + const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); return make_unique( @@ -542,8 +541,7 @@ build_ltfb_callback_from_pbuf( parse_set(params.weights()), params.low_score_wins(), ltfb::string_to_comm_algo(params.communication_algorithm()), - params.exchange_hyperparameters(), - summarizer); + params.exchange_hyperparameters()); } } // namespace callback diff --git a/src/callbacks/summary.cpp b/src/callbacks/summary.cpp index e44b3be0a60..226092e4c7c 100644 --- a/src/callbacks/summary.cpp +++ b/src/callbacks/summary.cpp @@ -40,16 +40,34 @@ namespace lbann { namespace callback { summary::summary(const 
std::shared_ptr& summarizer, - int batch_interval, - int mat_interval) : - callback_base(batch_interval, summarizer), + int batch_interval, + int mat_interval) : + callback_base(batch_interval), + m_summarizer(summarizer), m_mat_interval(mat_interval) {} +namespace +{ +template +std::string BuildErrorMessage(Ts... args) +{ + std::ostringstream oss; + int dummy[] = { (oss << args, 0)... }; + (void) dummy; + LBANN_ERROR(oss.str()); +} +} + void summary::on_train_begin(model *m) { save_histograms(m); } void summary::on_batch_end(model *m) { + + if(!m_summarizer){ + LBANN_ERROR(BuildErrorMessage("Summary callback failed: m_summarizer does not exist.")); + } + prof_region_begin("summary-batch", prof_colors[0], false); m->summarize_stats(*m_summarizer); if (m_mat_interval > 0 && m->get_step(execution_mode::training) % m_mat_interval == 0) { @@ -75,6 +93,10 @@ void summary::on_batch_end(model *m) { } void summary::on_epoch_end(model *m) { + if(!m_summarizer){ + LBANN_ERROR(BuildErrorMessage("Summary callback failed: m_summarizer does not exist.")); + } + prof_region_begin("summary-epoch", prof_colors[0], false); for (const auto& met : m->get_metrics()) { EvalType train_score = met->get_mean_value(m->get_execution_mode()); @@ -91,6 +113,10 @@ void summary::on_epoch_end(model *m) { } void summary::on_test_end(model *m) { + + if(!m_summarizer){ + LBANN_ERROR(BuildErrorMessage("Summary callback failed: m_summarizer does not exist.")); + } prof_region_begin("summary-test", prof_colors[0], false); lbann_comm *comm = m->get_comm(); for (auto&& met : m->get_metrics()) { @@ -111,6 +137,9 @@ void summary::on_test_end(model *m) { } void summary::save_histograms(model *m) { + if(!m_summarizer){ + LBANN_ERROR(BuildErrorMessage("Summary callback failed: m_summarizer does not exist.")); + } for (const auto& layer : m->get_layers()) { const std::string prefix = layer->get_name() + "/"; for (int i = 0; i < layer->get_num_children(); ++i) { @@ -143,8 +172,8 @@ build_summary_callback_from_pbuf( const auto& params = dynamic_cast(proto_msg); return make_unique(summarizer, - params.batch_interval(), - params.mat_interval()); + params.batch_interval(), + params.mat_interval()); } } // namespace callback diff --git a/src/callbacks/timer.cpp b/src/callbacks/timer.cpp index ddf42fdba39..e98b37aa6f0 100644 --- a/src/callbacks/timer.cpp +++ b/src/callbacks/timer.cpp @@ -40,7 +40,7 @@ void timer::batch_timing_end(const model& m) { const auto& mode = m.get_execution_mode(); const auto& batch_time = get_time() - m_batch_start_times[mode]; m_batch_times[mode].push_back(batch_time); - if (m_summarizer != nullptr) { + if (m_summarizer) { m_summarizer->reduce_scalar("minibatch_time", batch_time, m.get_step(execution_mode::training)-1); m_summarizer->reduce_scalar_all("minibatch_time", batch_time, m.get_step(execution_mode::training)-1); } diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index 647d852e09a..eda029bff41 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -121,7 +121,6 @@ message Callback { } message CallbackSummary { - string dir = 1; //directory for the lbann_summary int64 batch_interval = 2; //default in lbann_callback_summary.hpp is 1 int64 mat_interval = 3; //default in lbann_callback_summary.hpp is 25 } diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 79a79781c25..618045a1570 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -87,7 +87,7 @@ using factory_type = 
lbann::generic_factory< std::string, generate_builder_type, + std::shared_ptr const&>, default_key_error_policy>; namespace @@ -221,32 +221,26 @@ construct_callback( return factory.create_object(msg.GetDescriptor()->name(), msg, summarizer); } -lbann_summary* construct_summarizer(lbann_comm* comm, - const lbann_data::Model& m) { - lbann_summary *summary = nullptr; - bool master = comm->am_world_master(); - int size = m.callback_size(); - for (int j=0; j construct_summarizer(lbann_comm* comm, + const lbann_data::Model& m) { + const bool master = comm->am_world_master(); + if (m.has_summarizer()) { + auto dir = m.summarizer().dir(); + + if (master) { + std::cout << "constructing summarizer with dir: " << dir << std::endl; + } + + //check to see if directory exists + struct stat sb; + if (! ( stat(dir.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode) )) { + LBANN_ERROR(BuildErrorMessage("summary directory ", + dir, " does not exist.")); } + + return make_unique(dir, comm); } - return summary; + return nullptr; } } // namespace proto diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp index 63abc320e5a..cf34252397c 100644 --- a/src/proto/factories/model_factory.cpp +++ b/src/proto/factories/model_factory.cpp @@ -279,7 +279,7 @@ model* construct_model(lbann_comm* comm, // Construct callbacks std::vector> callback_list; - auto&& summarizer = construct_summarizer(comm, proto_model); + auto summarizer = std::shared_ptr(construct_summarizer(comm, proto_model)); for (int i=0; i Date: Mon, 5 Aug 2019 17:02:20 -0700 Subject: [PATCH 200/634] auxiliary functions use unique_ptr directly --- src/proto/factories/model_factory.cpp | 35 ++++++++++++--------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp index a2eea721962..8ad5ac1f865 100644 --- a/src/proto/factories/model_factory.cpp +++ b/src/proto/factories/model_factory.cpp @@ -79,7 +79,7 @@ instantiate_model(lbann_comm* comm, * Layer terms require pointers to layers. */ void assign_layers_to_objective_function( - std::vector& layer_list, + const std::vector>& layer_list, objective_function& obj, const lbann_data::ObjectiveFunction& proto_obj) { @@ -93,7 +93,7 @@ void assign_layers_to_objective_function( err << "layer name \"" << name << "\" is not unique"; LBANN_ERROR(err.str()); } - names_to_layers[name] = l; + names_to_layers[name] = l.get(); } // Assign layers to layer terms in objective function @@ -123,12 +123,12 @@ void assign_layers_to_objective_function( << "in the prototext"; LBANN_ERROR(err.str()); } - } -void assign_layers_to_metrics(std::vector& layer_list, - std::vector>& metric_list, - const lbann_data::Model& proto_model) { +void assign_layers_to_metrics( + const std::vector>& layer_list, + std::vector>& metric_list, + const lbann_data::Model& proto_model) { // Construct map from layer names to layers std::unordered_map names_to_layers; @@ -139,7 +139,7 @@ void assign_layers_to_metrics(std::vector& layer_list, err << "layer name \"" << name << "\" is not unique"; LBANN_ERROR(err.str()); } - names_to_layers[name] = l; + names_to_layers[name] = l.get(); } // Assign layers to layer metrics @@ -162,10 +162,10 @@ void assign_layers_to_metrics(std::vector& layer_list, } /** Setup pointers from layers to weights. 
*/ -void assign_weights_to_layers(std::vector& layer_list, - std::vector>& weights_list, - const lbann_data::Model& proto_model) { - std::stringstream err; +void assign_weights_to_layers( + const std::vector>& layer_list, + std::vector>& weights_list, + const lbann_data::Model& proto_model) { // Construct map from weights names to weights std::unordered_map names_to_weights; @@ -206,7 +206,7 @@ void assign_weights_to_layers(std::vector& layer_list, * L2 weight regularization requires pointers to weights. */ void assign_weights_to_objective_function( - std::vector>& weights_list, + const std::vector>& weights_list, objective_function& obj, const lbann_data::ObjectiveFunction& proto_obj) { @@ -260,16 +260,11 @@ std::unique_ptr construct_model( auto&& layer_list = construct_layer_graph(comm, data_readers, proto_model); - std::vector layer_pointers; - layer_pointers.reserve(layer_list.size()); - for (auto&& ptr : layer_list) { - layer_pointers.push_back(ptr.get()); - } // Construct objective function const auto& proto_obj = proto_model.objective_function(); auto obj = construct_objective_function(proto_obj); - assign_layers_to_objective_function(layer_pointers, *obj, proto_obj); + assign_layers_to_objective_function(layer_list, *obj, proto_obj); // Construct weights std::vector> weights_list; @@ -279,7 +274,7 @@ std::unique_ptr construct_model( proto_opt, proto_model.weights(i))); } - assign_weights_to_layers(layer_pointers, weights_list, proto_model); + assign_weights_to_layers(layer_list, weights_list, proto_model); assign_weights_to_objective_function(weights_list, *obj, proto_obj); // Construct metrics @@ -290,7 +285,7 @@ std::unique_ptr construct_model( params.name(), params.unit())); } - assign_layers_to_metrics(layer_pointers, metric_list, proto_model); + assign_layers_to_metrics(layer_list, metric_list, proto_model); // Construct callbacks std::vector> callback_list; From 330afd47238154fa767cd0904296ee1d418f67f9 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 5 Aug 2019 17:02:44 -0700 Subject: [PATCH 201/634] cleanup LBANN_ERROR code in model_factory --- include/lbann/utils/exception.hpp | 21 +++++++++ src/proto/factories/model_factory.cpp | 61 ++++++++++----------------- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/include/lbann/utils/exception.hpp b/include/lbann/utils/exception.hpp index c487eb6af64..d4669f95241 100644 --- a/include/lbann/utils/exception.hpp +++ b/include/lbann/utils/exception.hpp @@ -46,6 +46,9 @@ throw lbann::exception(ss_LBANN_ERROR.str()); \ } while (0) +#define LBANN_ERROR_STR(...) \ + LBANN_ERROR(build_string(__VA_ARGS__)) + // Macro to print a warning to standard error stream. #define LBANN_WARNING(message) \ do { \ @@ -91,6 +94,24 @@ class exception : public std::exception { }; using lbann_exception = exception; +/** @brief Build a string from the arguments. + * + * The arguments must be stream-outputable (have operator<<(ostream&, + * T) defined). It will be a static error if this fails. + * + * @tparam Args (Inferred) The types of the arguments. + * + * @param[in] args The things to be stringified. + */ +template +std::string build_string(Args&&... args) +{ + std::ostringstream oss; + int dummy[] = { (oss << args, 0)... 
};
+  (void) dummy; // silence compiler warnings
+  return oss.str();
+}
+
 } // namespace lbann

 #endif // LBANN_UTILS_EXCEPTION_HPP_INCLUDED

diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp
index 8ad5ac1f865..43e88eed6f5 100644
--- a/src/proto/factories/model_factory.cpp
+++ b/src/proto/factories/model_factory.cpp
@@ -68,9 +68,7 @@ instantiate_model(lbann_comm* comm,
   }

   // Throw error if model type is not supported
-  std::stringstream err;
-  err << "unknown model type (" << type << ")";
-  LBANN_ERROR(err.str());
+  LBANN_ERROR_STR("unknown model type (", type, ")");
   return nullptr;
 }

@@ -83,15 +81,12 @@ void assign_layers_to_objective_function(
   objective_function& obj,
   const lbann_data::ObjectiveFunction& proto_obj) {

-  std::stringstream err;
-
   // Construct map from layer names to layers
   std::unordered_map<std::string, Layer*> names_to_layers;
   for (auto&& l : layer_list) {
     const auto& name = l->get_name();
     if (names_to_layers.count(name) > 0) {
-      err << "layer name \"" << name << "\" is not unique";
-      LBANN_ERROR(err.str());
+      LBANN_ERROR_STR("layer name \"", name, "\" is not unique");
     }
     names_to_layers[name] = l.get();
   }
@@ -106,10 +101,9 @@ void assign_layers_to_objective_function(
       const auto& params = proto_obj.layer_term(num_layer_terms-1);
       auto* l = names_to_layers[params.layer()];
       if (l == nullptr) {
-        err << "attempted to set objective function layer term "
-            << "to correspond to layer \"" << params.layer() << "\", "
-            << "but no such layer exists";
-        LBANN_ERROR(err.str());
+        LBANN_ERROR_STR("attempted to set objective function layer term ",
+                        "to correspond to layer \"", params.layer(), "\", ",
+                        "but no such layer exists");
       }
       term->set_layer(*l);
     }
@@ -117,11 +111,9 @@ void assign_layers_to_objective_function(
   // Check that layer terms in objective function match prototext
   if (num_layer_terms != proto_obj.layer_term_size()) {
-    err << "recieved " << num_layer_terms << " "
-        << "objective function layer terms, "
-        << "but there are " << proto_obj.layer_term_size() << " "
-        << "in the prototext";
-    LBANN_ERROR(err.str());
+    LBANN_ERROR_STR("received ", num_layer_terms,
+                    " objective function layer terms, but there are ",
+                    proto_obj.layer_term_size(), " in the prototext");
   }
 }

@@ -135,9 +127,7 @@ void assign_layers_to_metrics(
   for (auto&& l : layer_list) {
     const auto& name = l->get_name();
     if (names_to_layers.count(name) > 0) {
-      std::stringstream err;
-      err << "layer name \"" << name << "\" is not unique";
-      LBANN_ERROR(err.str());
+      LBANN_ERROR_STR("layer name \"", name, "\" is not unique");
     }
     names_to_layers[name] = l.get();
   }
@@ -149,11 +139,10 @@ void assign_layers_to_metrics(
       const auto& params = proto_model.metric(i).layer_metric();
       auto* l = names_to_layers[params.layer()];
       if (l == nullptr) {
-        std::stringstream err;
-        err << "attempted to set layer metric \"" << m->name() << "\" "
-            << "to correspond to layer \"" << params.layer() << "\", "
-            << "but no such layer exists";
-        LBANN_ERROR(err.str());
+        LBANN_ERROR_STR("attempted to set layer metric "
+                        "\"", m->name(), "\" "
+                        "to correspond to layer \"", params.layer(), "\", "
+                        "but no such layer exists");
       }
       m->set_layer(*l);
     }
@@ -172,8 +161,7 @@ void assign_weights_to_layers(
   for (auto&& w : weights_list) {
     const auto& name = w->get_name();
     if (names_to_weights.count(name) > 0) {
-      err << "weights name \"" << name << "\" is not unique";
-      LBANN_ERROR(err.str());
+      LBANN_ERROR_STR("weights name \"", name, "\" is not unique");
     }
     names_to_weights[name] = w.get();
   }
@@ -186,9 +174,10 @@ void
assign_weights_to_layers( for (auto&& name : parse_list(proto_layer.weights())) { auto&& w = names_to_weights[name]; if (!w) { - err << "could not find weights named \"" << name << "\", " - << "which are expected by layer " << layer_list[i]->get_name(); - LBANN_ERROR(err.str()); + LBANN_ERROR_STR("could not find weights named " + "\"", name, "\", " + "which are expected by layer ", + layer_list[i]->get_name()); } if (is_frozen) { w->freeze(); @@ -210,15 +199,12 @@ void assign_weights_to_objective_function( objective_function& obj, const lbann_data::ObjectiveFunction& proto_obj) { - std::stringstream err; - // Construct map from weights names to weights std::unordered_map names_to_weights; for (auto&& w : weights_list) { const auto& name = w->get_name(); if (names_to_weights.count(name) > 0) { - err << "weights name \"" << name << "\" is not unique"; - LBANN_ERROR(err.str()); + LBANN_ERROR_STR("weights name \"", name, "\" is not unique"); } names_to_weights[name] = w.get(); } @@ -234,11 +220,10 @@ void assign_weights_to_objective_function( std::vector term_weights; for (auto&& weights_name : parse_list(params.weights())) { auto&& w = names_to_weights[weights_name]; - if (w == nullptr) { - err << "attempted to apply L2 weight regularization to " - << "weights \"" << weights_name << "\", " - << "but no such weights exists"; - LBANN_ERROR(err.str()); + if (!w) { + LBANN_ERROR_STR("attempted to apply L2 weight regularization to " + "weights \"", weights_name, "\", " + "but no such weights exists"); } term_weights.push_back(w); } From 84a79f925e01ca6180e13e8a8a815f784f671171 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Mon, 5 Aug 2019 17:46:20 -0700 Subject: [PATCH 202/634] Updated model_zoo prototext files to handle changes in summarizer constructor --- model_zoo/model.prototext | 232 ++++++++++++++++++ .../model_autoencoder_chem_sigmoid.prototext | 4 +- .../jag/wae_cycle_gan/cycle_gan.prototext | 11 +- .../wae_cycle_gan/cycle_gan_only.prototext | 10 +- .../lenet_mnist/model_lenet_mnist.prototext | 4 +- .../models/resnet50/model_resnet50.prototext | 5 +- .../model_mnist_simple_1.prototext | 5 +- .../model_mnist_simple_2.prototext | 4 +- .../tests/model_lenet_mnist_ckpt.prototext | 4 +- .../model_lenet_mnist_dist_ckpt.prototext | 5 +- .../model_lenet_mnist_lbann2ckpt.prototext | 5 +- 11 files changed, 273 insertions(+), 16 deletions(-) create mode 100644 model_zoo/model.prototext diff --git a/model_zoo/model.prototext b/model_zoo/model.prototext new file mode 100644 index 00000000000..6296f02c5f7 --- /dev/null +++ b/model_zoo/model.prototext @@ -0,0 +1,232 @@ +# cmd line for original experiment: +# $ /g/g13/graham63/workspace/pascal/lbann/install/bin/lbann --model=models/autoencoder_mnist/model_autoencoder_mnist.prototext --reader=data_readers/data_reader_mnist.prototext --optimizer=optimizers/opt_sgd.prototext +# +# Experiment conducted at: Mon Aug 5 10:08:01 2019 +# +# +# Experiment was run with lbann version: 0.99.0 +# +# +# To rerun the experiment: +# $ srun -n1 /g/g13/graham63/workspace/pascal/lbann/install/bin/lbann --prototext=model.prototext +# +# +# Selected SLURM Environment Variables: +# SLURM_NODELIST=pascal30 +# SLURM_NNODES=1 +# SLURM_NTASKS=1 +# SLURM_TASKS_PER_NODE=1 + +# +# +data_reader { + reader { + name: "mnist" + role: "train" + shuffle: true + data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" + data_filename: "train-images-idx3-ubyte" + label_filename: "train-labels-idx1-ubyte" + validation_percent: 0.1 + percent_of_data_to_use: 1 + transforms { + scale { + scale: 
0.00392156886 + } + } + } + reader { + name: "mnist" + role: "test" + shuffle: true + data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" + data_filename: "t10k-images-idx3-ubyte" + label_filename: "t10k-labels-idx1-ubyte" + percent_of_data_to_use: 1 + transforms { + scale { + scale: 0.00392156886 + } + } + } +} +model { + objective_function { + layer_term { + layer: "mean_squared_error" + } + } + num_epochs: 10 + data_layout: "model_parallel" + layer { + input { + } + name: "data" + data_layout: "data_parallel" + children: "image dummy" + } + layer { + name: "image" + data_layout: "data_parallel" + parents: "data" + split { + } + } + layer { + name: "dummy" + data_layout: "data_parallel" + parents: "data" + dummy { + } + } + layer { + fully_connected { + num_neurons: 1000 + weight_initialization: "glorot_uniform" + has_bias: true + } + name: "encode1" + data_layout: "model_parallel" + parents: "image" + } + layer { + name: "relu1" + data_layout: "model_parallel" + parents: "encode1" + relu { + } + } + layer { + fully_connected { + num_neurons: 500 + weight_initialization: "glorot_uniform" + has_bias: true + } + name: "encode2" + data_layout: "model_parallel" + parents: "relu1" + } + layer { + name: "relu2" + data_layout: "model_parallel" + parents: "encode2" + relu { + } + } + layer { + fully_connected { + num_neurons: 250 + weight_initialization: "glorot_uniform" + has_bias: true + } + name: "encode3" + data_layout: "model_parallel" + parents: "relu2" + } + layer { + name: "relu3" + data_layout: "model_parallel" + parents: "encode3" + relu { + } + } + layer { + fully_connected { + num_neurons: 30 + weight_initialization: "glorot_uniform" + has_bias: true + } + name: "encode4" + data_layout: "model_parallel" + parents: "relu3" + } + layer { + fully_connected { + num_neurons: 250 + weight_initialization: "glorot_uniform" + has_bias: true + } + name: "decode4" + data_layout: "model_parallel" + parents: "encode4" + } + layer { + name: "relu4" + data_layout: "model_parallel" + parents: "decode4" + relu { + } + } + layer { + fully_connected { + num_neurons: 500 + weight_initialization: "glorot_uniform" + has_bias: true + } + name: "decode3" + data_layout: "model_parallel" + parents: "relu4" + } + layer { + name: "relu5" + data_layout: "model_parallel" + parents: "decode3" + relu { + } + } + layer { + fully_connected { + num_neurons: 1000 + weight_initialization: "glorot_uniform" + has_bias: true + } + name: "decode2" + data_layout: "model_parallel" + parents: "relu5" + } + layer { + name: "relu6" + data_layout: "model_parallel" + parents: "decode2" + relu { + } + } + layer { + fully_connected { + weight_initialization: "glorot_uniform" + has_bias: true + } + name: "decode1" + data_layout: "model_parallel" + hint_layer: "image" + parents: "relu6" + } + layer { + name: "reconstruction" + data_layout: "model_parallel" + parents: "decode1" + sigmoid { + } + } + layer { + name: "mean_squared_error" + data_layout: "data_parallel" + mean_squared_error { + } + parents: "reconstruction image" + } + mini_batch_size: 10 + callback { + print { + interval: 1 + } + } + block_size: 256 + num_parallel_readers: 1 +} +optimizer { + sgd { + learn_rate: 0.01 + momentum: 0.9 + } +} diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext index 374ed07ec4c..67fbf7dda3e 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext +++ 
b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext @@ -20,6 +20,9 @@ model { ################################################### # Callbacks ################################################### + summarizer { + dir: "." + } callback { print { interval: 1 @@ -31,7 +34,6 @@ model { } callback { summary { - dir: "." batch_interval: 1 mat_interval: 25 } diff --git a/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext b/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext index 716840be3ae..92db21e4ca1 100644 --- a/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext @@ -1,8 +1,8 @@ model { name: "cycgan_model" - shareable_training_data_reader:false + shareable_training_data_reader:false serialize_io: true - procs_per_trainer:0 + procs_per_trainer:0 objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -48,7 +48,7 @@ model { layer: "L_cyc_x" } } - num_epochs: 40 + num_epochs: 40 super_steps: 10 metric { layer_metric { @@ -1001,9 +1001,12 @@ model { } callback { gpu_memory_usage {} } #callback { debug {} } + + #summarizer { + # dir: "." + #} #callback { # summary { - # dir: "." # mat_interval: 25 # } #} diff --git a/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext b/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext index 79a4e1264e7..adaeeb9456c 100644 --- a/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext @@ -1,8 +1,8 @@ model { name: "cycgan_model" - shareable_training_data_reader:false + shareable_training_data_reader:false serialize_io: true - procs_per_trainer:0 + procs_per_trainer:0 objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -48,7 +48,7 @@ model { layer: "L_cyc_x" } } - num_epochs: 40 + num_epochs: 40 super_steps: 10 metric { layer_metric { @@ -871,9 +871,11 @@ model { } callback { gpu_memory_usage {} } #callback { debug {} } + #summarizer { + # dir: "." + #} #callback { # summary { - # dir: "." # mat_interval: 25 # } #} diff --git a/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext b/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext index 99a763c52b6..d41b5c90339 100644 --- a/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext +++ b/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext @@ -32,12 +32,14 @@ model { ################################################### # Callbacks ################################################### + summarizer { + dir: "." + } callback { print {} } callback { timer {} } callback { summary { - dir: "." mat_interval: 25 } } diff --git a/model_zoo/models/resnet50/model_resnet50.prototext b/model_zoo/models/resnet50/model_resnet50.prototext index 520bb8aa11a..6ad463158cf 100644 --- a/model_zoo/models/resnet50/model_resnet50.prototext +++ b/model_zoo/models/resnet50/model_resnet50.prototext @@ -47,9 +47,12 @@ model { all_optimizers: true } } + + #summarizer { + # dir: "." + #} # callback { # summary { - # dir: "." # mat_interval: 25 # } # } diff --git a/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext b/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext index 77a1c7ed256..638368f5fb7 100644 --- a/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext +++ b/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext @@ -33,11 +33,14 @@ model { # Callbacks ################################################### + summarizer { + dir: "." 
+ } + callback { print {} } callback { timer {} } callback { summary { - dir: "." mat_interval: 25 } } diff --git a/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext b/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext index 0be924650f9..ea600bd2646 100644 --- a/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext +++ b/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext @@ -32,12 +32,14 @@ model { ################################################### # Callbacks ################################################### + summarizer { + dir: "." + } callback { print {} } callback { timer {} } callback { summary { - dir: "." mat_interval: 25 } } diff --git a/model_zoo/tests/model_lenet_mnist_ckpt.prototext b/model_zoo/tests/model_lenet_mnist_ckpt.prototext index e717e129366..060b2c223d3 100644 --- a/model_zoo/tests/model_lenet_mnist_ckpt.prototext +++ b/model_zoo/tests/model_lenet_mnist_ckpt.prototext @@ -32,12 +32,14 @@ model { ################################################### # Callbacks ################################################### + summarizer { + dir: "." + } callback { print {} } callback { timer {} } callback { summary { - dir: "." mat_interval: 25 } } diff --git a/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext b/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext index 8afa85edd18..922e0f483a6 100644 --- a/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext +++ b/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext @@ -33,11 +33,14 @@ model { # Callbacks ################################################### + summarizer { + dir: "." + } + callback { print {} } callback { timer {} } callback { summary { - dir: "." mat_interval: 25 } } diff --git a/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext b/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext index d8e7066afd5..7d17ecd18c4 100644 --- a/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext +++ b/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext @@ -33,11 +33,14 @@ model { # Callbacks ################################################### + summarizer { + dir: "." + } + callback { print {} } callback { timer {} } callback { summary { - dir: "." 
mat_interval: 25 } } From 29f7503b1a7f8bf243bc02abdc780702e8a3775a Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Mon, 5 Aug 2019 17:57:24 -0700 Subject: [PATCH 203/634] "Removed model.prototext- committed by mistake" --- model_zoo/model.prototext | 232 -------------------------------------- 1 file changed, 232 deletions(-) delete mode 100644 model_zoo/model.prototext diff --git a/model_zoo/model.prototext b/model_zoo/model.prototext deleted file mode 100644 index 6296f02c5f7..00000000000 --- a/model_zoo/model.prototext +++ /dev/null @@ -1,232 +0,0 @@ -# cmd line for original experiment: -# $ /g/g13/graham63/workspace/pascal/lbann/install/bin/lbann --model=models/autoencoder_mnist/model_autoencoder_mnist.prototext --reader=data_readers/data_reader_mnist.prototext --optimizer=optimizers/opt_sgd.prototext -# -# Experiment conducted at: Mon Aug 5 10:08:01 2019 -# -# -# Experiment was run with lbann version: 0.99.0 -# -# -# To rerun the experiment: -# $ srun -n1 /g/g13/graham63/workspace/pascal/lbann/install/bin/lbann --prototext=model.prototext -# -# -# Selected SLURM Environment Variables: -# SLURM_NODELIST=pascal30 -# SLURM_NNODES=1 -# SLURM_NTASKS=1 -# SLURM_TASKS_PER_NODE=1 - -# -# -data_reader { - reader { - name: "mnist" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "train-images-idx3-ubyte" - label_filename: "train-labels-idx1-ubyte" - validation_percent: 0.1 - percent_of_data_to_use: 1 - transforms { - scale { - scale: 0.00392156886 - } - } - } - reader { - name: "mnist" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "t10k-images-idx3-ubyte" - label_filename: "t10k-labels-idx1-ubyte" - percent_of_data_to_use: 1 - transforms { - scale { - scale: 0.00392156886 - } - } - } -} -model { - objective_function { - layer_term { - layer: "mean_squared_error" - } - } - num_epochs: 10 - data_layout: "model_parallel" - layer { - input { - } - name: "data" - data_layout: "data_parallel" - children: "image dummy" - } - layer { - name: "image" - data_layout: "data_parallel" - parents: "data" - split { - } - } - layer { - name: "dummy" - data_layout: "data_parallel" - parents: "data" - dummy { - } - } - layer { - fully_connected { - num_neurons: 1000 - weight_initialization: "glorot_uniform" - has_bias: true - } - name: "encode1" - data_layout: "model_parallel" - parents: "image" - } - layer { - name: "relu1" - data_layout: "model_parallel" - parents: "encode1" - relu { - } - } - layer { - fully_connected { - num_neurons: 500 - weight_initialization: "glorot_uniform" - has_bias: true - } - name: "encode2" - data_layout: "model_parallel" - parents: "relu1" - } - layer { - name: "relu2" - data_layout: "model_parallel" - parents: "encode2" - relu { - } - } - layer { - fully_connected { - num_neurons: 250 - weight_initialization: "glorot_uniform" - has_bias: true - } - name: "encode3" - data_layout: "model_parallel" - parents: "relu2" - } - layer { - name: "relu3" - data_layout: "model_parallel" - parents: "encode3" - relu { - } - } - layer { - fully_connected { - num_neurons: 30 - weight_initialization: "glorot_uniform" - has_bias: true - } - name: "encode4" - data_layout: "model_parallel" - parents: "relu3" - } - layer { - fully_connected { - num_neurons: 250 - weight_initialization: "glorot_uniform" - has_bias: true - } - name: "decode4" - data_layout: "model_parallel" - parents: "encode4" - } - layer { - name: "relu4" - data_layout: "model_parallel" - parents: "decode4" - relu { - } - } 
- layer { - fully_connected { - num_neurons: 500 - weight_initialization: "glorot_uniform" - has_bias: true - } - name: "decode3" - data_layout: "model_parallel" - parents: "relu4" - } - layer { - name: "relu5" - data_layout: "model_parallel" - parents: "decode3" - relu { - } - } - layer { - fully_connected { - num_neurons: 1000 - weight_initialization: "glorot_uniform" - has_bias: true - } - name: "decode2" - data_layout: "model_parallel" - parents: "relu5" - } - layer { - name: "relu6" - data_layout: "model_parallel" - parents: "decode2" - relu { - } - } - layer { - fully_connected { - weight_initialization: "glorot_uniform" - has_bias: true - } - name: "decode1" - data_layout: "model_parallel" - hint_layer: "image" - parents: "relu6" - } - layer { - name: "reconstruction" - data_layout: "model_parallel" - parents: "decode1" - sigmoid { - } - } - layer { - name: "mean_squared_error" - data_layout: "data_parallel" - mean_squared_error { - } - parents: "reconstruction image" - } - mini_batch_size: 10 - callback { - print { - interval: 1 - } - } - block_size: 256 - num_parallel_readers: 1 -} -optimizer { - sgd { - learn_rate: 0.01 - momentum: 0.9 - } -} From 6f2546d373b25b4643da18cc9910dd6f7cb210db Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 6 Aug 2019 09:11:51 -0700 Subject: [PATCH 204/634] resolved namespace and documentation issues --- include/lbann/callbacks/callback.hpp | 4 ++++ src/proto/factories/callback_factory.cpp | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/lbann/callbacks/callback.hpp b/include/lbann/callbacks/callback.hpp index 224b0d04399..f7e7c428188 100644 --- a/include/lbann/callbacks/callback.hpp +++ b/include/lbann/callbacks/callback.hpp @@ -76,6 +76,10 @@ class callback_base { virtual callback_base* copy() const = 0; + ///@} + /** @name Modifiers */ + ///@{ + /** @brief Called once to set up the callback (after all layers are * set up). */ diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 618045a1570..e31a1bd9309 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -90,8 +90,6 @@ using factory_type = lbann::generic_factory< std::shared_ptr const&>, default_key_error_policy>; -namespace -{ template std::string BuildErrorMessage(Ts... args) { @@ -100,7 +98,6 @@ std::string BuildErrorMessage(Ts... args) (void) dummy; LBANN_ERROR(oss.str()); } -} void register_default_builders(factory_type& factory) { From 05095bc7dc560d5f25898deeec00eb1ed04b9a3f Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Tue, 6 Aug 2019 09:44:20 -0700 Subject: [PATCH 205/634] fix style --- src/proto/factories/optimizer_factory.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/proto/factories/optimizer_factory.cpp b/src/proto/factories/optimizer_factory.cpp index c0f460263b6..220f25fef46 100644 --- a/src/proto/factories/optimizer_factory.cpp +++ b/src/proto/factories/optimizer_factory.cpp @@ -51,8 +51,7 @@ using factory_type = lbann::generic_factory< lbann_comm*>, default_key_error_policy>; -void register_default_builders(factory_type& factory) -{ +void register_default_builders(factory_type& factory) { factory.register_builder("AdaGrad", build_adagrad_optimizer_from_pbuf); factory.register_builder("Adam", build_adam_optimizer_from_pbuf); factory.register_builder("HypergradientAdam", @@ -62,8 +61,7 @@ void register_default_builders(factory_type& factory) } // Manage a global factory -struct factory_manager -{ +struct factory_manager { factory_type factory_; factory_manager() { @@ -72,8 +70,7 @@ struct factory_manager }; factory_manager factory_mgr_; -factory_type const& get_optimizer_factory() noexcept -{ +factory_type const& get_optimizer_factory() noexcept { return factory_mgr_.factory_; } From 48d7709e7ae0ad97c13fda900da8ba7705e82ad7 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Tue, 6 Aug 2019 09:46:53 -0700 Subject: [PATCH 206/634] fix style -- transform factory --- src/proto/factories/transform_factory.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/proto/factories/transform_factory.cpp b/src/proto/factories/transform_factory.cpp index 6a8fce802bd..4b855e0cf4d 100644 --- a/src/proto/factories/transform_factory.cpp +++ b/src/proto/factories/transform_factory.cpp @@ -66,8 +66,7 @@ using factory_type = lbann::generic_factory< google::protobuf::Message const&>, default_key_error_policy>; -void register_default_builders(factory_type& factory) -{ +void register_default_builders(factory_type& factory) { using namespace transform; factory.register_builder("AdjustBrightness", build_adjust_brightness_transform_from_pbuf); factory.register_builder("AdjustContrast", build_adjust_contrast_transform_from_pbuf); @@ -93,8 +92,7 @@ void register_default_builders(factory_type& factory) } // Manage a global factory -struct factory_manager -{ +struct factory_manager { factory_type factory_; factory_manager() { @@ -103,8 +101,7 @@ struct factory_manager }; factory_manager factory_mgr_; -factory_type const& get_transform_factory() noexcept -{ +factory_type const& get_transform_factory() noexcept { return factory_mgr_.factory_; } From fd111389a8322647f501e0a6aae471fc3a59eb19 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Tue, 6 Aug 2019 09:50:56 -0700 Subject: [PATCH 207/634] more fix style -- transforms --- src/transforms/normalize.cpp | 3 +-- src/transforms/sample_normalize.cpp | 3 +-- src/transforms/scale.cpp | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/transforms/normalize.cpp b/src/transforms/normalize.cpp index 28ff39bb18b..374b5e340d6 100644 --- a/src/transforms/normalize.cpp +++ b/src/transforms/normalize.cpp @@ -106,8 +106,7 @@ void normalize::apply(utils::type_erased_matrix& data, CPUMat& out, } std::unique_ptr -build_normalize_transform_from_pbuf(google::protobuf::Message const& msg) -{ +build_normalize_transform_from_pbuf(google::protobuf::Message const& msg) { auto& pb_trans = dynamic_cast(msg); return make_unique( parse_list(pb_trans.means()), diff --git a/src/transforms/sample_normalize.cpp b/src/transforms/sample_normalize.cpp index 2f8eab63ea3..392a9fdf901 100644 --- a/src/transforms/sample_normalize.cpp +++ b/src/transforms/sample_normalize.cpp @@ -47,8 +47,7 @@ void sample_normalize::apply(utils::type_erased_matrix& data, std::vector -build_sample_normalize_transform_from_pbuf(google::protobuf::Message const&) -{ +build_sample_normalize_transform_from_pbuf(google::protobuf::Message const&) { return make_unique(); } diff --git a/src/transforms/scale.cpp b/src/transforms/scale.cpp index f1b5950e4c1..bf16e71e9ce 100644 --- a/src/transforms/scale.cpp +++ b/src/transforms/scale.cpp @@ -47,8 +47,7 @@ void scale::apply(utils::type_erased_matrix& data, std::vector&) { } std::unique_ptr -build_scale_transform_from_pbuf(google::protobuf::Message const& msg) -{ +build_scale_transform_from_pbuf(google::protobuf::Message const& msg) { auto const& params = dynamic_cast(msg); return make_unique(params.scale()); } From 71f147c2b68efd39d68c4a05f671f128726f828e Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Tue, 6 Aug 2019 09:59:19 -0700 Subject: [PATCH 208/634] fix style -- exception --- include/lbann/utils/exception.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/lbann/utils/exception.hpp b/include/lbann/utils/exception.hpp index d4669f95241..ddfd4a39c33 100644 --- a/include/lbann/utils/exception.hpp +++ b/include/lbann/utils/exception.hpp @@ -104,8 +104,7 @@ using lbann_exception = exception; * @param[in] args The things to be stringified. */ template -std::string build_string(Args&&... args) -{ +std::string build_string(Args&&... args) { std::ostringstream oss; int dummy[] = { (oss << args, 0)... }; (void) dummy; // silence compiler warnings From a5131c43d45235c3abe183056d67a91b5db7fe6e Mon Sep 17 00:00:00 2001 From: "Brian C. 
Van Essen" Date: Tue, 6 Aug 2019 12:06:53 -0700 Subject: [PATCH 209/634] Updated the OS X build documents to reflect new bugfixes --- docs/build_osx.rst | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/build_osx.rst b/docs/build_osx.rst index 3e5ce179a09..12470de11fa 100644 --- a/docs/build_osx.rst +++ b/docs/build_osx.rst @@ -41,6 +41,7 @@ Setup Homebrew brew install open-mpi brew install scalapack brew install cmake + brew install hwloc Put the brew based clang in your path: @@ -98,17 +99,24 @@ Building & Installing LBANN as a developer cd ${LBANN_BUILD_DIR} cmake \ -G Ninja \ + -D CMAKE_EXPORT_COMPILE_COMMANDS=ON \ -D CMAKE_BUILD_TYPE:STRING=Release \ -D CMAKE_INSTALL_PREFIX:PATH=${LBANN_INSTALL_DIR} \ \ -D LBANN_SB_BUILD_ALUMINUM=ON \ -D ALUMINUM_ENABLE_MPI_CUDA=OFF \ -D ALUMINUM_ENABLE_NCCL=OFF \ + -D LBANN_SB_FWD_ALUMINUM_OpenMP_CXX_LIB_NAMES=omp \ + -D LBANN_SB_FWD_ALUMINUM_OpenMP_CXX_FLAGS=-fopenmp \ + -D LBANN_SB_FWD_ALUMINUM_OpenMP_omp_LIBRARY=/usr/local/opt/llvm/lib/libomp.dylib \ \ -D LBANN_SB_BUILD_HYDROGEN=ON \ -D Hydrogen_ENABLE_ALUMINUM=ON \ -D Hydrogen_ENABLE_CUB=OFF \ -D Hydrogen_ENABLE_CUDA=OFF \ + -D LBANN_SB_FWD_HYDROGEN_OpenMP_CXX_LIB_NAMES=omp \ + -D LBANN_SB_FWD_HYDROGEN_OpenMP_CXX_FLAGS="-fopenmp=libomp" \ + -D LBANN_SB_FWD_HYDROGEN_OpenMP_omp_LIBRARY=/usr/local/opt/llvm/lib/libomp.dylib \ \ -D LBANN_SB_BUILD_LBANN=ON \ -D LBANN_DATATYPE:STRING=float \ @@ -123,12 +131,13 @@ Building & Installing LBANN as a developer -D LBANN_WITH_TOPO_AWARE:BOOL=ON \ -D LBANN_WITH_TBINF=OFF \ -D LBANN_WITH_VTUNE:BOOL=OFF \ + -D LBANN_SB_FWD_LBANN_HWLOC_DIR=/usr/local/opt/hwloc \ + -D LBANN_SB_FWD_LBANN_OpenMP_CXX_LIB_NAMES=omp \ + -D LBANN_SB_FWD_LBANN_OpenMP_CXX_FLAGS="-fopenmp=libomp" \ + -D LBANN_SB_FWD_LBANN_OpenMP_omp_LIBRARY=/usr/local/opt/llvm/lib/libomp.dylib \ \ -D CMAKE_CXX_COMPILER=$(which clang) \ -D CMAKE_C_COMPILER=$(which clang) \ - -D LBANN_SB_FWD_ALUMINUM_OpenMP_CXX_LIB_NAMES=omp \ - -D LBANN_SB_FWD_ALUMINUM_OpenMP_CXX_FLAGS=-fopenmp \ - -D LBANN_SB_FWD_ALUMINUM_OpenMP_omp_LIBRARY=/usr/local/opt/llvm/lib/libomp.dylib \ ${LBANN_HOME}/superbuild ninja From d09bfd8db9eb804a3aaab40a5509fbd98ffd806d Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 6 Aug 2019 14:04:30 -0700 Subject: [PATCH 210/634] Resolved issue: made member variable private --- include/lbann/callbacks/summary.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/lbann/callbacks/summary.hpp b/include/lbann/callbacks/summary.hpp index 53f29e605bb..af943dc2f36 100644 --- a/include/lbann/callbacks/summary.hpp +++ b/include/lbann/callbacks/summary.hpp @@ -60,15 +60,15 @@ class summary : public callback_base { void on_test_end(model *m) override; std::string name() const override { return "summary"; } -private: - /**@brief lbann_summary */ - std::shared_ptr m_summarizer = nullptr; - protected: /** Write out histograms from the model's layers. */ void save_histograms(model *m); -/** Interval for doing matrix summarization. */ +private: + /**@brief lbann_summary */ + std::shared_ptr m_summarizer = nullptr; + + /** Interval for doing matrix summarization. */ int m_mat_interval; }; From 40859f5307baaad07c5466bcdecf408125e2c6f7 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Tue, 6 Aug 2019 16:06:43 -0700 Subject: [PATCH 211/634] use realpaths instead of abspaths in setup.py --- cmake/configure_files/setup.py.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/configure_files/setup.py.in b/cmake/configure_files/setup.py.in index bd6dae0516b..3f4bc1a1597 100644 --- a/cmake/configure_files/setup.py.in +++ b/cmake/configure_files/setup.py.in @@ -9,9 +9,9 @@ config_file = '@_PYTHON_CONFIG_INI@' # Get relative paths # Note: setuptools does not accept absolute paths -current_dir = os.path.dirname(os.path.abspath(__file__)) -src_dir = os.path.relpath(os.path.abspath(src_dir), current_dir) -config_file = os.path.relpath(os.path.abspath(config_file), current_dir) +current_dir = os.path.dirname(os.path.realpath(__file__)) +src_dir = os.path.relpath(os.path.realpath(src_dir), current_dir) +config_file = os.path.relpath(os.path.realpath(config_file), current_dir) # Setup package setuptools.setup( From 0933d1e8e4317eec25e86d414ccfb4443765f44f Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Wed, 7 Aug 2019 10:05:46 -0700 Subject: [PATCH 212/634] fix a cmake ordering issue --- CMakeLists.txt | 8 ++++---- src/utils/threads/CMakeLists.txt | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a8577a6c3e..588e32a30a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -470,16 +470,16 @@ add_subdirectory(docs) # Build LBANN ################################################################ +# Add LBANN source files +add_subdirectory(include) +add_subdirectory(src) + # Write the configure file configure_file( "${CMAKE_SOURCE_DIR}/cmake/configure_files/lbann_config.hpp.in" "${CMAKE_BINARY_DIR}/lbann_config.hpp" @ONLY) -# Add LBANN source files -add_subdirectory(include) -add_subdirectory(src) - # Create the LBANN library add_library(lbann ${LBANN_SOURCES} ${LBANN_HEADERS} ${LBANN_CUDA_SOURCES}) diff --git a/src/utils/threads/CMakeLists.txt b/src/utils/threads/CMakeLists.txt index dbd8ab558f0..c6d0b198928 100644 --- a/src/utils/threads/CMakeLists.txt +++ b/src/utils/threads/CMakeLists.txt @@ -24,7 +24,8 @@ check_cxx_source_compiles( set(CMAKE_REQUIRED_LIBRARIES) if (LBANN_HAS_PTHREAD_SETAFFINITY_NP AND LBANN_HAS_PTHREAD_GETAFFINITY_NP) - set(LBANN_HAS_PTHREAD_AFFINITY_SUPPORT TRUE) + set(LBANN_HAS_PTHREAD_AFFINITY_SUPPORT TRUE CACHE INTERNAL + "LBANN has pthread affinity support") endif () # Add the source files for this directory From 23a1ee1da24e03404308b24e804bfbdcbe44e891 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Wed, 7 Aug 2019 11:58:35 -0700 Subject: [PATCH 213/634] Update LBANN flags --- bamboo/common_python/test_tools.py | 78 ++++++++++++++++++++-- bamboo/common_python/tools.py | 100 ++++++++++++++++++++++++++--- 2 files changed, 163 insertions(+), 15 deletions(-) diff --git a/bamboo/common_python/test_tools.py b/bamboo/common_python/test_tools.py index 8d2ec02ff5d..4ac070e9e2f 100644 --- a/bamboo/common_python/test_tools.py +++ b/bamboo/common_python/test_tools.py @@ -23,6 +23,7 @@ num_epochs=7, optimizer_name='adagrad', processes_per_model=10, + extra_lbann_flags={'block_size': 4, 'print_affinity': None}, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) @@ -30,32 +31,37 @@ def test_command_catalyst(): actual = tools.get_command(cluster='catalyst', **d) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe 
--reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' + assert actual == expected + +def test_command_corona(): + actual = tools.get_command(cluster='corona', **d) + expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' assert actual == expected def test_command_lassen(): actual = tools.get_command(cluster='lassen', **d) - expected = 'bsub -G guests -Is -q pdebug -nnodes 20 -W 30 jsrun -b "packed:10" -c 40 -g 4 -d packed -n 16 -r 1 -a 4 exe --data_filedir=gpfs1/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + expected = 'bsub -G guests -Is -q pdebug -nnodes 20 -W 30 jsrun -b "packed:10" -c 40 -g 4 -d packed -n 16 -r 1 -a 4 exe --data_filedir=gpfs1/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' assert actual == expected def test_command_pascal(): actual = tools.get_command(cluster='pascal', **d) - expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' assert actual == expected def test_command_ray(): actual = tools.get_command(cluster='ray', **d) - expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R 
"span[ptile=2]" -W 30 mpirun --timeout=30 -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 > output_file 2> error_file' + expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun --timeout=30 -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' assert actual == expected # Test error cases ############################################################ -def test_blacklisted_substrings(): +def test_blacklisted_substrings_1(): try: tools.get_command('ray', 'exe', partition=';', optimizer_path='--model=new_model', @@ -67,6 +73,30 @@ def test_blacklisted_substrings(): assert actual == expected +def test_blacklisted_substrings_2(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags={'--bad_key': 5}, + check_executable_existence=False) + assert False + except Exception as e: + actual = str(e) + expected = 'Invalid character(s): --bad_key contains --' + assert actual == expected + + +def test_blacklisted_substrings_3(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags={'key': '--bad_value'}, + check_executable_existence=False) + assert False + except Exception as e: + actual = str(e) + expected = 'Invalid character(s): --bad_value contains --' + assert actual == expected + + def test_unsupported_cluster(): try: tools.get_command('q', 'exe', check_executable_existence=False) @@ -391,3 +421,41 @@ def test_bad_data_filedir_15(): actual = str(e) expected = 'Invalid Usage: At least one of [data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default] is set, but neither data_reader_name or data_reader_path are.' assert actual == expected + + +def test_bad_extra_lbann_flags_invalid_flag(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags={'invalid_flag': 'value'}, + check_executable_existence=False) + assert False + except Exception as e: + actual = str(e) + expected = ("Invalid Usage: extra_lbann_flags includes invalid" + " flag=invalid_flag. Flags must" + " be in ['block_size', 'procs_per_trainer', 'num_gpus'," + " 'num_parallel_readers', 'num_io_threads', 'serialize_io'," + " 'disable_background_io_activity', 'disable_cuda'," + " 'random_seed', 'objective_function', 'data_layout'," + " 'print_affinity', 'use_data_store', 'preload_data_store'," + " 'super_node', 'write_sample_list', 'ltfb_verbose'," + " 'index_list_train', 'index_list_test'," + " 'label_filename_train', 'label_filename_test'," + " 'share_testing_data_readers', 'image_dir', 'no_im_comm']." + ) + assert actual == expected + + +def test_bad_extra_lbann_flags_not_a_dict(): + try: + tools.get_command('ray', 'exe', partition='pdebug', + extra_lbann_flags='invalid_flag', + check_executable_existence=False) + assert False + except Exception as e: + actual = str(e) + expected = ( + 'Invalid Usage: extra_lbann_flags must be a dict e.g. 
`{flag :' + ' None, flag: 4}`. Use `None` if a flag has no value attached ' + 'to it.') + assert actual == expected diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 7f48f57461a..23480d4d802 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -5,7 +5,7 @@ def check_list(substrings, strings): errors = [] for string in strings: for substring in substrings: - if (string is not None) and (substring in string): + if (string is not None) and (isinstance(string, str)) and (substring in string): errors.append('%s contains %s' % (string, substring)) return errors @@ -35,6 +35,7 @@ def get_command(cluster, optimizer_name=None, optimizer_path=None, processes_per_model=None, + extra_lbann_flags=None, ckpt_dir=None, output_file_name=None, error_file_name=None, @@ -44,12 +45,26 @@ def get_command(cluster, # Check parameters for black-listed characters like semi-colons that # would terminate the command and allow for an extra command blacklist = [';', '--'] - strings = [partition, dir_name, data_filedir_default, - data_filedir_train_default, - data_filename_train_default, data_filedir_test_default, - data_filename_test_default, data_reader_name, data_reader_path, - model_folder, model_name, model_path, optimizer_name, - optimizer_path, output_file_name, error_file_name] + strings = [ + cluster, executable, num_nodes, partition, time_limit, num_processes, + dir_name, data_filedir_default, data_filedir_train_default, + data_filename_train_default, data_filedir_test_default, + data_filename_test_default, data_reader_name, data_reader_path, + data_reader_percent, exit_after_setup, metadata, mini_batch_size, + model_folder, model_name, model_path, num_epochs, optimizer_name, + optimizer_path, processes_per_model, ckpt_dir, output_file_name, + error_file_name, return_tuple, check_executable_existence, skip_no_exe + ] + lbann_errors = [] + if extra_lbann_flags is not None: + if not isinstance(extra_lbann_flags, dict): + lbann_errors.append( + ('extra_lbann_flags must be a dict e.g. `{flag :' + ' None, flag: 4}`. Use `None` if a flag has no value attached ' + 'to it.')) + else: + strings += list(extra_lbann_flags.keys()) + strings += list(extra_lbann_flags.values()) invalid_character_errors = check_list(blacklist, strings) if invalid_character_errors != []: raise Exception('Invalid character(s): %s' % ' , '.join( @@ -245,7 +260,6 @@ def get_command(cluster, option_num_epochs = '' option_optimizer = '' option_processes_per_model = '' - lbann_errors = [] if model_path is not None: # If model_folder and/or model_name are set, an exception will be # raised later. @@ -387,17 +401,83 @@ def get_command(cluster, option_processes_per_model = ' --procs_per_model=%d' % processes_per_model if ckpt_dir is not None: option_ckpt_dir = ' --ckpt_dir=%s' % ckpt_dir + if extra_lbann_flags is not None: + # If extra_lbann_flags is not a dict, then we have already appended + # this error to lbann_errors. 
+ if isinstance(extra_lbann_flags, dict): + extra_options = '' + # See `lbann --help` or src/proto/proto_common.cpp + allowed_flags = [ + # 'model', + # 'optimizer', + # 'reader', + # 'metadata', + + # General: + # 'mini_batch_size', + # 'num_epochs', + 'block_size', + 'procs_per_trainer', + 'num_gpus', + 'num_parallel_readers', + 'num_io_threads', + 'serialize_io', + 'disable_background_io_activity', + 'disable_cuda', + 'random_seed', + 'objective_function', + 'data_layout', + 'print_affinity', + 'use_data_store', + 'preload_data_store', + 'super_node', + 'write_sample_list', + 'ltfb_verbose', + + # DataReaders: + # 'data_filedir', + # 'data_filedir_train', + # 'data_filedir_test', + # 'data_filename_train', + # 'data_filename_test', + 'index_list_train', + 'index_list_test', + 'label_filename_train', + 'label_filename_test', + # 'data_reader_percent', + 'share_testing_data_readers', + + # Callbacks: + 'image_dir', + 'no_im_comm', + + # Not listed by `lbann --help`: + # 'ckpt_dir', + # 'exit_after_setup', + # 'procs_per_model' + ] + for flag, value in sorted(extra_lbann_flags.items()): + if flag in allowed_flags: + if value is not None: + extra_options += ' --{f}={v}'.format(f=flag, v=value) + else: + extra_options += ' --{f}'.format(f=flag) + else: + s = ('extra_lbann_flags includes invalid flag={f}.' + ' Flags must be in {flags}.').format( + f=flag, flags=allowed_flags) + lbann_errors.append(s) if lbann_errors != []: print('lbann_errors={lbann_errors}.'.format(lbann_errors=lbann_errors)) raise Exception('Invalid Usage: ' + ' , '.join(lbann_errors)) - command_lbann = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % ( + command_lbann = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % ( executable, option_ckpt_dir, option_data_filedir, option_data_filedir_train, option_data_filename_train, option_data_filedir_test, option_data_filename_test, option_data_reader, option_data_reader_percent, option_exit_after_setup, option_metadata, option_mini_batch_size, option_model, option_num_epochs, option_optimizer, - option_processes_per_model) + option_processes_per_model, extra_options) # Create redirect command command_output = '' From 9db8f8041d1d49daf2d913b1956a03a195ae0114 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Wed, 7 Aug 2019 13:04:30 -0700 Subject: [PATCH 214/634] Update time limits --- bamboo/common_python/tools.py | 16 +++------------- bamboo/unit_tests/test_unit_layer_clamp.py | 1 + bamboo/unit_tests/test_unit_layer_covariance.py | 1 + bamboo/unit_tests/test_unit_layer_elu.py | 1 + bamboo/unit_tests/test_unit_layer_identity.py | 1 + bamboo/unit_tests/test_unit_layer_l1_norm.py | 1 + bamboo/unit_tests/test_unit_layer_l2_norm2.py | 1 + bamboo/unit_tests/test_unit_layer_leaky_relu.py | 1 + bamboo/unit_tests/test_unit_layer_log_sigmoid.py | 1 + bamboo/unit_tests/test_unit_layer_log_softmax.py | 4 +++- .../test_unit_layer_mean_absolute_error.py | 1 + bamboo/unit_tests/test_unit_layer_relu.py | 4 +++- bamboo/unit_tests/test_unit_layer_selu.py | 4 +++- bamboo/unit_tests/test_unit_layer_sigmoid.py | 4 +++- bamboo/unit_tests/test_unit_layer_softmax.py | 4 +++- bamboo/unit_tests/test_unit_layer_softplus.py | 1 + bamboo/unit_tests/test_unit_layer_softsign.py | 1 + .../test_unit_layer_squared_difference.py | 1 + bamboo/unit_tests/test_unit_layer_tessellate.py | 1 + bamboo/unit_tests/test_unit_layer_variance.py | 4 +++- 20 files changed, 34 insertions(+), 19 deletions(-) diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 23480d4d802..8ea802b1ece 100644 --- 
a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -70,10 +70,9 @@ def get_command(cluster, raise Exception('Invalid character(s): %s' % ' , '.join( invalid_character_errors)) - # Never give lbannusr an allocation for over 12 hours though. - strict_time_limit = 60*6 # 6 hours. - if (time_limit is None) or (time_limit > strict_time_limit): - time_limit = strict_time_limit + MAX_TIME = 360 # 6 hours. + if (time_limit is None) or (time_limit > MAX_TIME): + time_limit = MAX_TIME # Check executable existence if check_executable_existence: @@ -87,7 +86,6 @@ def get_command(cluster, else: raise Exception('Unsupported Cluster: %s' % cluster) - MAX_TIME = 600 # Description of command line options are from the appropriate command's # man pages if scheduler == 'slurm': @@ -128,10 +126,6 @@ def get_command(cluster, # Create run command if command_allocate == '': space = '' - # If nodes have already been allocated, - # then an individual test should not take longer than MAX_TIME. - if time_limit > MAX_TIME: - time_limit = MAX_TIME else: space = ' ' command_run = '{s}srun --mpibind=off --time={t}'.format( @@ -197,10 +191,6 @@ def get_command(cluster, # Create run command if command_allocate == '': space = '' - # If nodes have already been allocated, - # then an individual test should not take longer than MAX_TIME. - if time_limit > MAX_TIME: - time_limit = MAX_TIME else: space = ' ' if cluster == 'lassen': diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py b/bamboo/unit_tests/test_unit_layer_clamp.py index ea8182b8a56..3b9cb25593e 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -14,6 +14,7 @@ def skeleton_layer_clamp(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_clamp_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='clamp', diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py index 74d0b8da353..1bf45a4ac5c 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -14,6 +14,7 @@ def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_covariance_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='covariance', diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index 76332cebcec..b7d7a969c9a 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -14,6 +14,7 @@ def skeleton_layer_elu(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_elu_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='elu', diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py 
index 1ea2d742d22..ec1b58c9b14 100644 --- a/bamboo/unit_tests/test_unit_layer_identity.py +++ b/bamboo/unit_tests/test_unit_layer_identity.py @@ -14,6 +14,7 @@ def skeleton_layer_identity(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_identity_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='identity', diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index 6f39cb5a242..b334cc10011 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -14,6 +14,7 @@ def skeleton_layer_l1_norm(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_l1_norm_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='l1_norm', diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py b/bamboo/unit_tests/test_unit_layer_l2_norm2.py index a90fa330eb0..76c6a5adcc7 100644 --- a/bamboo/unit_tests/test_unit_layer_l2_norm2.py +++ b/bamboo/unit_tests/test_unit_layer_l2_norm2.py @@ -14,6 +14,7 @@ def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_l2_norm2_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='l2_norm2', diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index 68b6d8d0fdd..66ad045cccc 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -14,6 +14,7 @@ def skeleton_layer_leaky_relu(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_leaky_relu_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='leaky_relu', diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index 93faa462298..750c089833a 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -14,6 +14,7 @@ def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_log_sigmoid_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='log_sigmoid', diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 6fe031609c3..75a4a6b7bbd 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ 
b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -14,7 +14,9 @@ def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_log_softmax_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, + num_nodes=1, + time_limit=10, + num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='log_softmax', optimizer_name='sgd', diff --git a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index 7db9912503a..562b906d597 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -14,6 +14,7 @@ def skeleton_layer_mean_absolute_error(cluster, executables, dir_name, compiler_ error_file_name = '%s/bamboo/unit_tests/error/layer_mean_absolute_error_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='mean_absolute_error', diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index 8136ad8f712..1e6212fec18 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -14,7 +14,9 @@ def skeleton_layer_relu(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_relu_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, + num_nodes=1, + time_limit=10, + num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='relu', optimizer_name='sgd', diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index c920297b2c5..46c47b824a7 100644 --- a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -14,7 +14,9 @@ def skeleton_layer_selu(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_selu_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, + num_nodes=1, + time_limit=10, + num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='selu', optimizer_name='sgd', diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index ddb7306630d..255d0363c33 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -14,7 +14,9 @@ def skeleton_layer_sigmoid(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_sigmoid_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, + num_nodes=1, + time_limit=10, + num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='sigmoid', optimizer_name='sgd', diff 
--git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index ef80a96ce84..f80b19f128b 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -14,7 +14,9 @@ def skeleton_layer_softmax(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_softmax_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, + num_nodes=1, + time_limit=10, + num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='softmax', optimizer_name='sgd', diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py index a06a6291ec8..b2bd086ebbc 100644 --- a/bamboo/unit_tests/test_unit_layer_softplus.py +++ b/bamboo/unit_tests/test_unit_layer_softplus.py @@ -14,6 +14,7 @@ def skeleton_layer_softplus(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_softplus_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='softplus', diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py index 321e2f4b9d3..6830ca4f12a 100644 --- a/bamboo/unit_tests/test_unit_layer_softsign.py +++ b/bamboo/unit_tests/test_unit_layer_softsign.py @@ -14,6 +14,7 @@ def skeleton_layer_softsign(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_softsign_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='softsign', diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py b/bamboo/unit_tests/test_unit_layer_squared_difference.py index 4991552a6ed..591ec7ba205 100644 --- a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -14,6 +14,7 @@ def skeleton_layer_squared_difference(cluster, executables, dir_name, compiler_n error_file_name = '%s/bamboo/unit_tests/error/layer_squared_difference_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='squared_difference', diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index 14857aab027..c4e9c4d3876 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -14,6 +14,7 @@ def skeleton_layer_tessellate(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_tessellate_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=1, + time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', 
model_name='tessellate', diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index 8ca64bba063..40350b0c088 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -14,7 +14,9 @@ def skeleton_layer_variance(cluster, executables, dir_name, compiler_name): error_file_name = '%s/bamboo/unit_tests/error/layer_variance_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], - num_nodes=1, num_processes=2, dir_name=dir_name, + num_nodes=1, + time_limit=10, + num_processes=2, dir_name=dir_name, data_reader_name='synthetic', model_folder='tests/layer_tests', model_name='variance', optimizer_name='sgd', From dbc31255e2aaf7f7461cd8e6e8f6f12525e3264b Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Wed, 7 Aug 2019 15:13:06 -0700 Subject: [PATCH 215/634] bug fixes for data_store_conduit for the mode: --preload_data_store. There were two problems. 1) the entire data set was being loaded, even if the data reader specified that only an absolute number, or precentage should be used 2) the validation set's schema was corrupted (see: data_store_conduit::copy_members); I don't know why, as this used to work; changes to the conduit repo, perhaps? To get data store preloading to work I had to refactor data_store::select_subset_of_data(). Formerly this method performed two functions: 1) resizing the shuffled_indices list 2) selecting a portion of the list for the validation set. To fix preloading (first problem) we need to perform (1), then call preload for the data store, then call (2). Hence, select_subset_of_data() was refactored int o two methods: resize_shuffled_indices() and select_subset_of_data() This refactor required touching every data reader: calls to select_subset_of_data() needed to be replaced by two calls: resize_shuffled_indices() select_subset_of_data() --- include/lbann/data_readers/data_reader.hpp | 27 ++++- src/data_readers/data_reader.cpp | 107 ++++++++---------- src/data_readers/data_reader_ascii.cpp | 1 + src/data_readers/data_reader_cifar10.cpp | 1 + src/data_readers/data_reader_csv.cpp | 1 + src/data_readers/data_reader_image.cpp | 20 ++-- src/data_readers/data_reader_jag.cpp | 2 +- src/data_readers/data_reader_jag_conduit.cpp | 30 +++-- .../data_reader_merge_features.cpp | 1 + .../data_reader_merge_samples.cpp | 1 + src/data_readers/data_reader_mesh.cpp | 1 + src/data_readers/data_reader_mnist.cpp | 1 + src/data_readers/data_reader_moving_mnist.cpp | 1 + src/data_readers/data_reader_multi_images.cpp | 2 +- .../data_reader_multihead_siamese.cpp | 2 +- src/data_readers/data_reader_numpy.cpp | 1 + src/data_readers/data_reader_numpy_npz.cpp | 1 + .../data_reader_numpy_npz_conduit.cpp | 22 ++-- .../data_reader_pilot2_molecular.cpp | 1 + src/data_readers/data_reader_python.cpp | 1 + src/data_readers/data_reader_synthetic.cpp | 1 + src/data_readers/data_reader_triplet.cpp | 2 +- src/data_store/data_store_conduit.cpp | 32 +++--- 23 files changed, 141 insertions(+), 118 deletions(-) diff --git a/include/lbann/data_readers/data_reader.hpp b/include/lbann/data_readers/data_reader.hpp index a20b6e49964..c4470b47081 100644 --- a/include/lbann/data_readers/data_reader.hpp +++ b/include/lbann/data_readers/data_reader.hpp @@ -550,9 +550,17 @@ class generic_data_reader { } /** - * Select the appropriate subset of data based on settings. 
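
The refactor in this patch turns the old single call into a fixed two-phase sequence that every reader's load() now follows: resize_shuffled_indices() shuffles and truncates the index list (honoring absolute_sample_count or percent_of_data_to_use) before any data store preloading, and select_subset_of_data() carves the validation_percent split off afterwards. The fraction kept in phase one reduces to the following standalone sketch, a hypothetical free-function form of get_percent_to_use(); note that the patch's `count / get_num_data()` appears to be an integer division (count is a size_t and get_num_data() returns an integer), so a floating-point cast like the one below is probably what was intended:

    #include <cstddef>
    #include <stdexcept>

    // Sketch of the fraction computed by get_percent_to_use():
    // exactly one of count / use_percent may be nonzero.
    double percent_to_use(std::size_t count, double use_percent,
                          std::size_t num_data) {
      if (count == 0 && use_percent == 0.0) {
        throw std::runtime_error("both count and use_percent are zero");
      }
      if (count != 0 && use_percent != 0.0) {
        throw std::runtime_error("both count and use_percent are nonzero");
      }
      if (count != 0) {
        if (count > num_data) {
          throw std::runtime_error("absolute_sample_count > num_data");
        }
        return static_cast<double>(count) / num_data;
      }
      // (use_percent * num_data) / num_data in the patch is just use_percent
      return use_percent;
    }

resize_shuffled_indices() then keeps the leading `percent_to_use * num_data` entries of the freshly shuffled index list, which is why the per-rank local_list_sizes for preloading are now computed from m_shuffled_indices.size() rather than from the full sample list.
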
+ * Optionally resizes the shuffled indices based on the data reader + * prototext settings: absolute_sample_count, percent_of_data_to_use. + * (dah - this was formerly part of select_subset_of_data) */ - virtual void select_subset_of_data(); + void resize_shuffled_indices(); + + /** + * Select the appropriate subset of data for the validation set based on + * the data reader prototext setting: validation_percent + */ + void select_subset_of_data(); /// called by select_subset_of_data() if data set is partitioned void select_subset_of_data_partitioned(); @@ -740,6 +748,12 @@ class generic_data_reader { */ double get_use_percent() const; + /** + * Returns the percent of the shuffled indices that are to be + * used. Code in this method was formerly in select_subset_of_data() + */ + double get_percent_to_use() const; + /** * Return the percent of the dataset to be used for validation. */ @@ -784,6 +798,11 @@ class generic_data_reader { return false; } + /// returns the percent of shuffled indices that are used; + /// the returned value depends on the values returned by + /// get_absolute_sample_count() and get_use_percent(). + double get_percent_to_use(); + /** * Called before fetch_datum/label/response to allow initialization. */ @@ -906,6 +925,10 @@ class generic_data_reader { /// but we're not retrieving a conduit::Node from the store. This typically occurs /// during the test phase bool m_issue_warning; + + /// throws exception if get_absolute_sample_count() and + /// get_use_percent() are incorrect + void error_check_counts() const; }; template diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index 87f824fe287..66294f21417 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -350,35 +350,30 @@ int generic_data_reader::get_next_position() const { } } -void generic_data_reader::select_subset_of_data_partitioned() { - - //sanity checks - if (get_absolute_sample_count()) { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: generic_data_reader - absolute_sample_count is not supported " - + "for partitioned data_set"); - } +void generic_data_reader::error_check_counts() const { + size_t count = get_absolute_sample_count(); double use_percent = get_use_percent(); - if (use_percent <= 0.0 || use_percent > 1.0) { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: generic_data_reader - percent_of_data_to_use must be > 0 " - + "and <= 1"); + if (count == 0 and use_percent == 0.0) { + LBANN_ERROR("get_use_percent() and get_absolute_sample_count() are both zero; exactly one must be zero"); } - if (! 
(m_partition_mode == 1 || m_partition_mode == 2)) { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: generic_data_reader - overlap mode must be 1 or 2\n" + if (!(count == 0 or use_percent == 0.0)) { + LBANN_ERROR("get_use_percent() and get_absolute_sample_count() are both non-zero; exactly one must be zero"); + } + if (m_is_partitioned && !(m_partition_mode == 1 || m_partition_mode == 2)) { + LBANN_ERROR("overlap mode must be 1 or 2\n" " 1 - share overlap data with one neighboring models;\n" " 2 - a set of overlap indices is common to (is shared by) all models"); } + if (count != 0) { + if(count > static_cast(get_num_data())) { + LBANN_ERROR("absolute_sample_count=" + + std::to_string(count) + " is > get_num_data=" + + std::to_string(get_num_data())); + } + } +} - shuffle_indices(); - - //optionally only use a portion of the data (useful during development - //and testing) - m_shuffled_indices.resize( get_use_percent() * m_shuffled_indices.size()); +void generic_data_reader::select_subset_of_data_partitioned() { std::vector common_pool; //case where there's an overlap set that is common to all models @@ -494,54 +489,43 @@ void generic_data_reader::select_subset_of_data_partitioned() { } } -void generic_data_reader::select_subset_of_data() { +double generic_data_reader::get_percent_to_use() { + error_check_counts(); + size_t count = get_absolute_sample_count(); + double use_percent = get_use_percent(); + double r = 0.; + + if (count != 0) { + r = count / get_num_data(); + } + + if (use_percent) { + r = (use_percent*get_num_data()) / get_num_data(); + } + + return r; +} + +void generic_data_reader::resize_shuffled_indices() { // ensure that all readers have the same number of indices if (m_jag_partitioned) { size_t n = m_comm->trainer_allreduce(m_shuffled_indices.size(), El::mpi::MIN); m_shuffled_indices.resize(n); } + double use_percent = get_percent_to_use(); + shuffle_indices(); + m_shuffled_indices.resize(use_percent * get_num_data()); +} + +void generic_data_reader::select_subset_of_data() { // optionally partition data set amongst the models if (m_is_partitioned) { select_subset_of_data_partitioned(); return ; } - shuffle_indices(); - - size_t count = get_absolute_sample_count(); - double use_percent = get_use_percent(); - if (count == 0 and use_percent == 0.0) { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: generic_data_reader::select_subset_of_data() get_use_percent() " - + "and get_absolute_sample_count() are both zero; exactly one " - + "must be zero"); - } - if (!(count == 0 or use_percent == 0.0)) { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: generic_data_reader::select_subset_of_data() get_use_percent() " - "and get_absolute_sample_count() are both non-zero; exactly one " - "must be zero"); - } - - if (count != 0) { - if(count > static_cast(get_num_data())) { - throw lbann_exception( - std::string{} + __FILE__ + " " + std::to_string(__LINE__) + - " :: generic_data_reader::select_subset_of_data() - absolute_sample_count=" + - std::to_string(count) + " is > get_num_data=" + - std::to_string(get_num_data())); - } - m_shuffled_indices.resize(get_absolute_sample_count()); - } - - if (use_percent) { - m_shuffled_indices.resize(get_use_percent()*get_num_data()); - } - - long unused = get_validation_percent()*get_num_data(); //get_num_data() = m_shuffled_indices.size() + long unused = get_validation_percent()*get_num_data(); long use_me = 
get_num_data() - unused; if (unused > 0) { m_unused_indices=std::vector(m_shuffled_indices.begin() + use_me, m_shuffled_indices.end()); @@ -742,11 +726,16 @@ void generic_data_reader::instantiate_data_store(const std::vector& local_l std::cout << "generic_data_reader::instantiate_data_store - Starting the preload" << std::endl; } if (local_list_sizes.size() != 0) { + if (is_master()) std::cout << "XX local_list_sizes.size() != 0\n"; m_data_store->build_preloaded_owner_map(local_list_sizes); } +else { + if (is_master()) std::cout << "XX local_list_sizes.size() == 0\n"; +} preload_data_store(); if(is_master()) { std::cout << "preload complete" << std::endl; + std::cout << "num loaded samples in P_0: " << m_data_store->get_data_size() << std::endl; } } diff --git a/src/data_readers/data_reader_ascii.cpp b/src/data_readers/data_reader_ascii.cpp index 854c6348861..8be6521b512 100644 --- a/src/data_readers/data_reader_ascii.cpp +++ b/src/data_readers/data_reader_ascii.cpp @@ -109,6 +109,7 @@ void ascii_reader::load() { std::cerr << "calling select_subset_of_data; m_shuffled_indices.size: " << m_shuffled_indices.size() << std::endl; } + resize_shuffled_indices(); select_subset_of_data(); } diff --git a/src/data_readers/data_reader_cifar10.cpp b/src/data_readers/data_reader_cifar10.cpp index fd8535a7c79..655d6fa6e49 100644 --- a/src/data_readers/data_reader_cifar10.cpp +++ b/src/data_readers/data_reader_cifar10.cpp @@ -120,6 +120,7 @@ void cifar10_reader::load() { m_shuffled_indices.clear(); m_shuffled_indices.resize(m_images.size()); std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); + resize_shuffled_indices(); select_subset_of_data(); } diff --git a/src/data_readers/data_reader_csv.cpp b/src/data_readers/data_reader_csv.cpp index 3b1b80b37fd..e7eccec0067 100644 --- a/src/data_readers/data_reader_csv.cpp +++ b/src/data_readers/data_reader_csv.cpp @@ -265,6 +265,7 @@ void csv_reader::load() { // Reset indices. 
diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp
index bff16dbd652..6f3d69af019 100644
--- a/src/data_readers/data_reader_image.cpp
+++ b/src/data_readers/data_reader_image.cpp
@@ -159,14 +159,17 @@ void image_data_reader::load() {
   }
   fclose(fplist);

-  // TODO: this will probably need to change after sample_list class
-  // is modified
-
+  // reset indices
+  m_shuffled_indices.clear();
+  m_shuffled_indices.resize(m_image_list.size());
+  std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
+  resize_shuffled_indices();
+
   std::vector<int> local_list_sizes;
   if (opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache")) {
     int np = m_comm->get_procs_per_trainer();
-    int base_files_per_rank = m_image_list.size() / np;
-    int extra = m_image_list.size() - (base_files_per_rank*np);
+    int base_files_per_rank = m_shuffled_indices.size() / np;
+    int extra = m_shuffled_indices.size() - (base_files_per_rank*np);
     if (extra > np) {
       LBANN_ERROR("extra > np");
     }
@@ -179,18 +182,12 @@ void image_data_reader::load() {
     }
   }

-  // reset indices
-  m_shuffled_indices.clear();
-  m_shuffled_indices.resize(m_image_list.size());
-  std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
-
   opts->set_option("node_sizes_vary", 1);
   instantiate_data_store(local_list_sizes);

   select_subset_of_data();
 }

-//void read_raw_data(const std::string &filename, std::vector<char> &data) {
 void read_raw_data(const std::string &filename, std::vector<char> &data) {
   data.clear();
   std::ifstream in(filename.c_str());
@@ -210,6 +207,7 @@ void image_data_reader::preload_data_store() {
   m_data_store->set_preload();

   conduit::Node node;
+  if (is_master()) std::cerr << "Starting image_data_reader::preload_data_store; num indices: " << m_shuffled_indices.size() << std::endl;
   int rank = m_comm->get_rank_in_trainer();
   for (size_t data_id=0; data_id<m_shuffled_indices.size(); data_id++) {
     if (m_data_store->get_index_owner(data_id) != rank) {
       continue;
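The base_files_per_rank / extra arithmetic in image_data_reader::load() (and, below, in the jag_conduit reader) is a plain block distribution with the remainder spread over the leading ranks; that is the reading of the loop assumed here, since the loop body is not shown in the hunk context. A worked example with illustrative numbers, not taken from the patch:

    // 10 samples across np = 4 ranks
    // base_files_per_rank = 10 / 4 = 2
    // extra = 10 - 2*4 = 2              (extra <= np, so no LBANN_ERROR)
    // local_list_sizes becomes {3, 3, 2, 2}: ranks 0 and 1 each take one extra sample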
diff --git a/src/data_readers/data_reader_jag.cpp b/src/data_readers/data_reader_jag.cpp
index d1d24e48578..19edae18f76 100644
--- a/src/data_readers/data_reader_jag.cpp
+++ b/src/data_readers/data_reader_jag.cpp
@@ -339,7 +339,7 @@ void data_reader_jag::load() {
   m_shuffled_indices.resize(num_samples);
   std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
-
+  resize_shuffled_indices();
   select_subset_of_data();
 }

diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp
index 56c1cd2ee52..687d8f27cf8 100644
--- a/src/data_readers/data_reader_jag_conduit.cpp
+++ b/src/data_readers/data_reader_jag_conduit.cpp
@@ -817,17 +817,6 @@ void data_reader_jag_conduit::load() {
     std::cout << "Done with data checking" << std::endl;
   }

-  // need to resize and init shuffled indices here, since it's needed in
-  // preload_data_store, which must be called before merging the sample lists
-  int sz = m_sample_list.size();
-  std::vector<int> local_list_sizes(m_comm->get_procs_per_trainer());
-  m_comm->trainer_all_gather(sz, local_list_sizes);
-
-  if(is_master()) {
-    std::cout << "We now have the proper size" << std::endl;
-  }
-
   /// Merge all of the sample lists
   m_sample_list.all_gather_packed_lists(*m_comm);
   if (opts->has_string("write_sample_list") && m_comm->am_trainer_master()) {
@@ -841,13 +830,32 @@ void data_reader_jag_conduit::load() {
     s << basename << "." << ext;
     m_sample_list.write(s.str());
   }
+
   m_shuffled_indices.resize(m_sample_list.size());
   std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
+  resize_shuffled_indices();

   if(is_master()) {
     std::cout << "Lists have been gathered" << std::endl;
   }

+  std::vector<int> local_list_sizes;
+  if (opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache")) {
+    int np = m_comm->get_procs_per_trainer();
+    int base_files_per_rank = m_shuffled_indices.size() / np;
+    int extra = m_shuffled_indices.size() - (base_files_per_rank*np);
+    if (extra > np) {
+      LBANN_ERROR("extra > np");
+    }
+    local_list_sizes.resize(np, 0);
+    for (int j=0; j<np; j++) {
+      local_list_sizes[j] = base_files_per_rank;
+      if (j < extra) {
+        local_list_sizes[j] += 1;
+      }
+    }
+  }

diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp b/src/data_readers/data_reader_numpy_npz_conduit.cpp
+  if (!opts->get_bool("preload_data_store") && opts->get_bool("use_data_store")) {
+    LBANN_WARNING("when not preloading you must specify the number of labels in the prototext file if you are doing classification");
+  }
@@ -119,16 +126,8 @@ void numpy_npz_conduit_reader::load() {
     }
   }

-  // Reset indices.
-  m_shuffled_indices.clear();
-  m_shuffled_indices.resize(m_num_samples);
-  std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
-
   instantiate_data_store(local_list_sizes);

-  // TODO: this may need fixing up for efficiency. If using an absolute
-  //       num samples, or percentage of samples, and we've preloaded,
-  //       this is wasteful and not what we want
   select_subset_of_data();
 }

@@ -326,11 +325,6 @@ void numpy_npz_conduit_reader::fill_in_metadata() {
   }
   in.close();

-  m_num_samples = m_filenames.size();
-  if (is_master()) {
-    std::cout << "num samples: " << m_num_samples << "\n";
-  }
-
   int data_id = 0; //meaningless
   conduit::Node node;
   numpy_conduit_converter::load_conduit_node(m_filenames[my_file], data_id, node);

diff --git a/src/data_readers/data_reader_pilot2_molecular.cpp b/src/data_readers/data_reader_pilot2_molecular.cpp
index e48a56610e1..b85d0f29642 100644
--- a/src/data_readers/data_reader_pilot2_molecular.cpp
+++ b/src/data_readers/data_reader_pilot2_molecular.cpp
@@ -115,6 +115,7 @@ void pilot2_molecular_reader::load() {
   m_shuffled_indices.clear();
   m_shuffled_indices.resize(m_num_samples);
   std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
+  resize_shuffled_indices();
   select_subset_of_data();
 }

diff --git a/src/data_readers/data_reader_python.cpp b/src/data_readers/data_reader_python.cpp
index 09cad0e1c0b..a0e7e73c858 100644
--- a/src/data_readers/data_reader_python.cpp
+++ b/src/data_readers/data_reader_python.cpp
@@ -319,6 +319,7 @@ def @init_func@():
 void python_reader::load() {
   m_shuffled_indices.resize(m_num_samples);
   std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
+  resize_shuffled_indices();
   select_subset_of_data();
 }

diff --git a/src/data_readers/data_reader_synthetic.cpp b/src/data_readers/data_reader_synthetic.cpp
index d8f3ea207d4..57ab4af3a4e 100644
--- a/src/data_readers/data_reader_synthetic.cpp
+++ b/src/data_readers/data_reader_synthetic.cpp
@@ -91,6 +91,7 @@ void data_reader_synthetic::load() {
   m_shuffled_indices.clear();
   m_shuffled_indices.resize(m_num_samples);
   std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
+  resize_shuffled_indices();
   select_subset_of_data();
 }

diff --git a/src/data_readers/data_reader_triplet.cpp b/src/data_readers/data_reader_triplet.cpp
index a6037cff2eb..bbe4bb0b32a 100644
--- a/src/data_readers/data_reader_triplet.cpp
+++ b/src/data_readers/data_reader_triplet.cpp
@@ -146,7 +146,7 @@ void data_reader_triplet::load() {
   m_shuffled_indices.resize(num_samples);
   std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0);
-
+  resize_shuffled_indices();
   select_subset_of_data();
 }
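On the preload path each of these readers ultimately hands the data store one conduit node per locally owned sample; the image reader above, for example, stores the raw file bytes plus the label under a zero-padded sample id. A condensed sketch of that per-sample handoff (names from the image reader diff; DATA_ID_STR is the padding macro from file_utils.hpp, and the exact set_char_ptr signature is an assumption here):

    std::vector<char> data;
    read_raw_data(filename, data);                  // slurp the image file into memory
    conduit::Node node;
    node[DATA_ID_STR(data_id) + "/label"] = label;
    node[DATA_ID_STR(data_id) + "/buffer"].set_char_ptr(data.data(), data.size());
    m_data_store->set_conduit_node(data_id, node);  // the data store owns the sample from here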
diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp
index 8a789eab464..b9b0e323e6b 100644
--- a/src/data_store/data_store_conduit.cpp
+++ b/src/data_store/data_store_conduit.cpp
@@ -183,15 +183,9 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std::vector<int>& ds_sample_move_list) {
   }

   if(ds_sample_move_list.size() == 0) {
-    if (m_trainer_master) {
-      std::cout << "data_store_conduit::copy_members; ds_sample_move_list.size = 0; copying all entries in m_data\n";
-    }
     m_data = rhs.m_data;
   } else {
   /// Move indices on the list from the data and owner maps in the RHS data store to the new data store
-    if (m_trainer_master) {
-      std::cout << "data_store_conduit::copy_members; ds_sample_move_list.size != 0; copying ONLY SOME entries in m_data\n";
-    }

     for(auto&& i : ds_sample_move_list) {
       if(rhs.m_data.find(i) != rhs.m_data.end()){
@@ -201,7 +195,18 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std::vector<int>& ds_sample_move_list) {
         if (!m_super_node) {
           /// Repack the nodes because they don't seem to copy correctly
-          compact_nodes();
+          //
+          //dah - previously this code block only contained the line:
+          //  build_node_for_sending(rhs.m_data[i]["data"], m_data[i]);
+          //However, this resulted in errors in the schema; not sure why,
+          //as it used to work; some change in the conduit library?
+          conduit::Node n2;
+          const std::vector<std::string> &names = rhs.m_data[i]["data"].child_names();
+          const std::vector<std::string> &names2 = rhs.m_data[i]["data"][names[0]].child_names();
+          for (auto t : names2) {
+            n2[names[0]][t] = rhs.m_data[i]["data"][names[0]][t];
+          }
+          build_node_for_sending(n2, m_data[i]);
         } else {
           m_data[i] = rhs.m_data[i];
         }
@@ -263,7 +268,7 @@ void data_store_conduit::setup(int mini_batch_size) {
     preload_local_cache();
   }

-  if (m_world_master && !m_preload) {
+  if (m_world_master) {
     std::cerr << "TIME for data_store_conduit setup: " << get_time() - tm1 << "\n";
   }
 }
@@ -825,18 +830,11 @@ void data_store_conduit::purge_unused_samples(const std::vector<int>& indices) {

 void data_store_conduit::compact_nodes() {
   if (m_super_node) {
-    if (m_output) {
-      m_output << "RETURNING from data_store_conduit::compact_nodes; m_data.size(): " << m_data.size() << "\n";
-    }
     return;
-  } else {
-    if (m_output) {
-      m_output << ">> NOT RETURNING from data_store_conduit::compact_nodes\n";
-    }
-  }
+  }
   for(auto&& j : *m_shuffled_indices) {
     if(m_data.find(j) != m_data.end()){
-      if(!m_data[j].is_contiguous()) {
+      if(! (m_data[j].is_contiguous() && m_data[j].is_compact()) ) {
         /// Repack the nodes because they don't seem to copy correctly
         conduit::Node node = m_data[j]["data"];
         m_data.erase(j);
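The compact_nodes() change above repacks a cached node unless it is both contiguous and compact; conduit tracks those as separate properties, so testing is_contiguous() alone can leave nodes that still copy badly. A sketch of the repack idiom, reusing the same helper the diff uses:

    if (!(m_data[j].is_contiguous() && m_data[j].is_compact())) {
      conduit::Node node = m_data[j]["data"];    // deep copy of the payload
      m_data.erase(j);
      build_node_for_sending(node, m_data[j]);   // rebuild it as one packed buffer
    }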
Benson" Date: Wed, 7 Aug 2019 17:16:07 -0700 Subject: [PATCH 216/634] make LBANN_ERROR and LBANN_WARNING variadic macros --- include/lbann/utils/cublas.hpp | 31 ++++++++++--------- include/lbann/utils/exception.hpp | 44 ++++++++++++--------------- src/proto/factories/model_factory.cpp | 44 +++++++++++++-------------- 3 files changed, 58 insertions(+), 61 deletions(-) diff --git a/include/lbann/utils/cublas.hpp b/include/lbann/utils/cublas.hpp index 49225ff2336..0122b8f697f 100644 --- a/include/lbann/utils/cublas.hpp +++ b/include/lbann/utils/cublas.hpp @@ -29,6 +29,7 @@ #include "lbann/base.hpp" #include "lbann/utils/cuda.hpp" +#include "lbann/utils/exception.hpp" #ifdef LBANN_HAS_CUDA #include @@ -44,8 +45,9 @@ const cublasStatus_t status_FORCE_CHECK_CUBLAS = (cublas_call); \ if (status_FORCE_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) { \ cudaDeviceReset(); \ - LBANN_ERROR(std::string("cuBLAS error: ") \ - + lbann::cublas::get_error_string(status_FORCE_CHECK_CUBLAS)); \ + LBANN_ERROR("cuBLAS error: ", \ + lbann::cublas::get_error_string( \ + status_FORCE_CHECK_CUBLAS)); \ } \ } \ { \ @@ -55,8 +57,8 @@ status_FORCE_CHECK_CUBLAS = cudaGetLastError(); \ if (status_FORCE_CHECK_CUBLAS != cudaSuccess) { \ cudaDeviceReset(); \ - LBANN_ERROR(std::string("CUDA error: ") \ - + cudaGetErrorString(status_FORCE_CHECK_CUBLAS)); \ + LBANN_ERROR("CUDA error: ", \ + cudaGetErrorString(status_FORCE_CHECK_CUBLAS)); \ } \ } \ } while (0) @@ -67,20 +69,19 @@ const cublasStatus_t status_FORCE_CHECK_CUBLAS = (cublas_call); \ if (status_FORCE_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) { \ cudaDeviceReset(); \ - LBANN_ERROR(std::string("cuBLAS error: ") \ - + lbann::cublas::get_error_string(status_FORCE_CHECK_CUBLAS)); \ + LBANN_ERROR("cuBLAS error: ", \ + lbann::cublas::get_error_string( \ + status_FORCE_CHECK_CUBLAS)); \ } \ } \ } while (0) -#define FORCE_CHECK_CUBLAS_SYNC(cuda_call) \ - do { \ - const cudaError_t cuda_status = cuda_call; \ - if (cuda_status != cudaSuccess) { \ - std::cerr << "CUDA error: " << cudaGetErrorString(cuda_status) << "\n"; \ - std::cerr << "Error at " << __FILE__ << ":" << __LINE__ << "\n"; \ - cudaDeviceReset(); \ - throw lbann::lbann_exception("CUDA error"); \ - } \ +#define FORCE_CHECK_CUBLAS_SYNC(cuda_call) \ + do { \ + const cudaError_t cuda_status = cuda_call; \ + if (cuda_status != cudaSuccess) { \ + cudaDeviceReset(); \ + LBANN_ERROR("CUDA error: ", cudaGetErrorString(cuda_status)); \ + } \ } while (0) #ifdef LBANN_DEBUG #define CHECK_CUBLAS(cublas_call) \ diff --git a/include/lbann/utils/exception.hpp b/include/lbann/utils/exception.hpp index ddfd4a39c33..da6b91acdec 100644 --- a/include/lbann/utils/exception.hpp +++ b/include/lbann/utils/exception.hpp @@ -28,39 +28,35 @@ #define LBANN_UTILS_EXCEPTION_HPP_INCLUDED #include "lbann/comm.hpp" + +#include #include #include -#include // Macro to throw an LBANN exception -#define LBANN_ERROR(message) \ +#define LBANN_ERROR(...) \ do { \ - std::stringstream ss_LBANN_ERROR; \ - ss_LBANN_ERROR << "LBANN error "; \ const int rank_LBANN_ERROR = lbann::get_rank_in_world(); \ - if (rank_LBANN_ERROR >= 0) { \ - ss_LBANN_ERROR << "on rank " << rank_LBANN_ERROR << " "; \ - } \ - ss_LBANN_ERROR << "(" << __FILE__ << ":" << __LINE__ << ")" \ - << ": " << (message); \ - throw lbann::exception(ss_LBANN_ERROR.str()); \ + throw lbann::exception( \ + lbann::build_string( \ + "LBANN error", \ + (rank_LBANN_ERROR >= 0 \ + ? 
" on rank " + std::to_string(rank_LBANN_ERROR) \ + : std::string()), \ + " (", __FILE__, ":", __LINE__, "): ", __VA_ARGS__)); \ } while (0) -#define LBANN_ERROR_STR(...) \ - LBANN_ERROR(build_string(__VA_ARGS__)) - // Macro to print a warning to standard error stream. -#define LBANN_WARNING(message) \ - do { \ - std::stringstream ss_LBANN_WARNING; \ - ss_LBANN_WARNING << "LBANN warning "; \ - const int rank_LBANN_WARNING = lbann::get_rank_in_world(); \ - if (rank_LBANN_WARNING >= 0) { \ - ss_LBANN_WARNING << "on rank " << rank_LBANN_WARNING << " "; \ - } \ - ss_LBANN_WARNING << "(" << __FILE__ << ":" << __LINE__ << ")" \ - << ": " << (message) << std::endl; \ - std::cerr << ss_LBANN_WARNING.str(); \ +#define LBANN_WARNING(...) \ + do { \ + const int rank_LBANN_WARNING = lbann::get_rank_in_world(); \ + std::cerr << lbann::build_string( \ + "LBANN warning", \ + (rank_LBANN_WARNING >= 0 \ + ? " on rank " + std::to_string(rank_LBANN_WARNING) \ + : std::string()), \ + " (", __FILE__, ":", __LINE__, "): ", __VA_ARGS__) \ + << std::endl; \ } while (0) namespace lbann { diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp index 43e88eed6f5..6aba400ad53 100644 --- a/src/proto/factories/model_factory.cpp +++ b/src/proto/factories/model_factory.cpp @@ -68,7 +68,7 @@ instantiate_model(lbann_comm* comm, } // Throw error if model type is not supported - LBANN_ERROR_STR("unknown model type (", type, ")"); + LBANN_ERROR("unknown model type (", type, ")"); return nullptr; } @@ -86,7 +86,7 @@ void assign_layers_to_objective_function( for (auto&& l : layer_list) { const auto& name = l->get_name(); if (names_to_layers.count(name) > 0) { - LBANN_ERROR_STR("layer name \"", name, "\" is not unique"); + LBANN_ERROR("layer name \"", name, "\" is not unique"); } names_to_layers[name] = l.get(); } @@ -101,9 +101,9 @@ void assign_layers_to_objective_function( const auto& params = proto_obj.layer_term(num_layer_terms-1); auto* l = names_to_layers[params.layer()]; if (l == nullptr) { - LBANN_ERROR_STR("attempted to set objective function layer term ", - "to correspond to layer \"", params.layer(), "\", ", - "but no such layer exists"); + LBANN_ERROR("attempted to set objective function layer term ", + "to correspond to layer \"", params.layer(), "\", ", + "but no such layer exists"); } term->set_layer(*l); } @@ -111,9 +111,9 @@ void assign_layers_to_objective_function( // Check that layer terms in objective function match prototext if (num_layer_terms != proto_obj.layer_term_size()) { - LBANN_ERROR_STR("recieved ", num_layer_terms, - " objective function layer terms, but there are ", - proto_obj.layer_term_size(), " in the prototext"); + LBANN_ERROR("recieved ", num_layer_terms, + " objective function layer terms, but there are ", + proto_obj.layer_term_size(), " in the prototext"); } } @@ -127,7 +127,7 @@ void assign_layers_to_metrics( for (auto&& l : layer_list) { const auto& name = l->get_name(); if (names_to_layers.count(name) > 0) { - LBANN_ERROR_STR("layer name \"", name, "\" is not unique"); + LBANN_ERROR("layer name \"", name, "\" is not unique"); } names_to_layers[name] = l.get(); } @@ -139,10 +139,10 @@ void assign_layers_to_metrics( const auto& params = proto_model.metric(i).layer_metric(); auto* l = names_to_layers[params.layer()]; if (l == nullptr) { - LBANN_ERROR_STR("attempted to set layer metric " - "\"", m->name(), "\" " - "to correspond to layer \"", params.layer(), "\", " - "but no such layer exists"); + LBANN_ERROR("attempted to set layer metric " + "\"", 
m->name(), "\" " + "to correspond to layer \"", params.layer(), "\", " + "but no such layer exists"); } m->set_layer(*l); } @@ -161,7 +161,7 @@ void assign_weights_to_layers( for (auto&& w : weights_list) { const auto& name = w->get_name(); if (names_to_weights.count(name) > 0) { - LBANN_ERROR_STR("weights name \"", name, "\" is not unique"); + LBANN_ERROR("weights name \"", name, "\" is not unique"); } names_to_weights[name] = w.get(); } @@ -174,10 +174,10 @@ void assign_weights_to_layers( for (auto&& name : parse_list(proto_layer.weights())) { auto&& w = names_to_weights[name]; if (!w) { - LBANN_ERROR_STR("could not find weights named " - "\"", name, "\", " - "which are expected by layer ", - layer_list[i]->get_name()); + LBANN_ERROR("could not find weights named " + "\"", name, "\", " + "which are expected by layer ", + layer_list[i]->get_name()); } if (is_frozen) { w->freeze(); @@ -204,7 +204,7 @@ void assign_weights_to_objective_function( for (auto&& w : weights_list) { const auto& name = w->get_name(); if (names_to_weights.count(name) > 0) { - LBANN_ERROR_STR("weights name \"", name, "\" is not unique"); + LBANN_ERROR("weights name \"", name, "\" is not unique"); } names_to_weights[name] = w.get(); } @@ -221,9 +221,9 @@ void assign_weights_to_objective_function( for (auto&& weights_name : parse_list(params.weights())) { auto&& w = names_to_weights[weights_name]; if (!w) { - LBANN_ERROR_STR("attempted to apply L2 weight regularization to " - "weights \"", weights_name, "\", " - "but no such weights exists"); + LBANN_ERROR("attempted to apply L2 weight regularization to " + "weights \"", weights_name, "\", " + "but no such weights exists"); } term_weights.push_back(w); } From 6a95c0e88e54e9273033cfaf0afe32b0fb3e8a0b Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Wed, 7 Aug 2019 17:31:36 -0700 Subject: [PATCH 217/634] Fix the tests prototexts --- .../layer_tests/model_channelwise_mean.prototext | 6 ++++-- model_zoo/tests/layer_tests/model_clamp.prototext | 6 ++++-- .../tests/layer_tests/model_covariance.prototext | 12 ++++++++---- model_zoo/tests/layer_tests/model_elu.prototext | 6 ++++-- model_zoo/tests/layer_tests/model_identity.prototext | 6 ++++-- model_zoo/tests/layer_tests/model_l1_norm.prototext | 6 ++++-- model_zoo/tests/layer_tests/model_l2_norm2.prototext | 6 ++++-- .../tests/layer_tests/model_leaky_relu.prototext | 6 ++++-- .../tests/layer_tests/model_log_sigmoid.prototext | 6 ++++-- .../tests/layer_tests/model_log_softmax.prototext | 6 ++++-- .../layer_tests/model_mean_absolute_error.prototext | 12 ++++++++---- model_zoo/tests/layer_tests/model_relu.prototext | 6 ++++-- model_zoo/tests/layer_tests/model_selu.prototext | 6 ++++-- model_zoo/tests/layer_tests/model_sigmoid.prototext | 6 ++++-- model_zoo/tests/layer_tests/model_softmax.prototext | 6 ++++-- model_zoo/tests/layer_tests/model_softplus.prototext | 6 ++++-- model_zoo/tests/layer_tests/model_softsign.prototext | 6 ++++-- .../layer_tests/model_squared_difference.prototext | 12 ++++++++---- .../tests/layer_tests/model_tessellate.prototext | 12 ++++++++---- model_zoo/tests/layer_tests/model_variance.prototext | 6 ++++-- model_zoo/tests/model_mnist_conv_graph.prototext | 8 ++++++-- 21 files changed, 102 insertions(+), 50 deletions(-) diff --git a/model_zoo/tests/layer_tests/model_channelwise_mean.prototext b/model_zoo/tests/layer_tests/model_channelwise_mean.prototext index d530e311f1b..60ca2691a49 100644 --- a/model_zoo/tests/layer_tests/model_channelwise_mean.prototext +++ b/model_zoo/tests/layer_tests/model_channelwise_mean.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "1.2 1 0.8 3.3 -0.2 -0.1 -0.9 -1.1 -2 -1.3 0.3 -1" + initializer { + value_initializer { + values: "1.2 1 0.8 3.3 -0.2 -0.1 -0.9 -1.1 -2 -1.3 0.3 -1" + } } } diff --git a/model_zoo/tests/layer_tests/model_clamp.prototext b/model_zoo/tests/layer_tests/model_clamp.prototext index b02fd5919ec..afac7aba8f6 100644 --- a/model_zoo/tests/layer_tests/model_clamp.prototext +++ b/model_zoo/tests/layer_tests/model_clamp.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-2 -0.25 0.25 0.5 2" + initializer { + value_initializer { + values: "-2 -0.25 0.25 0.5 2" + } } } diff --git a/model_zoo/tests/layer_tests/model_covariance.prototext b/model_zoo/tests/layer_tests/model_covariance.prototext index 1324f945ec8..e92580370b0 100644 --- a/model_zoo/tests/layer_tests/model_covariance.prototext +++ b/model_zoo/tests/layer_tests/model_covariance.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x0_vals" - value_initializer { - values: "1 -0.5 0.25 -0.125 0.0675" + initializer { + value_initializer { + values: "1 -0.5 0.25 -0.125 0.0675" + } } } layer { @@ -77,8 +79,10 @@ model { } weights { name: "x1_vals" - value_initializer { - values: "0.1 0.2 0.4 0.8 1.6" + initializer { + value_initializer { + values: "0.1 0.2 0.4 0.8 1.6" + } } } diff --git a/model_zoo/tests/layer_tests/model_elu.prototext b/model_zoo/tests/layer_tests/model_elu.prototext index ce20c7cb110..aa03e13d47a 100644 --- a/model_zoo/tests/layer_tests/model_elu.prototext +++ b/model_zoo/tests/layer_tests/model_elu.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-2 -0.25 0.25 0.5 1" + initializer { + 
value_initializer { + values: "-2 -0.25 0.25 0.5 1" + } } } diff --git a/model_zoo/tests/layer_tests/model_identity.prototext b/model_zoo/tests/layer_tests/model_identity.prototext index 98eb617f70e..89d153d7feb 100644 --- a/model_zoo/tests/layer_tests/model_identity.prototext +++ b/model_zoo/tests/layer_tests/model_identity.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-1.5 -0.25 0 0.5 1" + initializer { + value_initializer { + values: "-1.5 -0.25 0 0.5 1" + } } } diff --git a/model_zoo/tests/layer_tests/model_l1_norm.prototext b/model_zoo/tests/layer_tests/model_l1_norm.prototext index 9192a686411..b40f293b207 100644 --- a/model_zoo/tests/layer_tests/model_l1_norm.prototext +++ b/model_zoo/tests/layer_tests/model_l1_norm.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "1 -0.5 0.25 -0.125 0.125" + initializer { + value_initializer { + values: "1 -0.5 0.25 -0.125 0.125" + } } } diff --git a/model_zoo/tests/layer_tests/model_l2_norm2.prototext b/model_zoo/tests/layer_tests/model_l2_norm2.prototext index 07c72d2ef85..7887c860609 100644 --- a/model_zoo/tests/layer_tests/model_l2_norm2.prototext +++ b/model_zoo/tests/layer_tests/model_l2_norm2.prototext @@ -62,8 +62,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "0 1 -0.5 0.5 -1" + initializer { + value_initializer { + values: "0 1 -0.5 0.5 -1" + } } } diff --git a/model_zoo/tests/layer_tests/model_leaky_relu.prototext b/model_zoo/tests/layer_tests/model_leaky_relu.prototext index cc6473695cb..c07342ecff4 100644 --- a/model_zoo/tests/layer_tests/model_leaky_relu.prototext +++ b/model_zoo/tests/layer_tests/model_leaky_relu.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-2 -1 -0.25 0.25 0.5" + initializer { + value_initializer { + values: "-2 -1 -0.25 0.25 0.5" + } } } diff --git a/model_zoo/tests/layer_tests/model_log_sigmoid.prototext b/model_zoo/tests/layer_tests/model_log_sigmoid.prototext index b3e58f7fd15..fa94c64873b 100644 --- a/model_zoo/tests/layer_tests/model_log_sigmoid.prototext +++ b/model_zoo/tests/layer_tests/model_log_sigmoid.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-1 -0.25 0 0.5 2" + initializer { + value_initializer { + values: "-1 -0.25 0 0.5 2" + } } } diff --git a/model_zoo/tests/layer_tests/model_log_softmax.prototext b/model_zoo/tests/layer_tests/model_log_softmax.prototext index 12555305705..e19aab9c01e 100644 --- a/model_zoo/tests/layer_tests/model_log_softmax.prototext +++ b/model_zoo/tests/layer_tests/model_log_softmax.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-4 -2 0 1 2" + initializer { + value_initializer { + values: "-4 -2 0 1 2" + } } } diff --git a/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext b/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext index beda327e807..27ac1f4855f 100644 --- a/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext +++ b/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x0_vals" - value_initializer { - values: "1 -0.5 0.25 -0.125 0.125" + initializer { + value_initializer { + values: "1 -0.5 0.25 -0.125 0.125" + } } } layer { @@ -77,8 +79,10 @@ model { } weights { name: "x1_vals" - value_initializer { - values: "1.5 0 -1 -0.125 -0.125" + initializer { + value_initializer { + values: "1.5 0 -1 -0.125 
-0.125" + } } } diff --git a/model_zoo/tests/layer_tests/model_relu.prototext b/model_zoo/tests/layer_tests/model_relu.prototext index db91a7ba590..ba8ce807f98 100644 --- a/model_zoo/tests/layer_tests/model_relu.prototext +++ b/model_zoo/tests/layer_tests/model_relu.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-1.5 -0.25 0.25 0.5 1" + initializer { + value_initializer { + values: "-1.5 -0.25 0.25 0.5 1" + } } } diff --git a/model_zoo/tests/layer_tests/model_selu.prototext b/model_zoo/tests/layer_tests/model_selu.prototext index 9e98a04ea17..c4b23b221b4 100644 --- a/model_zoo/tests/layer_tests/model_selu.prototext +++ b/model_zoo/tests/layer_tests/model_selu.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-2 -0.25 0.25 0.5 1" + initializer { + value_initializer { + values: "-2 -0.25 0.25 0.5 1" + } } } diff --git a/model_zoo/tests/layer_tests/model_sigmoid.prototext b/model_zoo/tests/layer_tests/model_sigmoid.prototext index 989c1fb4c5c..055d094885c 100644 --- a/model_zoo/tests/layer_tests/model_sigmoid.prototext +++ b/model_zoo/tests/layer_tests/model_sigmoid.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-200 -0.25 0 0.5 100" + initializer { + value_initializer { + values: "-200 -0.25 0 0.5 100" + } } } diff --git a/model_zoo/tests/layer_tests/model_softmax.prototext b/model_zoo/tests/layer_tests/model_softmax.prototext index b231ff7d179..c20d7cc2a2d 100644 --- a/model_zoo/tests/layer_tests/model_softmax.prototext +++ b/model_zoo/tests/layer_tests/model_softmax.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-4 -2 0 1 2" + initializer { + value_initializer { + values: "-4 -2 0 1 2" + } } } diff --git a/model_zoo/tests/layer_tests/model_softplus.prototext b/model_zoo/tests/layer_tests/model_softplus.prototext index fc4d06823b3..19eb004df3e 100644 --- a/model_zoo/tests/layer_tests/model_softplus.prototext +++ b/model_zoo/tests/layer_tests/model_softplus.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-2 -0.25 0 0.5 1" + initializer { + value_initializer { + values: "-2 -0.25 0 0.5 1" + } } } diff --git a/model_zoo/tests/layer_tests/model_softsign.prototext b/model_zoo/tests/layer_tests/model_softsign.prototext index 55e4e89cfc9..4d14d92e7b5 100644 --- a/model_zoo/tests/layer_tests/model_softsign.prototext +++ b/model_zoo/tests/layer_tests/model_softsign.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "-200 -0.25 0 0.5 100" + initializer { + value_initializer { + values: "-200 -0.25 0 0.5 100" + } } } diff --git a/model_zoo/tests/layer_tests/model_squared_difference.prototext b/model_zoo/tests/layer_tests/model_squared_difference.prototext index 87b8a14c7c7..87846bd21c1 100644 --- a/model_zoo/tests/layer_tests/model_squared_difference.prototext +++ b/model_zoo/tests/layer_tests/model_squared_difference.prototext @@ -60,8 +60,10 @@ model { } weights { name: "x0_vals" - value_initializer { - values: "1 -0.5 0.25 -0.125 0.125" + initializer { + value_initializer { + values: "1 -0.5 0.25 -0.125 0.125" + } } } layer { @@ -74,8 +76,10 @@ model { } weights { name: "x1_vals" - value_initializer { - values: "1.5 0 -1 -0.125 -0.125" + initializer { + value_initializer { + values: "1.5 0 -1 -0.125 -0.125" + } } } diff --git a/model_zoo/tests/layer_tests/model_tessellate.prototext 
b/model_zoo/tests/layer_tests/model_tessellate.prototext index 11440379413..2848f80949a 100644 --- a/model_zoo/tests/layer_tests/model_tessellate.prototext +++ b/model_zoo/tests/layer_tests/model_tessellate.prototext @@ -59,8 +59,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "0.4 0.6 -0.5" + initializer { + value_initializer { + values: "0.4 0.6 -0.5" + } } } @@ -96,8 +98,10 @@ model { } weights { name: "scales_vals" - value_initializer { - values: "1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5" + initializer { + value_initializer { + values: "1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5" + } } optimizer {} # No optimizer } diff --git a/model_zoo/tests/layer_tests/model_variance.prototext b/model_zoo/tests/layer_tests/model_variance.prototext index 33d0ac06373..096ef81a182 100644 --- a/model_zoo/tests/layer_tests/model_variance.prototext +++ b/model_zoo/tests/layer_tests/model_variance.prototext @@ -63,8 +63,10 @@ model { } weights { name: "x_vals" - value_initializer { - values: "1 -0.5 0.25 -0.125 0.0675" + initializer { + value_initializer { + values: "1 -0.5 0.25 -0.125 0.0675" + } } } diff --git a/model_zoo/tests/model_mnist_conv_graph.prototext b/model_zoo/tests/model_mnist_conv_graph.prototext index 21e5b210d53..23552b26a6e 100644 --- a/model_zoo/tests/model_mnist_conv_graph.prototext +++ b/model_zoo/tests/model_mnist_conv_graph.prototext @@ -143,11 +143,15 @@ model { } weights { name: "branch3_conv_kernel" - glorot_uniform_initializer {} + initializer { + glorot_uniform_initializer {} + } } weights { name: "branch3_conv_bias" - constant_initializer {} + initializer { + constant_initializer {} + } } layer { parents: "branch3_slice" From edeae3efe939d39e9467f47e1ea74f00c0c88ad1 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 8 Aug 2019 09:58:40 -0700 Subject: [PATCH 218/634] Test clean up --- bamboo/common_python/tools.py | 2 +- .../test_integration_autoencoders.py | 2 +- .../test_integration_debug.py | 4 +- .../test_integration_performance.py | 6 +-- .../test_unit_check_proto_models.py | 2 +- bamboo/unit_tests/test_unit_checkpoint.py | 2 +- bamboo/unit_tests/test_unit_layer_clamp.py | 2 +- .../unit_tests/test_unit_layer_covariance.py | 2 +- bamboo/unit_tests/test_unit_layer_elu.py | 2 +- bamboo/unit_tests/test_unit_layer_identity.py | 2 +- bamboo/unit_tests/test_unit_layer_l1_norm.py | 2 +- bamboo/unit_tests/test_unit_layer_l2_norm2.py | 2 +- .../unit_tests/test_unit_layer_leaky_relu.py | 2 +- .../unit_tests/test_unit_layer_log_sigmoid.py | 2 +- .../unit_tests/test_unit_layer_log_softmax.py | 2 +- .../test_unit_layer_mean_absolute_error.py | 2 +- bamboo/unit_tests/test_unit_layer_relu.py | 2 +- bamboo/unit_tests/test_unit_layer_selu.py | 2 +- bamboo/unit_tests/test_unit_layer_sigmoid.py | 2 +- bamboo/unit_tests/test_unit_layer_softmax.py | 2 +- bamboo/unit_tests/test_unit_layer_softplus.py | 2 +- bamboo/unit_tests/test_unit_layer_softsign.py | 2 +- .../test_unit_layer_squared_difference.py | 2 +- .../unit_tests/test_unit_layer_tessellate.py | 2 +- bamboo/unit_tests/test_unit_layer_variance.py | 2 +- bamboo/unit_tests/test_unit_lbann2_reload.py | 2 +- .../unit_tests/test_unit_lbann_invocation.py | 49 ++++++++++++++++--- .../unit_tests/test_unit_mnist_conv_graph.py | 2 +- .../test_unit_mnist_ridge_regression.py | 2 +- .../test_unit_mnist_softmax_classifier.py | 2 +- .../test_unit_reconstruction_loss.py | 2 +- 31 files changed, 74 insertions(+), 41 
deletions(-) diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 8ea802b1ece..f2bd74712d4 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -391,11 +391,11 @@ def get_command(cluster, option_processes_per_model = ' --procs_per_model=%d' % processes_per_model if ckpt_dir is not None: option_ckpt_dir = ' --ckpt_dir=%s' % ckpt_dir + extra_options = '' if extra_lbann_flags is not None: # If extra_lbann_flags is not a dict, then we have already appended # this error to lbann_errors. if isinstance(extra_lbann_flags, dict): - extra_options = '' # See `lbann --help` or src/proto/proto_common.cpp allowed_flags = [ # 'model', diff --git a/bamboo/integration_tests/test_integration_autoencoders.py b/bamboo/integration_tests/test_integration_autoencoders.py index 25b02387f73..8d1b0c2216b 100644 --- a/bamboo/integration_tests/test_integration_autoencoders.py +++ b/bamboo/integration_tests/test_integration_autoencoders.py @@ -88,7 +88,7 @@ def test_integration_autoencoder_imagenet_intel19(cluster, dirname, exes, skeleton_autoencoder_imagenet(cluster, dirname, exes, 'intel19', weekly) -# Run with python -m pytest -s test_integration_autoencoder.py -k 'test_integration_autoencoder_imagenet_exe' --exe= +# Run with python3 -m pytest -s test_integration_autoencoder.py -k 'test_integration_autoencoder_imagenet_exe' --exe= def test_integration_autoencoder_imagenet_exe(cluster, dirname, exe): if exe is None: e = 'test_integration_autoencoder_imagenet_exe: Non-local testing' diff --git a/bamboo/integration_tests/test_integration_debug.py b/bamboo/integration_tests/test_integration_debug.py index cca69a66ff0..0d4d57b9701 100644 --- a/bamboo/integration_tests/test_integration_debug.py +++ b/bamboo/integration_tests/test_integration_debug.py @@ -84,7 +84,7 @@ def test_integration_cifar_intel19_debug(cluster, dirname, exes, weekly, debug_b skeleton_cifar_debug(cluster, dirname, exes, 'intel19_debug', weekly, debug_build) -# Run with python -m pytest -s test_integration_debug.py -k 'test_integration_mnist_exe' --exe= +# Run with python3 -m pytest -s test_integration_debug.py -k 'test_integration_mnist_exe' --exe= def test_integration_mnist_exe(cluster, dirname, exe): if exe is None: e = 'test_integration_mnist_exe: Non-local testing' @@ -94,7 +94,7 @@ def test_integration_mnist_exe(cluster, dirname, exe): skeleton_mnist_debug(cluster, dirname, exes, 'exe', True, True) -# Run with python -m pytest -s test_integration_debug.py -k 'test_integration_cifar_exe' --exe= +# Run with python3 -m pytest -s test_integration_debug.py -k 'test_integration_cifar_exe' --exe= def test_integration_cifar_exe(cluster, dirname, exe): if exe == None: e = 'test_integration_cifar_exe: Non-local testing' diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index 1a77589732b..31044b4dafd 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -216,7 +216,7 @@ def test_integration_performance_full_alexnet_intel19(cluster, dirname, exes, run) -# Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_lenet_mnist_exe' --exe= +# Run with python3 -m pytest -s test_integration_performance.py -k 'test_integration_performance_lenet_mnist_exe' --exe= def test_integration_performance_lenet_mnist_exe(cluster, dirname, exe): if exe is None: e = 'test_integration_performance_lenet_mnist_exe: Non-local 
testing' @@ -226,7 +226,7 @@ def test_integration_performance_lenet_mnist_exe(cluster, dirname, exe): skeleton_performance_lenet_mnist(cluster, dirname, exes, 'exe') -# Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_alexnet_exe' --exe= +# Run with python3 -m pytest -s test_integration_performance.py -k 'test_integration_performance_alexnet_exe' --exe= def test_integration_performance_alexnet_exe(cluster, dirname, exe): if exe is None: e = 'stest_integration_performance_alexnet_exe: Non-local testing' @@ -236,7 +236,7 @@ def test_integration_performance_alexnet_exe(cluster, dirname, exe): skeleton_performance_alexnet(cluster, dirname, exes, 'exe', True) -# Run with python -m pytest -s test_integration_performance.py -k 'test_integration_performance_full_alexnet_exe' --exe= +# Run with python3 -m pytest -s test_integration_performance.py -k 'test_integration_performance_full_alexnet_exe' --exe= def test_integration_performance_full_alexnet_exe(cluster, dirname, exe): if exe is None: e = 'test_integration_performance_full_alexnet_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_check_proto_models.py b/bamboo/unit_tests/test_unit_check_proto_models.py index 431449dc960..4c66c2ae8de 100644 --- a/bamboo/unit_tests/test_unit_check_proto_models.py +++ b/bamboo/unit_tests/test_unit_check_proto_models.py @@ -130,7 +130,7 @@ def test_unit_models_intel19(cluster, dirname, exes): skeleton_models(cluster, dirname, exes, 'intel19') -# Run with python -m pytest -s test_unit_check_proto_models.py -k 'test_unit_models_exe' --exe= +# Run with python3 -m pytest -s test_unit_check_proto_models.py -k 'test_unit_models_exe' --exe= def test_unit_models_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_models_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index 4d44d02348a..a0e06de14cf 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -140,7 +140,7 @@ def test_unit_checkpoint_lenet_intel19(cluster, exes, dirname): skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_exe' --exe= +# Run with python3 -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_exe' --exe= def test_unit_checkpoint_lenet_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_checkpoint_lenet_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py b/bamboo/unit_tests/test_unit_layer_clamp.py index 3b9cb25593e..73a4a48a87d 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -36,7 +36,7 @@ def test_unit_layer_clamp_intel19(cluster, exes, dirname): skeleton_layer_clamp(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_layer_clamp.py -k 'test_unit_layer_clamp_exe' --exe= +# Run with python3 -m pytest -s test_unit_layer_clamp.py -k 'test_unit_layer_clamp_exe' --exe= def test_unit_layer_clamp_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_clamp_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py index 1bf45a4ac5c..8e6450495cc 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -36,7 +36,7 @@ def test_unit_layer_covariance_intel19(cluster, exes, dirname): 
skeleton_layer_covariance(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_covariance_exe' --exe= +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_covariance_exe' --exe= def test_unit_layer_covariance_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_covariance_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index b7d7a969c9a..06e50790d0a 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -36,7 +36,7 @@ def test_unit_layer_elu_intel19(cluster, exes, dirname): skeleton_layer_elu(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_layer_elu.py -k 'test_unit_layer_elu_exe' --exe= +# Run with python3 -m pytest -s test_unit_layer_elu.py -k 'test_unit_layer_elu_exe' --exe= def test_unit_layer_elu_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_elu_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py index ec1b58c9b14..f1695ff6dda 100644 --- a/bamboo/unit_tests/test_unit_layer_identity.py +++ b/bamboo/unit_tests/test_unit_layer_identity.py @@ -36,7 +36,7 @@ def test_unit_layer_identity_intel19(cluster, exes, dirname): skeleton_layer_identity(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_layer_identity.py -k 'test_unit_layer_identity_exe' --exe= +# Run with python3 -m pytest -s test_unit_layer_identity.py -k 'test_unit_layer_identity_exe' --exe= def test_unit_layer_identity_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_identity_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index b334cc10011..1635895bfe1 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -36,7 +36,7 @@ def test_unit_layer_l1_norm_intel19(cluster, exes, dirname): skeleton_layer_l1_norm(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l1_norm_exe' --exe= +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l1_norm_exe' --exe= def test_unit_layer_l1_norm_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_l1_norm_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py b/bamboo/unit_tests/test_unit_layer_l2_norm2.py index 76c6a5adcc7..b4d5eda45a5 100644 --- a/bamboo/unit_tests/test_unit_layer_l2_norm2.py +++ b/bamboo/unit_tests/test_unit_layer_l2_norm2.py @@ -35,7 +35,7 @@ def test_unit_layer_l2_norm2_intel19(cluster, exes, dirname): skeleton_layer_l2_norm2(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l2_norm2_exe' --exe= +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l2_norm2_exe' --exe= def test_unit_layer_l2_norm2_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_l2_norm2_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index 66ad045cccc..76551e0168d 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -36,7 +36,7 @@ def test_unit_layer_leaky_relu_intel19(cluster, exes, dirname): 
skeleton_layer_leaky_relu(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_leaky_relu_exe' --exe= +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_leaky_relu_exe' --exe= def test_unit_layer_leaky_relu_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_leaky_relu_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index 750c089833a..933a080a84e 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -36,7 +36,7 @@ def test_unit_layer_log_sigmoid_intel19(cluster, exes, dirname): skeleton_layer_log_sigmoid(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_layer_log_sigmoid.py -k 'test_unit_layer_log_sigmoid_exe' --exe= +# Run with python3 -m pytest -s test_unit_layer_log_sigmoid.py -k 'test_unit_layer_log_sigmoid_exe' --exe= def test_unit_layer_log_sigmoid_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_log_sigmoid_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 75a4a6b7bbd..cc89c82cb17 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -37,7 +37,7 @@ def test_unit_layer_log_softmax_intel19(cluster, exes, dirname): skeleton_layer_log_softmax(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_log_softmax_exe' --exe= +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_log_softmax_exe' --exe= def test_unit_layer_log_softmax_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_log_softmax_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index 562b906d597..a04207d1a86 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -36,7 +36,7 @@ def test_unit_layer_mean_absolute_error_intel19(cluster, exes, dirname): skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_mean_absolute_error_exe' --exe= +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_mean_absolute_error_exe' --exe= def test_unit_layer_mean_absolute_error_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_mean_absolute_error_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index 1e6212fec18..4bd05c82f52 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -37,7 +37,7 @@ def test_unit_layer_relu_intel19(cluster, exes, dirname): skeleton_layer_relu(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_layer_relu.py -k 'test_unit_layer_relu_exe' --exe= +# Run with python3 -m pytest -s test_unit_layer_relu.py -k 'test_unit_layer_relu_exe' --exe= def test_unit_layer_relu_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_relu_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index 46c47b824a7..77be41d2283 100644 --- 
a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -37,7 +37,7 @@ def test_unit_layer_selu_intel19(cluster, exes, dirname): skeleton_layer_selu(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_layer_selu.py -k 'test_unit_layer_selu_exe' --exe= +# Run with python3 -m pytest -s test_unit_layer_selu.py -k 'test_unit_layer_selu_exe' --exe= def test_unit_layer_selu_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_selu_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index 255d0363c33..d8143a558d4 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -37,7 +37,7 @@ def test_unit_layer_sigmoid_intel19(cluster, exes, dirname): skeleton_layer_sigmoid(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_layer_sigmoid.py -k 'test_unit_layer_sigmoid_exe' --exe= +# Run with python3 -m pytest -s test_unit_layer_sigmoid.py -k 'test_unit_layer_sigmoid_exe' --exe= def test_unit_layer_sigmoid_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_sigmoid_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index f80b19f128b..4ba2b8561bc 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -37,7 +37,7 @@ def test_unit_layer_softmax_intel19(cluster, exes, dirname): skeleton_layer_softmax(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_softmax_exe' --exe= +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_softmax_exe' --exe= def test_unit_layer_softmax_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_softmax_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py index b2bd086ebbc..362b261f27f 100644 --- a/bamboo/unit_tests/test_unit_layer_softplus.py +++ b/bamboo/unit_tests/test_unit_layer_softplus.py @@ -36,7 +36,7 @@ def test_unit_layer_softplus_intel19(cluster, exes, dirname): skeleton_layer_softplus(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_layer_softplus.py -k 'test_unit_layer_softplus_exe' --exe= +# Run with python3 -m pytest -s test_unit_layer_softplus.py -k 'test_unit_layer_softplus_exe' --exe= def test_unit_layer_softplus_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_softplus_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py index 6830ca4f12a..1e262807486 100644 --- a/bamboo/unit_tests/test_unit_layer_softsign.py +++ b/bamboo/unit_tests/test_unit_layer_softsign.py @@ -40,7 +40,7 @@ def test_unit_layer_softsign_intel19(cluster, exes, dirname): skeleton_layer_softsign(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_layer_softsign.py -k 'test_unit_layer_softsign_exe' --exe= +# Run with python3 -m pytest -s test_unit_layer_softsign.py -k 'test_unit_layer_softsign_exe' --exe= def test_unit_layer_softsign_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_softsign_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py b/bamboo/unit_tests/test_unit_layer_squared_difference.py index 591ec7ba205..f6deacdea6f 100644 --- 
a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -36,7 +36,7 @@ def test_unit_layer_squared_difference_intel19(cluster, exes, dirname): skeleton_layer_squared_difference(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_layer_squared_difference.py -k 'test_unit_layer_squared_difference_exe' --exe= +# Run with python3 -m pytest -s test_unit_layer_squared_difference.py -k 'test_unit_layer_squared_difference_exe' --exe= def test_unit_layer_squared_difference_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_squared_difference_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index c4e9c4d3876..024ebab761a 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -36,7 +36,7 @@ def test_unit_layer_tessellate_intel19(cluster, exes, dirname): skeleton_layer_tessellate(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_layer_tessellate.py -k 'test_unit_layer_tessellate_exe' --exe= +# Run with python3 -m pytest -s test_unit_layer_tessellate.py -k 'test_unit_layer_tessellate_exe' --exe= def test_unit_layer_tessellate_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_tessellate_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index 40350b0c088..20af21d60e1 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -37,7 +37,7 @@ def test_unit_layer_variance_intel19(cluster, exes, dirname): skeleton_layer_variance(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_variance_exe' --exe= +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_variance_exe' --exe= def test_unit_layer_variance_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_layer_variance_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index 7a2307fa92b..7fef676bccc 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -134,7 +134,7 @@ def test_unit_lbann2_reload_intel19(cluster, exes, dirname): skeleton_lbann2_reload(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_lbann2_reload.py -k 'test_unit_lbann2_reload_exe' --exe= +# Run with python3 -m pytest -s test_unit_lbann2_reload.py -k 'test_unit_lbann2_reload_exe' --exe= def test_unit_lbann2_reload_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_lbann2_reload_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_lbann_invocation.py b/bamboo/unit_tests/test_unit_lbann_invocation.py index 8ff69b3cd84..5bde732c262 100644 --- a/bamboo/unit_tests/test_unit_lbann_invocation.py +++ b/bamboo/unit_tests/test_unit_lbann_invocation.py @@ -3,8 +3,13 @@ import tools import os, sys + +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_no_params_bad' --exes= def test_unit_no_params_bad(cluster, exes): - exe = exes['gcc7'] + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes sys.stderr.write('TESTING: run lbann with no params; lbann should throw exception\n') command = tools.get_command( cluster=cluster, executable=exe, exit_after_setup=True) @@ 
-12,8 +17,12 @@ def test_unit_no_params_bad(cluster, exes): assert return_code != 0 +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_one_model_bad' --exes= def test_unit_one_model_bad(cluster, exes): - exe = exes['gcc7'] + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes sys.stderr.write('TESTING: run lbann with no optimizer or reader; lbann should throw exception\n') model_path = 'prototext/model_mnist_simple_1.prototext' command = tools.get_command( @@ -23,8 +32,12 @@ def test_unit_one_model_bad(cluster, exes): assert return_code != 0 +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_two_models_bad' --exes= def test_unit_two_models_bad(cluster, exes): - exe = exes['gcc7'] + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes sys.stderr.write('TESTING: run lbann with two models but no optimizer or reader; lbann should throw exception\n') model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' command = tools.get_command( @@ -34,8 +47,12 @@ def test_unit_two_models_bad(cluster, exes): assert return_code != 0 +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_two_models_bad2' --exes= def test_unit_two_models_bad2(cluster, exes): - exe = exes['gcc7'] + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes sys.stderr.write('TESTING: run lbann with two models with missing {; lbann should throw exception\n') model_path='prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' command = tools.get_command( @@ -45,8 +62,12 @@ def test_unit_two_models_bad2(cluster, exes): assert return_code != 0 +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_missing_optimizer' --exes= def test_unit_missing_optimizer(cluster, exes): - exe = exes['gcc7'] + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes sys.stderr.write('TESTING: run lbann with two models, reader, but no optimizer; lbann should throw exception\n') model_path='{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' data_reader_path='prototext/data_reader_mnist.prototext' @@ -58,8 +79,12 @@ def test_unit_missing_optimizer(cluster, exes): assert return_code != 0 +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_missing_reader' --exes= def test_unit_missing_reader(cluster, exes): - exe = exes['gcc7'] + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes sys.stderr.write('TESTING: run lbann with two models, reader, but no reader; lbann should throw exception\n') model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' optimizer_path = 'prototext/opt_sgd.prototext' @@ -70,8 +95,12 @@ def test_unit_missing_reader(cluster, exes): assert return_code != 0 +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_bad_params' --exes= def test_unit_bad_params(cluster, exes): - exe = exes['gcc7'] + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes sys.stderr.write('TESTING: run lbann with ill-formed param (missing -) lbann should throw exception\n') (command_allocate, command_run, _, _) = tools.get_command(cluster=cluster, executable=exe, return_tuple=True) command_string = '%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext --model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} 
--optimizer=prototext/opt_sgd.prototext' % (command_allocate, command_run, exe) @@ -79,8 +108,12 @@ def test_unit_bad_params(cluster, exes): assert return_code != 0 +# Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_should_work' --exes= def test_unit_should_work(cluster, exes): - exe = exes['gcc7'] + if isinstance(exes, dict): + exe = exes['gcc7'] + else: + exe = exes sys.stderr.write('TESTING: run lbann with two models, reader, and optimizer; lbann should NOT throw exception\n') model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' data_reader_path = 'prototext/data_reader_mnist.prototext' diff --git a/bamboo/unit_tests/test_unit_mnist_conv_graph.py b/bamboo/unit_tests/test_unit_mnist_conv_graph.py index 530501a7035..6c6f45d6ca0 100644 --- a/bamboo/unit_tests/test_unit_mnist_conv_graph.py +++ b/bamboo/unit_tests/test_unit_mnist_conv_graph.py @@ -42,7 +42,7 @@ def test_unit_mnist_conv_graph_intel19(cluster, exes, dirname): skeleton_mnist_conv_graph(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_conv_graph.py -k 'test_unit_mnist_conv_graph_exe' --exe= +# Run with python3 -m pytest -s test_unit_conv_graph.py -k 'test_unit_mnist_conv_graph_exe' --exe= def test_unit_mnist_conv_graph_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_mnist_conv_graph_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py index 521a59a310f..0c2b3e8df30 100644 --- a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py +++ b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py @@ -36,7 +36,7 @@ def test_unit_mnist_ridge_regression_intel19(cluster, exes, dirname): skeleton_mnist_ridge_regression(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_ridge_regression.py -k 'test_unit_mnist_ridge_regression_exe' --exe= +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_mnist_ridge_regression_exe' --exe= def test_unit_mnist_ridge_regression_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_mnist_ridge_regression_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py index 36cb5f7ce86..8a018403867 100644 --- a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py +++ b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py @@ -36,7 +36,7 @@ def test_unit_mnist_softmax_classifier_intel19(cluster, exes, dirname): skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_softmax_classifier.py -k 'test_unit_mnist_softmax_classifier_exe' --exe= +# Run with python3 -m pytest -s test_unit_softmax_classifier.py -k 'test_unit_mnist_softmax_classifier_exe' --exe= def test_unit_mnist_softmax_classifier_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_mnist_softmax_classifier_exe: Non-local testing' diff --git a/bamboo/unit_tests/test_unit_reconstruction_loss.py b/bamboo/unit_tests/test_unit_reconstruction_loss.py index 85825a02d88..951f9b46656 100644 --- a/bamboo/unit_tests/test_unit_reconstruction_loss.py +++ b/bamboo/unit_tests/test_unit_reconstruction_loss.py @@ -42,7 +42,7 @@ def test_unit_jag_reconstruction_loss_intel19(cluster, exes, dirname): skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'intel19') -# Run with python -m pytest -s test_unit_ridge_regression.py -k 
'test_unit_jag_reconstruction_loss_exe' --exe= +# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_jag_reconstruction_loss_exe' --exe= def test_unit_jag_reconstruction_loss_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_jag_reconstruction_loss_exe: Non-local testing' From 6bc9f3e19ecf9deecbd4e310863e2a4d8259de55 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Thu, 8 Aug 2019 10:10:32 -0700 Subject: [PATCH 219/634] removed a print statement --- include/lbann/layers/io/input/generic_input_layer.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index 7c001602f27..3bf4677c7b3 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -601,7 +601,6 @@ class generic_input_layer : public io_layer { it = m_data_readers.find(execution_mode::training); if ((it != m_data_readers.end()) && it->second) { linearized_data_size = (it->second)->get_linearized_data_size(); - std::cerr << "XX >>>>>> linearized_data_size: " << linearized_data_size << "\n"; } it = m_data_readers.find(execution_mode::validation); From 4a4fbcdd7303854c2e6e94cdc24eba334c12415d Mon Sep 17 00:00:00 2001 From: graham63 <50850420+graham63@users.noreply.github.com> Date: Thu, 8 Aug 2019 12:58:45 -0700 Subject: [PATCH 220/634] Update src/callbacks/summary.cpp Co-Authored-By: Tim Moon --- src/callbacks/summary.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/callbacks/summary.cpp b/src/callbacks/summary.cpp index 226092e4c7c..c054ca5b91d 100644 --- a/src/callbacks/summary.cpp +++ b/src/callbacks/summary.cpp @@ -65,7 +65,7 @@ void summary::on_train_begin(model *m) { void summary::on_batch_end(model *m) { if(!m_summarizer){ - LBANN_ERROR(BuildErrorMessage("Summary callback failed: m_summarizer does not exist.")); + LBANN_ERROR("Summary callback failed: m_summarizer does not exist."); } prof_region_begin("summary-batch", prof_colors[0], false); From e43fb76cc40af901e0b9f414d8f665b389ecdc1a Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Thu, 8 Aug 2019 13:12:51 -0700 Subject: [PATCH 221/634] addressed all PR comments --- include/lbann/callbacks/imcomm.hpp | 2 +- include/lbann/callbacks/summary.hpp | 2 +- include/lbann/callbacks/timer.hpp | 2 +- src/callbacks/summary.cpp | 18 +++--------------- src/proto/factories/callback_factory.cpp | 16 +++------------- 5 files changed, 9 insertions(+), 31 deletions(-) diff --git a/include/lbann/callbacks/imcomm.hpp b/include/lbann/callbacks/imcomm.hpp index 5620749e0d1..1108b061882 100644 --- a/include/lbann/callbacks/imcomm.hpp +++ b/include/lbann/callbacks/imcomm.hpp @@ -93,7 +93,7 @@ class imcomm : public callback_base { /** Summarize relevant statistics. */ void do_summary(model *m, weights *w, EvalType im_time); - /**@brief lbann_summary */ + /** @brief lbann_summary */ std::shared_ptr m_summarizer = nullptr; }; diff --git a/include/lbann/callbacks/summary.hpp b/include/lbann/callbacks/summary.hpp index af943dc2f36..eb199110378 100644 --- a/include/lbann/callbacks/summary.hpp +++ b/include/lbann/callbacks/summary.hpp @@ -65,7 +65,7 @@ class summary : public callback_base { void save_histograms(model *m); private: - /**@brief lbann_summary */ + /** @brief lbann_summary */ std::shared_ptr m_summarizer = nullptr; /** Interval for doing matrix summarization. 
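Measured in mini-batch steps (so an interval of 25 would summarize weight matrices on every 25th step; this assumes the step-count gating in summary.cpp's on_batch_end).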
*/ diff --git a/include/lbann/callbacks/timer.hpp b/include/lbann/callbacks/timer.hpp index ba8ee1e6735..2afcf03d23f 100644 --- a/include/lbann/callbacks/timer.hpp +++ b/include/lbann/callbacks/timer.hpp @@ -97,7 +97,7 @@ class timer : public callback_base { */ void batch_timing_end(const model& m); - /**@brief lbann_summary */ + /** @brief lbann_summary */ std::shared_ptr m_summarizer = nullptr; }; diff --git a/src/callbacks/summary.cpp b/src/callbacks/summary.cpp index c054ca5b91d..f284a7f2f07 100644 --- a/src/callbacks/summary.cpp +++ b/src/callbacks/summary.cpp @@ -46,18 +46,6 @@ summary::summary(const std::shared_ptr& summarizer, m_summarizer(summarizer), m_mat_interval(mat_interval) {} -namespace -{ -template -std::string BuildErrorMessage(Ts... args) -{ - std::ostringstream oss; - int dummy[] = { (oss << args, 0)... }; - (void) dummy; - LBANN_ERROR(oss.str()); -} -} - void summary::on_train_begin(model *m) { save_histograms(m); } @@ -94,7 +82,7 @@ void summary::on_batch_end(model *m) { void summary::on_epoch_end(model *m) { if(!m_summarizer){ - LBANN_ERROR(BuildErrorMessage("Summary callback failed: m_summarizer does not exist.")); + LBANN_ERROR("Summary callback failed: m_summarizer does not exist."); } prof_region_begin("summary-epoch", prof_colors[0], false); @@ -115,7 +103,7 @@ void summary::on_epoch_end(model *m) { void summary::on_test_end(model *m) { if(!m_summarizer){ - LBANN_ERROR(BuildErrorMessage("Summary callback failed: m_summarizer does not exist.")); + LBANN_ERROR("Summary callback failed: m_summarizer does not exist."); } prof_region_begin("summary-test", prof_colors[0], false); lbann_comm *comm = m->get_comm(); @@ -138,7 +126,7 @@ void summary::on_test_end(model *m) { void summary::save_histograms(model *m) { if(!m_summarizer){ - LBANN_ERROR(BuildErrorMessage("Summary callback failed: m_summarizer does not exist.")); + LBANN_ERROR("Summary callback failed: m_summarizer does not exist."); } for (const auto& layer : m->get_layers()) { const std::string prefix = layer->get_name() + "/"; diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 5cd33eb3ce5..2c463d004f2 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -66,6 +66,7 @@ #include "lbann/proto/factories.hpp" #include "lbann/proto/helpers.hpp" #include "lbann/utils/factory.hpp" +#include "lbann/utils/file_utils.hpp" #include "lbann/utils/memory.hpp" #include @@ -89,15 +90,6 @@ using factory_type = lbann::generic_factory< std::shared_ptr const&>, default_key_error_policy>; -template -std::string BuildErrorMessage(Ts... args) -{ - std::ostringstream oss; - int dummy[] = { (oss << args, 0)... }; - (void) dummy; - LBANN_ERROR(oss.str()); -} - void register_default_builders(factory_type& factory) { using namespace callback; @@ -228,10 +220,8 @@ std::unique_ptr construct_summarizer(lbann_comm* comm, } //check to see if directory exists - struct stat sb; - if (! 
( stat(dir.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode) )) { - LBANN_ERROR(BuildErrorMessage("summary directory ", - dir, " does not exist.")); + if (!file::directory_exists(dir)) { + LBANN_ERROR("summary directory ", dir, " does not exist."); } return make_unique(dir, comm); From e2556a6c1e2d3a8a0d8e7cf86152ed2ec94a0683 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 1 Jul 2019 14:51:19 -0700 Subject: [PATCH 222/634] Update checkpointing --- bamboo/clean.sh | 3 +- bamboo/unit_tests/test_unit_checkpoint.py | 46 ++++++++++++++------ bamboo/unit_tests/test_unit_lbann2_reload.py | 19 +++----- 3 files changed, 40 insertions(+), 28 deletions(-) diff --git a/bamboo/clean.sh b/bamboo/clean.sh index 2864324c3ca..3d9d20c951f 100755 --- a/bamboo/clean.sh +++ b/bamboo/clean.sh @@ -21,7 +21,8 @@ rm -f ${LBANN_DIR}/bamboo/integration_tests/error/*.txt rm -f ${LBANN_DIR}/bamboo/integration_tests/output/*.txt # Unit Tests -rm -rf ${LBANN_DIR}/bamboo/unit_tests/ckpt_* +rm -rf ${LBANN_DIR}/bamboo/unit_tests/ckpt* +rm -rf ${LBANN_DIR}/bamboo/unit_tests/lbann2_* rm -f ${LBANN_DIR}/bamboo/unit_tests/*.prototext* rm -f ${LBANN_DIR}/bamboo/unit_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/unit_tests/__pycache__ diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index a0e06de14cf..b3756453fc3 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -27,8 +27,9 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, if return_code_nockpt != 0: sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') sys.exit(1) - ckpt_pre = 'ckpt_pre_lenet_shared_{c}'.format(c=compiler_name) - os.system('mv ckpt {c}'.format(c=ckpt_pre)) + os.system('mkdir ckpt_lenet_shared') + no_ckpt = 'ckpt_lenet_shared/no_ckpt_{c}'.format(c=compiler_name) + os.system('mv ckpt {c}'.format(c=no_ckpt)) # Run to checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_checkpoint_%s_output.txt' % (dir_name, compiler_name) @@ -60,8 +61,8 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') sys.exit(1) - diff_test = os.system('diff -rq ckpt {c}'.format(c=ckpt_pre)) - os.system('mv ckpt ckpt_post_lenet_shared_{c}'.format(c=compiler_name)) + diff_test = os.system('diff -rq ckpt {c}'.format(c=no_ckpt)) + os.system('mv ckpt ckpt_lenet_shared/ckpt_{c}'.format(c=compiler_name)) assert diff_test == 0 @@ -87,8 +88,9 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, if return_code_nockpt != 0: sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') sys.exit(1) - ckpt_pre = 'ckpt_pre_lenet_distributed_{c}'.format(c=compiler_name) - os.system('mv ckpt {c}'.format(c=ckpt_pre)) + os.system('mkdir ckpt_lenet_distributed') + no_ckpt = 'ckpt_lenet_distributed/no_ckpt_{c}'.format(c=compiler_name) + os.system('mv ckpt {c}'.format(c=no_ckpt)) # Run to checkpoint, printing weights to files. 
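    # (This second run writes checkpoints mid-training; a third run then
    # restarts from them, and the diff -rq below must find the resulting
    # ckpt tree identical to the uninterrupted baseline saved above,
    # showing that the restart reproduced training exactly.)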
output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_checkpoint_%s_output.txt' % (dir_name, compiler_name) @@ -120,32 +122,50 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') sys.exit(1) - diff_test = os.system('diff -rq ckpt {c}'.format(c=ckpt_pre)) - os.system('mv ckpt ckpt_post_lenet_distributed_{c}'.format(c=compiler_name)) + diff_test = os.system('diff -rq ckpt {c}'.format(c=no_ckpt)) + os.system('mv ckpt ckpt_lenet_distributed/ckpt_{c}'.format(c=compiler_name)) assert diff_test == 0 -def test_unit_checkpoint_lenet_clang6(cluster, exes, dirname): +def test_unit_checkpoint_lenet_shared_clang6(cluster, exes, dirname): skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'clang6') + + +def test_unit_checkpoint_lenet_distributed_clang6(cluster, exes, dirname): skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'clang6') -def test_unit_checkpoint_lenet_gcc7(cluster, exes, dirname): +def test_unit_checkpoint_lenet_shared_gcc7(cluster, exes, dirname): skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc7') + + +def test_unit_checkpoint_lenet_distributed_gcc7(cluster, exes, dirname): skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc7') -def test_unit_checkpoint_lenet_intel19(cluster, exes, dirname): +def test_unit_checkpoint_lenet_shared_intel19(cluster, exes, dirname): skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'intel19') + + +def test_unit_checkpoint_lenet_distributed_intel19(cluster, exes, dirname): skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'intel19') -# Run with python3 -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_exe' --exe= -def test_unit_checkpoint_lenet_exe(cluster, dirname, exe): +# Run with python3 -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_shared_exe' --exe= +def test_unit_checkpoint_lenet_shared_exe(cluster, dirname, exe): if exe is None: e = 'test_unit_checkpoint_lenet_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'exe') + + +# Run with python3 -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_distributed_exe' --exe= +def test_unit_checkpoint_lenet_distributed_exe(cluster, dirname, exe): + if exe is None: + e = 'test_unit_checkpoint_lenet_exe: Non-local testing' + print('Skip - ' + e) + pytest.skip(e) + exes = {'exe': exe} skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'exe') diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index 7fef676bccc..215f8cac960 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -12,12 +12,6 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): pytest.skip(e) lbann2 = executables[compiler_name] + '2' - # Delete directories / files if they happen to be around from the - # previous build. - os.system('rm -rf ckpt') - os.system('rm -rf lbann2_*') - - # No checkpointing, printing weights to files. 
model_path = '{../../model_zoo/models/lenet_mnist/model_lenet_mnist.prototext,../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext}' output_file_name = '%s/bamboo/unit_tests/output/lbann2_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) @@ -39,7 +33,9 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): sys.stderr.write('LBANN2 LeNet execution failed, exiting with error') sys.exit(1) - os.system('mv lbann2_ckpt lbann2_nockpt') + os.system('mkdir ckpt_lbann2_reload') + no_ckpt = 'ckpt_lbann2_reload/lbann2_no_ckpt_{c}'.format(c=compiler_name) + os.system('mv lbann2_ckpt {c}'.format(c=no_ckpt)) # Run to checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/lbann2_checkpoint_%s_output.txt' % (dir_name, compiler_name) @@ -79,7 +75,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): os.system('rm lbann2_ckpt/model0-epoch*') os.system('rm lbann2_nockpt/model0-epoch*') - diff_result = os.system('diff -rq lbann2_ckpt/ lbann2_nockpt/') + diff_result = os.system('diff -rq lbann2_ckpt/ {c}'.format(c=no_ckpt)) allow_epsilon_diff = False if allow_epsilon_diff and (diff_result != 0): equal_within_epsilon = True @@ -113,20 +109,15 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): print(error_string) if equal_within_epsilon: diff_result = 0 - os.system('rm -rf ckpt') - os.system('rm -rf lbann2_*') + os.system('mv lbann2_ckpt ckpt_lbann2_reload/lbann2_ckpt_{c}'.format(c=compiler_name)) assert diff_result == 0 def test_unit_lbann2_reload_clang6(cluster, exes, dirname): - if cluster == 'catalyst': # STILL ERRORS - pytest.skip('FIXME') skeleton_lbann2_reload(cluster, exes, dirname, 'clang6') def test_unit_lbann2_reload_gcc7(cluster, exes, dirname): - if cluster in ['catalyst', 'corona', 'lassen', 'pascal']: # STILL ERRORS - pytest.skip('FIXME') skeleton_lbann2_reload(cluster, exes, dirname, 'gcc7') From 3e74fbbc30b043af9bb7f5cb1ba9d56dd1e6a1b6 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Thu, 8 Aug 2019 15:51:39 -0700 Subject: [PATCH 223/634] Add GRU to list of python modules (#1148) * Add GRU to list of python modules * Add data layout to hidden layers and other clean up recommended by Tim --- python/lbann/modules.py | 140 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/python/lbann/modules.py b/python/lbann/modules.py index f9ea6fca7ad..bde566693e0 100644 --- a/python/lbann/modules.py +++ b/python/lbann/modules.py @@ -363,3 +363,143 @@ def forward(self, x, prev_state): # Return output and state return output, (output, cell) + +class GRU(lbann.modules.Module): + """Gated-recurrent unit. + Implementation mostly taken from: + https://pytorch.org/docs/stable/nn.html#gru""" + + global_count = 0 # Static counter, used for default names + + def __init__(self, size, bias = True, + weights=[], name=None, data_layout='data_parallel'): + """Initialize GRU cell. + + Args: + size (int): Size of output tensor. + bias (bool): Whether to apply biases after linearity. + weights (`Weights` or iterator of `Weights`): Weights in + fully-connected layer. There are at most four - two + matrices ((3*size) x (input_size) and (3*size) x (size) dimensions) each and two + biases (3*size entries) each. If weights are not provided, + the matrix and bias will be initialized in a similar + manner as PyTorch (uniform random values from + [-1/sqrt(size), 1/sqrt(size)]). + name (str): Default name is in the form 'gru'. + data_layout (str): Data layout. 
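        A construction sketch (illustrative only; the input and
        initial-state layers here are hypothetical, not part of this
        module):

            x = lbann.Input()                                  # per-step input
            h0 = lbann.Constant(value=0.0, num_neurons='128')  # initial state
            cell = GRU(size=128, name='gru_cell')
            y, h1 = cell(x, h0)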
+ + """ + super().__init__() + GRU.global_count += 1 + self.step = 0 + self.size = size + self.name = (name + if name + else 'gru{0}'.format(GRU.global_count)) + self.data_layout = data_layout + + # Weights + self.weights = list(make_iterable(weights)) + if len(self.weights) > 4: + raise ValueError('`GRU` has at most 4 weights, ' + 'but got {0}'.format(len(self.weights))) + ##@todo: use loop + if len(self.weights) == 0: + self.weights.append( + lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), + max=1/sqrt(self.size)), + name=self.name+'_ih_matrix')) + if len(self.weights) == 1: + self.weights.append( + lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), + max=1/sqrt(self.size)), + name=self.name+'_ih_bias')) + if len(self.weights) == 2: + self.weights.append( + lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), + max=1/sqrt(self.size)), + name=self.name+'_hh_matrix')) + if len(self.weights) == 3: + self.weights.append( + lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), + max=1/sqrt(self.size)), + name=self.name+'_hh_bias')) + + # Linearity + ####Learnable input-hidden weights + self.ih_fc = lbann.modules.FullyConnectedModule(3*size, bias=bias, + weights=self.weights[:2], + name=self.name + '_ih_fc', + data_layout=self.data_layout) + ###Learnable hidden-hidden weights + self.hh_fc = lbann.modules.FullyConnectedModule(3*size, bias=bias, + weights=self.weights[2:], + name=self.name + '_hh_fc', + data_layout=self.data_layout) + + def forward(self, x, prev_state): + """Apply GRU step. + + Args: + x (Layer): Input. + prev_state: State from previous GRU step. + + Returns: + (Layer, Layer): The output (out) and state (hn). + The state can be passed directly into + the next GRU step. 
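        Unrolling over a sequence is repeated application of this step;
        a minimal sketch (assuming xs is a Python list of per-step input
        layers and h0 an initial-state layer):

            h = h0
            for x_t in xs:
                y, h = cell(x_t, h)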
+ + """ + self.step += 1 + name = '{0}_step{1}'.format(self.name, self.step) + + + fc1 = self.ih_fc(x) #input_fc + fc2 = self.hh_fc(prev_state) #hidden_fc + + + # Get gates and cell update + fc1_slice = lbann.Slice(fc1, + slice_points=_str_list([0, self.size, 2*self.size, 3*self.size]), + name=name + '_fc1_slice', + data_layout=self.data_layout) + Wir_x = lbann.Identity(fc1_slice, name=name + '_Wrx', + data_layout=self.data_layout) + Wiz_x = lbann.Identity(fc1_slice, name=name + '_Wzx', + data_layout=self.data_layout) + Win_x = lbann.Identity(fc1_slice, name=name + '_Wnx', + data_layout=self.data_layout) + + fc2_slice = lbann.Slice(fc2, + slice_points=_str_list([0, self.size, 2*self.size, 3*self.size]), + name=name + '_fc2_slice', + data_layout=self.data_layout) + Whr_prev = lbann.Identity(fc2_slice, name=name + '_Wrh', + data_layout=self.data_layout) + Whz_prev = lbann.Identity(fc2_slice, name=name + '_Wzh', + data_layout=self.data_layout) + Whn_prev = lbann.Identity(fc2_slice, name=name + '_Wnh', + data_layout=self.data_layout) + + rt = lbann.Sigmoid(lbann.Add([Wir_x,Whr_prev], data_layout=self.data_layout), name=name + '_reset_gate', + data_layout=self.data_layout) + + zt = lbann.Sigmoid(lbann.Add([Wiz_x,Whz_prev], data_layout=self.data_layout), name=name + '_update_gate', + data_layout=self.data_layout) + + nt = lbann.Tanh(lbann.Add([Win_x, + lbann.Multiply([rt,Whn_prev], data_layout=self.data_layout)], data_layout=self.data_layout), + name=name + '_new_gate', data_layout=self.data_layout) + + ht = lbann.Add([ + lbann.Multiply([ + lbann.WeightedSum([ + lbann.Constant(value=1.0, hint_layer=zt, data_layout=self.data_layout), + zt], + scaling_factors='1 -1', data_layout=self.data_layout), + nt], data_layout=self.data_layout), + lbann.Multiply([zt,prev_state], data_layout=self.data_layout)], name=name+ '_output', + data_layout=self.data_layout) + + # Return output + return ht, ht From f8bea3c68721045df3eaa78610e0e08549d15fc7 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 9 Aug 2019 13:45:57 -0700 Subject: [PATCH 224/634] fix an issue in which an empty "optimizer" message is both valid and different from not having such a message at all --- include/lbann/proto/helpers.hpp | 4 ++ src/proto/factories/weights_factory.cpp | 15 ++++--- src/proto/helpers.cpp | 52 ++++++++++++++++--------- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/include/lbann/proto/helpers.hpp b/include/lbann/proto/helpers.hpp index c28861f7bfd..0a8cf656409 100644 --- a/include/lbann/proto/helpers.hpp +++ b/include/lbann/proto/helpers.hpp @@ -51,6 +51,10 @@ using generate_builder_type = namespace helpers { +/** @brief Test whether the message has the oneof field. */ +bool has_oneof( + google::protobuf::Message const& msg, std::string const& oneof_name); + /** @brief Get a "derived type" message from the given message. 
*/ google::protobuf::Message const& get_oneof_message( diff --git a/src/proto/factories/weights_factory.cpp b/src/proto/factories/weights_factory.cpp index 326bd92e0b1..3abff494111 100644 --- a/src/proto/factories/weights_factory.cpp +++ b/src/proto/factories/weights_factory.cpp @@ -111,12 +111,15 @@ std::unique_ptr construct_weights( // Set weights initializer and optimizer auto init = construct_initializer(proto_weights); - std::unique_ptr opt; - if (proto_weights.has_optimizer()) { - opt = construct_optimizer(comm, proto_weights.optimizer()); - } else { - opt = construct_optimizer(comm, proto_opt); - } + const lbann_data::Optimizer& opt_msg = + (proto_weights.has_optimizer() + ? proto_weights.optimizer() + : proto_opt); + + std::unique_ptr opt = + (helpers::has_oneof(opt_msg, "optimizer_type") + ? construct_optimizer(comm, opt_msg) + : nullptr); w->set_initializer(std::move(init)); w->set_optimizer(std::move(opt)); diff --git a/src/proto/helpers.cpp b/src/proto/helpers.cpp index a9494a1f412..1ec2ed82274 100644 --- a/src/proto/helpers.cpp +++ b/src/proto/helpers.cpp @@ -35,33 +35,47 @@ namespace lbann { namespace proto { namespace helpers { - -google::protobuf::Message const& -get_oneof_message( - google::protobuf::Message const& msg_in, std::string const& oneof_name) -{ - auto&& desc = msg_in.GetDescriptor(); - auto&& reflex = msg_in.GetReflection(); - auto&& oneof_handle = desc->FindOneofByName(oneof_name); +namespace { +google::protobuf::FieldDescriptor const* get_oneof_field_descriptor( + google::protobuf::Message const& msg_in, std::string const& oneof_name) { + auto desc = msg_in.GetDescriptor(); + auto reflex = msg_in.GetReflection(); + auto oneof_handle = desc->FindOneofByName(oneof_name); if (!oneof_handle) { - std::string msg_string; - google::protobuf::TextFormat::PrintToString(msg_in, &msg_string); - LBANN_ERROR(std::string("Message has no oneof field named \"") - + oneof_name + "\"\n\nMessage(" - + desc->DebugString() +"):\n\n" - + msg_string); + std::string msg_string; + google::protobuf::TextFormat::PrintToString(msg_in, &msg_string); + LBANN_ERROR("Message has no oneof field named \"", + oneof_name, "\"\n\nMessage(", + desc->DebugString(), "):\n\n", + msg_string); } - auto&& oneof_field = reflex->GetOneofFieldDescriptor(msg_in, oneof_handle); + return reflex->GetOneofFieldDescriptor(msg_in, oneof_handle); +} +}// namespace + +bool has_oneof( + google::protobuf::Message const& msg, std::string const& oneof_name) +{ + return (bool) get_oneof_field_descriptor(msg, oneof_name); +} - if (!oneof_field) - LBANN_ERROR("Oneof field in message has not been set."); +google::protobuf::Message const& +get_oneof_message( + google::protobuf::Message const& msg_in, std::string const& oneof_name) { + auto oneof_field = get_oneof_field_descriptor(msg_in, oneof_name); + if (!oneof_field) { + LBANN_ERROR("Oneof field \"", oneof_name, + "\" in message has not been set. Message:\n{", + msg_in.DebugString(),"\n}\n"); + } - if (oneof_field->type() != google::protobuf::FieldDescriptor::TYPE_MESSAGE) + if (oneof_field->type() != google::protobuf::FieldDescriptor::TYPE_MESSAGE) { LBANN_ERROR("Oneof field is not of message type."); + } - return reflex->GetMessage(msg_in, oneof_field); + return msg_in.GetReflection()->GetMessage(msg_in, oneof_field); } }// namespace helpers From 6114ad7f5ad4161354c0bbfc6310de20d6f3ec50 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Sat, 10 Aug 2019 15:24:19 -0700 Subject: [PATCH 225/634] Aluminum version compatibility. 
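Pinning Aluminum to the v0.2.1-1 release tag, rather than tracking master, keeps superbuild results reproducible against a known-compatible API. Since ALUMINUM_TAG is a CMake cache variable, a different revision can still be chosen at configure time, e.g. by passing -DALUMINUM_TAG=<tag> to the superbuild.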
--- superbuild/aluminum/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbuild/aluminum/CMakeLists.txt b/superbuild/aluminum/CMakeLists.txt index 3f2825f2233..d88928dcc59 100644 --- a/superbuild/aluminum/CMakeLists.txt +++ b/superbuild/aluminum/CMakeLists.txt @@ -11,7 +11,7 @@ else () CACHE STRING "The URL from which to clone Aluminum") endif () -set(ALUMINUM_TAG "master" +set(ALUMINUM_TAG "v0.2.1-1" CACHE STRING "The git tag to checkout for Aluminum") set(ALUMINUM_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" From e9bc5119c7ff10fa2f9316207634d3172a2bff53 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 12 Aug 2019 10:53:33 -0700 Subject: [PATCH 226/634] Fixing bug in Python GRU module. --- python/lbann/modules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lbann/modules.py b/python/lbann/modules.py index bde566693e0..b04cfeb05da 100644 --- a/python/lbann/modules.py +++ b/python/lbann/modules.py @@ -364,7 +364,7 @@ def forward(self, x, prev_state): # Return output and state return output, (output, cell) -class GRU(lbann.modules.Module): +class GRU(Module): """Gated-recurrent unit. Implementation mostly taken from: https://pytorch.org/docs/stable/nn.html#gru""" From e9ef5980f8daf8ba89537026122a6441e7ffc6d6 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 12 Aug 2019 11:53:08 -0700 Subject: [PATCH 227/634] Fix Develop errors --- bamboo/allocate_and_run.sh | 6 +++--- bamboo/integration_tests/common_code.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index bc364071f67..a5944d5eb8e 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -45,9 +45,9 @@ elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTE timeout -k 5 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh --weekly if [ "${CLUSTER}" = 'catalyst' ]; then cd integration_tests - python -m pytest -s test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=../full_alexnet_clang6/results.xml - python -m pytest -s test_integration_performance_full_alexnet_gcc7 --weekly --run --junitxml=../full_alexnet_gcc7/results.xml - # python -m pytest -s test_integration_performance_full_alexnet_intel19 --weekly --run --junitxml=../full_alexnet_intel19/results.xml + python -m pytest -s test_integration_performance.py -k test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=../full_alexnet_clang6/results.xml + python -m pytest -s test_integration_performance.py -k test_integration_performance_full_alexnet_gcc7 --weekly --run --junitxml=../full_alexnet_gcc7/results.xml + # python -m pytest -s test_integration_performance.py -k test_integration_performance_full_alexnet_intel19 --weekly --run --junitxml=../full_alexnet_intel19/results.xml cd .. fi else diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index be5b5c6449c..d0d2aed6d7e 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -47,7 +47,7 @@ def get_command(cluster, dir_name, model_folder, model_name, executable, time_limit = 600 else: partition = 'pdebug' - time_limit = 30 + time_limit = 60 if (cluster == 'ray') and (model_name == 'conv_autoencoder_mnist'): num_processes = 20 else: From 31e7a755b5d69f6d2e183f839224512aa686648b Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Mon, 12 Aug 2019 14:47:07 -0700 Subject: [PATCH 228/634] update to HDF5's sudden deprecation of boolean API version options --- superbuild/hdf5/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/superbuild/hdf5/CMakeLists.txt b/superbuild/hdf5/CMakeLists.txt index 6266d7627db..3bdd08d1d4a 100644 --- a/superbuild/hdf5/CMakeLists.txt +++ b/superbuild/hdf5/CMakeLists.txt @@ -46,6 +46,7 @@ option(HDF5_USE_18_API_DEFAULT "Use 1.8 API by default" ON) option(HDF5_USE_110_API_DEFAULT "Use 1.10 API by default" OFF) option(HDF5_USE_112_API_DEFAULT "Use 1.12 API by default" OFF) option(HDF5_BUILD_FORTRAN "Build HDF5 with fortran support" OFF) +set(HDF5_DEFAULT_API_VERSION "v18") # At present, this is required for LBANN/JAG use. option(HDF5_ENABLE_Z_LIB_SUPPORT "Build HDF5 with ZLIB support" ON) @@ -108,6 +109,7 @@ ExternalProject_Add(HDF5 -DBUILD_SHARED_LIBS=${HDF5_BUILD_SHARED_LIBS} -DBUILD_TESTING=${BUILD_HDF5_TESTING} -DHDF5_GENERATE_HEADERS=ON + -DDEFAULT_API_VERSION=${HDF5_DEFAULT_API_VERSION} ${HDF5_CMAKE_ARGS} ) From e002cfff52bbc6137b0f3230a71360f594e4e269 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 12 Aug 2019 16:58:48 -0700 Subject: [PATCH 229/634] Minor tweaks in entry-wise scale/bias layer. Explicitly casting AbsMat to CPUMat or GPUMat. Fixing incorrect buffer sizing. --- .../layers/learning/entrywise_scale_bias.hpp | 17 +++++++++------- src/layers/learning/entrywise_scale_bias.cpp | 20 +++++++++---------- src/layers/learning/entrywise_scale_bias.cu | 20 +++++++++---------- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/include/lbann/layers/learning/entrywise_scale_bias.hpp b/include/lbann/layers/learning/entrywise_scale_bias.hpp index 545198955f1..fb10d83e44b 100644 --- a/include/lbann/layers/learning/entrywise_scale_bias.hpp +++ b/include/lbann/layers/learning/entrywise_scale_bias.hpp @@ -85,14 +85,17 @@ class entrywise_scale_bias_layer : public Layer { void setup_data() override { Layer::setup_data(); - const auto dims = get_output_dims(); - const El::Int size = get_output_size(); + + // Initialize output dimensions + set_output_dims(get_input_dims()); + const auto output_dims = get_output_dims(); + const El::Int output_size = get_output_size(); // Construct default weights if needed if (this->m_weights.size() < 1) { this->m_weights.push_back(new weights(get_comm())); - std::vector vals(2*size, DataType{0}); - std::fill(vals.begin(), vals.begin()+size, DataType{1}); + std::vector vals(2*output_size, DataType{0}); + std::fill(vals.begin(), vals.begin()+output_size, DataType{1}); auto init = make_unique(vals); std::unique_ptr opt(m_model->create_optimizer()); this->m_weights[0]->set_name(get_name() + "_weights"); @@ -113,13 +116,13 @@ class entrywise_scale_bias_layer : public Layer { // Setup weights auto dist = get_prev_activations().DistData(); dist.rowDist = El::STAR; - m_weights[0]->set_dims(dims, + m_weights[0]->set_dims(output_dims, {static_cast(2)}); m_weights[0]->set_matrix_distribution(dist); // Setup gradient w.r.t. 
weights m_weights_gradient->AlignWith(dist); - m_weights_gradient->Resize(size, 2); + m_weights_gradient->Resize(output_size, 2); } @@ -157,7 +160,7 @@ class entrywise_scale_bias_layer : public Layer { Layer::bp_setup_gradient_wrt_inputs(mini_batch_size); m_weights_gradient->Empty(false); m_weights_gradient->AlignWith(get_prev_activations()); - m_weights_gradient->Resize(get_input_size(), mini_batch_size); + m_weights_gradient->Resize(get_input_size(), 2); } protected: diff --git a/src/layers/learning/entrywise_scale_bias.cpp b/src/layers/learning/entrywise_scale_bias.cpp index 48323475945..9478bbca732 100644 --- a/src/layers/learning/entrywise_scale_bias.cpp +++ b/src/layers/learning/entrywise_scale_bias.cpp @@ -125,23 +125,23 @@ void bp_impl(const CPUMat& local_input, template <> void entrywise_scale_bias_layer ::fp_compute() { - fp_impl(get_local_prev_activations(), - get_local_activations(), + fp_impl(dynamic_cast(get_local_prev_activations()), + dynamic_cast(get_local_activations()), *m_weights[0]); } template <> void entrywise_scale_bias_layer ::fp_compute() { - fp_impl(get_local_prev_activations(), - get_local_activations(), + fp_impl(dynamic_cast(get_local_prev_activations()), + dynamic_cast(get_local_activations()), *m_weights[0]); } template <> void entrywise_scale_bias_layer ::bp_compute() { - bp_impl(get_local_prev_activations(), - get_local_prev_error_signals(), - get_local_error_signals(), + bp_impl(dynamic_cast(get_local_prev_activations()), + dynamic_cast(get_local_prev_error_signals()), + dynamic_cast(get_local_error_signals()), *this->m_weights[0], *m_weights_gradient, this->m_model->get_effective_mini_batch_size()); @@ -149,9 +149,9 @@ void entrywise_scale_bias_layer template <> void entrywise_scale_bias_layer ::bp_compute() { - bp_impl(get_local_prev_activations(), - get_local_prev_error_signals(), - get_local_error_signals(), + bp_impl(dynamic_cast(get_local_prev_activations()), + dynamic_cast(get_local_prev_error_signals()), + dynamic_cast(get_local_error_signals()), *this->m_weights[0], *m_weights_gradient, this->m_model->get_effective_mini_batch_size()); diff --git a/src/layers/learning/entrywise_scale_bias.cu b/src/layers/learning/entrywise_scale_bias.cu index 71b739f0429..aac88d96243 100644 --- a/src/layers/learning/entrywise_scale_bias.cu +++ b/src/layers/learning/entrywise_scale_bias.cu @@ -181,23 +181,23 @@ void bp_impl(const GPUMat& local_input, template <> void entrywise_scale_bias_layer ::fp_compute() { - fp_impl(get_local_prev_activations(), - get_local_activations(), + fp_impl(dynamic_cast(get_local_prev_activations()), + dynamic_cast(get_local_activations()), *m_weights[0]); } template <> void entrywise_scale_bias_layer ::fp_compute() { - fp_impl(get_local_prev_activations(), - get_local_activations(), + fp_impl(dynamic_cast(get_local_prev_activations()), + dynamic_cast(get_local_activations()), *m_weights[0]); } template <> void entrywise_scale_bias_layer ::bp_compute() { - bp_impl(get_local_prev_activations(), - get_local_prev_error_signals(), - get_local_error_signals(), + bp_impl(dynamic_cast(get_local_prev_activations()), + dynamic_cast(get_local_prev_error_signals()), + dynamic_cast(get_local_error_signals()), *this->m_weights[0], *m_weights_gradient, this->m_model->get_effective_mini_batch_size()); @@ -205,9 +205,9 @@ void entrywise_scale_bias_layer template <> void entrywise_scale_bias_layer ::bp_compute() { - bp_impl(get_local_prev_activations(), - get_local_prev_error_signals(), - get_local_error_signals(), + 
bp_impl(dynamic_cast(get_local_prev_activations()), + dynamic_cast(get_local_prev_error_signals()), + dynamic_cast(get_local_error_signals()), *this->m_weights[0], *m_weights_gradient, this->m_model->get_effective_mini_batch_size()); From f93a7b5c797be53068b62e3d3d4f86012da1344e Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 12 Aug 2019 19:58:13 -0700 Subject: [PATCH 230/634] Protobuf message for no optimizer (#1161) * Adding NoOptimizer protobuf message. * Updating models with weights with no optimizer. * Review suggestion from @benson31. --- .../gan/mnist/adversarial_model.prototext | 10 ++-- .../gan/mnist/discriminator_model.prototext | 8 ++-- ...batchnorm_transferred_and_frozen.prototext | 40 ++++++++-------- ..._alexnet_batchnorm_dag_frozen_bn.prototext | 48 +++++++++---------- .../layer_tests/model_tessellate.prototext | 2 +- python/lbann/__init__.py | 2 +- python/lbann/proto.py | 4 +- src/proto/factories/optimizer_factory.cpp | 7 +++ src/proto/optimizers.proto | 13 +++-- 9 files changed, 72 insertions(+), 62 deletions(-) diff --git a/model_zoo/models/gan/mnist/adversarial_model.prototext b/model_zoo/models/gan/mnist/adversarial_model.prototext index 19f2dd83493..8d35b438717 100644 --- a/model_zoo/models/gan/mnist/adversarial_model.prototext +++ b/model_zoo/models/gan/mnist/adversarial_model.prototext @@ -145,7 +145,7 @@ model { weights { name: "gen_fc_weights" - optimizer { } + optimizer { no_optimizer {} } initializer { glorot_normal_initializer {} } @@ -303,7 +303,7 @@ model { weights { name: "dis_flatten_weights" - optimizer { } + optimizer { no_optimizer {} } initializer { he_normal_initializer {} } @@ -322,7 +322,7 @@ model { weights { name: "dis_fc1_weights" - optimizer { } + optimizer { no_optimizer {} } initializer { glorot_normal_initializer {} } @@ -348,7 +348,7 @@ model { weights { name: "dis_fc2_weights" - optimizer { } + optimizer { no_optimizer {} } initializer { glorot_normal_initializer {} } @@ -375,7 +375,7 @@ model { # FULLY_CONNECTED fc1 weights { name: "dis_fc3_weights" - optimizer { } + optimizer { no_optimizer {} } initializer { glorot_normal_initializer {} } diff --git a/model_zoo/models/gan/mnist/discriminator_model.prototext b/model_zoo/models/gan/mnist/discriminator_model.prototext index a10cc04afbb..75cc0425b9e 100644 --- a/model_zoo/models/gan/mnist/discriminator_model.prototext +++ b/model_zoo/models/gan/mnist/discriminator_model.prototext @@ -137,7 +137,7 @@ model { } weights { name: "gen_fc1_weights" - optimizer { } + optimizer { no_optimizer {} } initializer { glorot_normal_initializer {} } @@ -180,7 +180,7 @@ model { weights { name: "gen_fc2_weights" - optimizer { } + optimizer { no_optimizer {} } initializer { glorot_normal_initializer {} } @@ -220,7 +220,7 @@ model { weights { name: "gen_fc3_weights" - optimizer { } + optimizer { no_optimizer {} } initializer { glorot_normal_initializer {} } @@ -260,7 +260,7 @@ model { weights { name: "gen_fc4_weights" - optimizer { } + optimizer { no_optimizer {} } initializer { glorot_normal_initializer {} } diff --git a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext index 8b135bbcac1..ec2f478b8b1 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext @@ -388,7 +388,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { 
no_optimizer {} } } @@ -399,7 +399,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -410,7 +410,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -421,7 +421,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -432,7 +432,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -443,7 +443,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -454,7 +454,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -465,7 +465,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -476,7 +476,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -487,7 +487,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -498,7 +498,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -509,7 +509,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -520,7 +520,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -531,7 +531,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -542,7 +542,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -553,7 +553,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -564,7 +564,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -575,7 +575,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -586,7 +586,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -597,7 +597,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } diff --git a/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext index 83ace5138af..e6a87ad92f5 100644 --- a/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext +++ b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext @@ -374,7 +374,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -385,7 +385,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -396,7 +396,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -407,7 +407,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -418,7 +418,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -429,7 +429,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -440,7 +440,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -451,7 +451,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -462,7 +462,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -473,7 +473,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -484,7 +484,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -495,7 +495,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -506,7 +506,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -517,7 +517,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -528,7 +528,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { 
no_optimizer {} } } @@ -539,7 +539,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -550,7 +550,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -561,7 +561,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -572,7 +572,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -583,7 +583,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -594,7 +594,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -605,7 +605,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -616,7 +616,7 @@ model { value: 0.0 } } - optimizer {} + optimizer { no_optimizer {} } } @@ -627,7 +627,7 @@ model { value: 1.0 } } - optimizer {} + optimizer { no_optimizer {} } } ################################################### diff --git a/model_zoo/tests/layer_tests/model_tessellate.prototext b/model_zoo/tests/layer_tests/model_tessellate.prototext index 2848f80949a..a7fb5c1dc2f 100644 --- a/model_zoo/tests/layer_tests/model_tessellate.prototext +++ b/model_zoo/tests/layer_tests/model_tessellate.prototext @@ -103,7 +103,7 @@ model { values: "1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5" } } - optimizer {} # No optimizer + optimizer { no_optimizer {} } } layer { parents: "sum scales" diff --git a/python/lbann/__init__.py b/python/lbann/__init__.py index 89a48cedc11..1eeb8651e34 100644 --- a/python/lbann/__init__.py +++ b/python/lbann/__init__.py @@ -19,7 +19,7 @@ _lbann_exe = _config['Paths']['lbann_exe'] except: pass -import lbann_pb2, callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, weights_pb2 +import lbann_pb2, callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, reader_pb2, weights_pb2 def lbann_exe(): """LBANN executable.""" return _lbann_exe if _lbann_exe else 'lbann' diff --git a/python/lbann/proto.py b/python/lbann/proto.py index f1166851a85..59d60fd3055 100644 --- a/python/lbann/proto.py +++ b/python/lbann/proto.py @@ -2,7 +2,7 @@ import google.protobuf.text_format import google.protobuf.message -from lbann import lbann_pb2 +from lbann import lbann_pb2, NoOptimizer def save_prototext(filename, **kwargs): """Save a prototext file. @@ -36,7 +36,7 @@ def save_prototext(filename, **kwargs): # provided. 
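    # The fallback is an explicit NoOptimizer message rather than an empty
    # Optimizer: an Optimizer whose oneof member is never set now trips the
    # oneof checks in the C++ factories, so "no optimizer" is spelled out.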
if not message.HasField('optimizer'): from lbann import Optimizer - message.optimizer.CopyFrom(Optimizer().export_proto()) + message.optimizer.CopyFrom(NoOptimizer().export_proto()) message.optimizer.SetInParent() # Write to file diff --git a/src/proto/factories/optimizer_factory.cpp b/src/proto/factories/optimizer_factory.cpp index 220f25fef46..b7d548f4c50 100644 --- a/src/proto/factories/optimizer_factory.cpp +++ b/src/proto/factories/optimizer_factory.cpp @@ -43,6 +43,12 @@ namespace lbann { namespace proto { namespace { +std::unique_ptr +build_no_optimizer_from_pbuf( + google::protobuf::Message const& msg, lbann_comm* comm) { + return nullptr; +} + using factory_type = lbann::generic_factory< lbann::optimizer, std::string, @@ -52,6 +58,7 @@ using factory_type = lbann::generic_factory< default_key_error_policy>; void register_default_builders(factory_type& factory) { + factory.register_builder("NoOptimizer", build_no_optimizer_from_pbuf); factory.register_builder("AdaGrad", build_adagrad_optimizer_from_pbuf); factory.register_builder("Adam", build_adam_optimizer_from_pbuf); factory.register_builder("HypergradientAdam", diff --git a/src/proto/optimizers.proto b/src/proto/optimizers.proto index 48df859c926..259624c8be7 100644 --- a/src/proto/optimizers.proto +++ b/src/proto/optimizers.proto @@ -30,13 +30,16 @@ package lbann_data; message Optimizer { oneof optimizer_type { - AdaGrad adagrad = 1; - Adam adam = 2; - HypergradientAdam hypergradient_adam = 3; - RMSprop rmsprop = 4; - SGD sgd = 5; + NoOptimizer no_optimizer = 1; + AdaGrad adagrad = 2; + Adam adam = 3; + HypergradientAdam hypergradient_adam = 4; + RMSprop rmsprop = 5; + SGD sgd = 6; } + message NoOptimizer {} + message AdaGrad { double learn_rate = 1; double eps = 2; // Suggested: 1e-8 From dce3790c440f40f811d411b81c02a4109d5ad639 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Tue, 13 Aug 2019 09:36:51 -0700 Subject: [PATCH 231/634] fix a bug where an initializer might not be specified for a weights object --- src/proto/factories/weights_factory.cpp | 42 ++++++++++++++++++------- src/proto/weights.proto | 3 ++ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/src/proto/factories/weights_factory.cpp b/src/proto/factories/weights_factory.cpp index 3abff494111..80e86952177 100644 --- a/src/proto/factories/weights_factory.cpp +++ b/src/proto/factories/weights_factory.cpp @@ -39,26 +39,40 @@ namespace lbann { namespace proto { namespace { +using MessageT = google::protobuf::Message; + // Define the factory type. 
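// Note that a registered builder may return nullptr (see the "LayerDefault"
// builder below); construct_weights then leaves the initializer unset so
// the consuming layer can fall back to its own default.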
using factory_type = lbann::generic_factory< lbann::weights_initializer, std::string, generate_builder_type, + MessageT const&>, default_key_error_policy>; void register_default_builders(factory_type& factory) { - factory.register_builder("ConstantInitializer", build_constant_initializer_from_pbuf); - factory.register_builder("ValueInitializer", build_value_initializer_from_pbuf); - factory.register_builder("UniformInitializer", build_uniform_initializer_from_pbuf); - factory.register_builder("NormalInitializer", build_normal_initializer_from_pbuf); - factory.register_builder("GlorotNormalInitializer", build_glorot_initializer_from_pbuf); - factory.register_builder("GlorotUniformInitializer", build_glorot_initializer_from_pbuf); - factory.register_builder("HeNormalInitializer", build_he_initializer_from_pbuf); - factory.register_builder("HeUniformInitializer", build_he_initializer_from_pbuf); - factory.register_builder("LeCunNormalInitializer", build_lecun_initializer_from_pbuf); - factory.register_builder("LeCunUniformInitializer", build_lecun_initializer_from_pbuf); + factory.register_builder("LayerDefault", + [](MessageT const&) { return nullptr; }); + factory.register_builder("ConstantInitializer", + build_constant_initializer_from_pbuf); + factory.register_builder("ValueInitializer", + build_value_initializer_from_pbuf); + factory.register_builder("UniformInitializer", + build_uniform_initializer_from_pbuf); + factory.register_builder("NormalInitializer", + build_normal_initializer_from_pbuf); + factory.register_builder("GlorotNormalInitializer", + build_glorot_initializer_from_pbuf); + factory.register_builder("GlorotUniformInitializer", + build_glorot_initializer_from_pbuf); + factory.register_builder("HeNormalInitializer", + build_he_initializer_from_pbuf); + factory.register_builder("HeUniformInitializer", + build_he_initializer_from_pbuf); + factory.register_builder("LeCunNormalInitializer", + build_lecun_initializer_from_pbuf); + factory.register_builder("LeCunUniformInitializer", + build_lecun_initializer_from_pbuf); } // Manage a global factory @@ -110,7 +124,11 @@ std::unique_ptr construct_weights( } // Set weights initializer and optimizer - auto init = construct_initializer(proto_weights); + std::unique_ptr init = + (proto_weights.has_initializer() + ? construct_initializer(proto_weights) + : nullptr); + const lbann_data::Optimizer& opt_msg = (proto_weights.has_optimizer() ? proto_weights.optimizer() diff --git a/src/proto/weights.proto b/src/proto/weights.proto index bd09244e825..a2d3b804322 100644 --- a/src/proto/weights.proto +++ b/src/proto/weights.proto @@ -38,6 +38,7 @@ message Weights { message Initializer { oneof initializer_type { + LayerDefault layer_default = 1; ConstantInitializer constant_initializer = 20; ValueInitializer value_initializer = 21; UniformInitializer uniform_initializer = 22; @@ -51,6 +52,8 @@ message Initializer { } // Weight initializers + message LayerDefault {} + message ConstantInitializer { double value = 1; } From 82e12f11c210cc4428f3e7254dac0ed0e067cb62 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Tue, 13 Aug 2019 10:40:27 -0700 Subject: [PATCH 232/634] pin conduit version to v0.4.0 --- superbuild/conduit/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbuild/conduit/CMakeLists.txt b/superbuild/conduit/CMakeLists.txt index 61128d47e3e..0c01bf48009 100644 --- a/superbuild/conduit/CMakeLists.txt +++ b/superbuild/conduit/CMakeLists.txt @@ -16,7 +16,7 @@ else () CACHE STRING "The URL from which to clone CONDUIT") endif () -set(CONDUIT_TAG "master" +set(CONDUIT_TAG "v0.4.0" CACHE STRING "The git tag to checkout for CONDUIT") set(CONDUIT_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" From f39e1b4f8d6596137ca816b448dc74d12ae66611 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 13 Aug 2019 11:20:11 -0700 Subject: [PATCH 233/634] Check gradient callback can be applied at any execution mode (#1162) * Check gradient callback can be applied at any execution mode. * Updating gradient checking unit tests to avoid changing behavior. * Review suggestions from @ndryden. --- include/lbann/callbacks/check_gradients.hpp | 25 ++++-- .../model_channelwise_mean.prototext | 1 + .../tests/layer_tests/model_clamp.prototext | 1 + .../layer_tests/model_covariance.prototext | 1 + .../tests/layer_tests/model_elu.prototext | 1 + .../layer_tests/model_identity.prototext | 1 + .../tests/layer_tests/model_l1_norm.prototext | 1 + .../layer_tests/model_l2_norm2.prototext | 1 + .../layer_tests/model_leaky_relu.prototext | 1 + .../layer_tests/model_log_sigmoid.prototext | 1 + .../layer_tests/model_log_softmax.prototext | 1 + .../model_mean_absolute_error.prototext | 1 + .../tests/layer_tests/model_relu.prototext | 1 + .../tests/layer_tests/model_selu.prototext | 1 + .../tests/layer_tests/model_sigmoid.prototext | 1 + .../tests/layer_tests/model_softmax.prototext | 1 + .../layer_tests/model_softplus.prototext | 1 + .../layer_tests/model_softsign.prototext | 1 + .../model_squared_difference.prototext | 1 + .../layer_tests/model_tessellate.prototext | 1 + .../layer_tests/model_variance.prototext | 1 + .../tests/model_mnist_conv_graph.prototext | 1 + .../model_mnist_ridge_regression.prototext | 1 + .../model_mnist_softmax_classifier.prototext | 1 + src/callbacks/check_gradients.cpp | 82 ++++++++++--------- src/proto/callbacks.proto | 1 + 26 files changed, 87 insertions(+), 44 deletions(-) diff --git a/include/lbann/callbacks/check_gradients.hpp b/include/lbann/callbacks/check_gradients.hpp index 0cbe4f8938e..d56cfd39436 100644 --- a/include/lbann/callbacks/check_gradients.hpp +++ b/include/lbann/callbacks/check_gradients.hpp @@ -36,9 +36,9 @@ namespace callback { /** @brief Gradient checking callback. * - * Gradient checking is performed at the end of the test phase. Using - * a fourth-order finite difference scheme, a numerical partial - * derivative is computed for every weight parameter. If the + * Gradient checking is performed at the end of each execution mode + * phase. Using a fourth-order finite difference scheme, a numerical + * partial derivative is computed for every weight parameter. If the * numerical derivative differs signifcantly from the analytical * derivative computed during backprop, the gradient check has * failed. @@ -47,6 +47,9 @@ class check_gradients : public callback_base { public: /** + * @param modes Execution modes with gradient checks. If + * none are provided, gradient checking is + * performed for every execution mode. 
* @param step_size Step size for numerical * differentiation (with a step size of * zero, the step size is estimated to @@ -56,17 +59,22 @@ class check_gradients : public callback_base { * @param error_on_failure Whether to throw an exception for * large gradient errors. */ - check_gradients(DataType step_size = DataType(0), - bool verbose = false, - bool error_on_failure = false); + check_gradients(std::set modes = {}, + DataType step_size = DataType(0), + bool verbose = false, + bool error_on_failure = false); check_gradients* copy() const override { return new check_gradients(*this); } - void on_test_end(model *m) override; std::string name() const override { return "check gradients"; } + void on_train_end(model *m) override { do_check_gradients(*m); } + void on_validation_end(model *m) override { do_check_gradients(*m); } + void on_test_end(model *m) override { do_check_gradients(*m); } private: + /** Execution modes with gradient checks. */ + std::set m_modes; /** Step size for numerical differentiation. */ DataType m_step_size; /** Whether to print results for each parameter. */ @@ -74,6 +82,9 @@ class check_gradients : public callback_base { /** Whether to throw an exception for large gradient errors. */ bool m_error_on_failure; + /** Does nothing if current execution mode is not in m_modes. */ + void do_check_gradients(model& m) const; + }; // Builder function diff --git a/model_zoo/tests/layer_tests/model_channelwise_mean.prototext b/model_zoo/tests/layer_tests/model_channelwise_mean.prototext index 60ca2691a49..5d73378bfbb 100644 --- a/model_zoo/tests/layer_tests/model_channelwise_mean.prototext +++ b/model_zoo/tests/layer_tests/model_channelwise_mean.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_clamp.prototext b/model_zoo/tests/layer_tests/model_clamp.prototext index afac7aba8f6..96b68c24a19 100644 --- a/model_zoo/tests/layer_tests/model_clamp.prototext +++ b/model_zoo/tests/layer_tests/model_clamp.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_covariance.prototext b/model_zoo/tests/layer_tests/model_covariance.prototext index e92580370b0..c081c8f9261 100644 --- a/model_zoo/tests/layer_tests/model_covariance.prototext +++ b/model_zoo/tests/layer_tests/model_covariance.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_elu.prototext b/model_zoo/tests/layer_tests/model_elu.prototext index aa03e13d47a..ca38a049fea 100644 --- a/model_zoo/tests/layer_tests/model_elu.prototext +++ b/model_zoo/tests/layer_tests/model_elu.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_identity.prototext b/model_zoo/tests/layer_tests/model_identity.prototext index 89d153d7feb..1ee188c4e92 100644 --- a/model_zoo/tests/layer_tests/model_identity.prototext +++ b/model_zoo/tests/layer_tests/model_identity.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_l1_norm.prototext b/model_zoo/tests/layer_tests/model_l1_norm.prototext index b40f293b207..510e3510b99 
100644 --- a/model_zoo/tests/layer_tests/model_l1_norm.prototext +++ b/model_zoo/tests/layer_tests/model_l1_norm.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_l2_norm2.prototext b/model_zoo/tests/layer_tests/model_l2_norm2.prototext index 7887c860609..623f374f02f 100644 --- a/model_zoo/tests/layer_tests/model_l2_norm2.prototext +++ b/model_zoo/tests/layer_tests/model_l2_norm2.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_leaky_relu.prototext b/model_zoo/tests/layer_tests/model_leaky_relu.prototext index c07342ecff4..e1641781ca9 100644 --- a/model_zoo/tests/layer_tests/model_leaky_relu.prototext +++ b/model_zoo/tests/layer_tests/model_leaky_relu.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_log_sigmoid.prototext b/model_zoo/tests/layer_tests/model_log_sigmoid.prototext index fa94c64873b..b53d30b0029 100644 --- a/model_zoo/tests/layer_tests/model_log_sigmoid.prototext +++ b/model_zoo/tests/layer_tests/model_log_sigmoid.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_log_softmax.prototext b/model_zoo/tests/layer_tests/model_log_softmax.prototext index e19aab9c01e..d9ae9fbc863 100644 --- a/model_zoo/tests/layer_tests/model_log_softmax.prototext +++ b/model_zoo/tests/layer_tests/model_log_softmax.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext b/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext index 27ac1f4855f..78f637afb34 100644 --- a/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext +++ b/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_relu.prototext b/model_zoo/tests/layer_tests/model_relu.prototext index ba8ce807f98..8e048caa515 100644 --- a/model_zoo/tests/layer_tests/model_relu.prototext +++ b/model_zoo/tests/layer_tests/model_relu.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_selu.prototext b/model_zoo/tests/layer_tests/model_selu.prototext index c4b23b221b4..488aa6cb0d5 100644 --- a/model_zoo/tests/layer_tests/model_selu.prototext +++ b/model_zoo/tests/layer_tests/model_selu.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_sigmoid.prototext b/model_zoo/tests/layer_tests/model_sigmoid.prototext index 055d094885c..5eda4e5e5e1 100644 --- a/model_zoo/tests/layer_tests/model_sigmoid.prototext +++ b/model_zoo/tests/layer_tests/model_sigmoid.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_softmax.prototext 
b/model_zoo/tests/layer_tests/model_softmax.prototext index c20d7cc2a2d..4171f1c93bd 100644 --- a/model_zoo/tests/layer_tests/model_softmax.prototext +++ b/model_zoo/tests/layer_tests/model_softmax.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_softplus.prototext b/model_zoo/tests/layer_tests/model_softplus.prototext index 19eb004df3e..09622663b13 100644 --- a/model_zoo/tests/layer_tests/model_softplus.prototext +++ b/model_zoo/tests/layer_tests/model_softplus.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_softsign.prototext b/model_zoo/tests/layer_tests/model_softsign.prototext index 4d14d92e7b5..cdc3adc9ade 100644 --- a/model_zoo/tests/layer_tests/model_softsign.prototext +++ b/model_zoo/tests/layer_tests/model_softsign.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_squared_difference.prototext b/model_zoo/tests/layer_tests/model_squared_difference.prototext index 87846bd21c1..ea81d630ee2 100644 --- a/model_zoo/tests/layer_tests/model_squared_difference.prototext +++ b/model_zoo/tests/layer_tests/model_squared_difference.prototext @@ -34,6 +34,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_tessellate.prototext b/model_zoo/tests/layer_tests/model_tessellate.prototext index a7fb5c1dc2f..0ca5493bb6b 100644 --- a/model_zoo/tests/layer_tests/model_tessellate.prototext +++ b/model_zoo/tests/layer_tests/model_tessellate.prototext @@ -34,6 +34,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/layer_tests/model_variance.prototext b/model_zoo/tests/layer_tests/model_variance.prototext index 096ef81a182..d1d6c8b8329 100644 --- a/model_zoo/tests/layer_tests/model_variance.prototext +++ b/model_zoo/tests/layer_tests/model_variance.prototext @@ -37,6 +37,7 @@ model { } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/model_mnist_conv_graph.prototext b/model_zoo/tests/model_mnist_conv_graph.prototext index 23552b26a6e..28fa7809551 100644 --- a/model_zoo/tests/model_mnist_conv_graph.prototext +++ b/model_zoo/tests/model_mnist_conv_graph.prototext @@ -20,6 +20,7 @@ model { callback { timer {} } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/model_mnist_ridge_regression.prototext b/model_zoo/tests/model_mnist_ridge_regression.prototext index 173ea38fc71..1b358e90faf 100644 --- a/model_zoo/tests/model_mnist_ridge_regression.prototext +++ b/model_zoo/tests/model_mnist_ridge_regression.prototext @@ -30,6 +30,7 @@ model { callback { timer {} } callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/model_zoo/tests/model_mnist_softmax_classifier.prototext b/model_zoo/tests/model_mnist_softmax_classifier.prototext index 8bbd7fa5bc5..ae75c319f7c 100644 --- a/model_zoo/tests/model_mnist_softmax_classifier.prototext +++ b/model_zoo/tests/model_mnist_softmax_classifier.prototext @@ -28,6 +28,7 @@ model { callback { timer {} } 
callback { check_gradients { + execution_modes: "test" verbose: false error_on_failure: true } diff --git a/src/callbacks/check_gradients.cpp b/src/callbacks/check_gradients.cpp index 385356ad953..c3de1bc48e1 100644 --- a/src/callbacks/check_gradients.cpp +++ b/src/callbacks/check_gradients.cpp @@ -27,6 +27,7 @@ #include "lbann/callbacks/check_gradients.hpp" #include "lbann/data_readers/data_reader.hpp" #include "lbann/layers/io/input/generic_input_layer.hpp" +#include "lbann/proto/proto_common.hpp" #include "lbann/utils/memory.hpp" #include @@ -64,40 +65,44 @@ DataType compute_objective_function(model& m) { } // namespace -check_gradients - ::check_gradients(DataType step_size, - bool verbose, - bool error_on_failure) - : m_step_size(step_size), +check_gradients::check_gradients(std::set modes, + DataType step_size, + bool verbose, + bool error_on_failure) + : m_modes(std::move(modes)), + m_step_size(step_size), m_verbose(verbose), m_error_on_failure(error_on_failure) {} -void check_gradients::on_test_end(model *m) { +void check_gradients::do_check_gradients(model& m) const { // Get objects from model - lbann_comm *comm = m->get_comm(); - auto mode = m->get_execution_mode(); - const auto& layers = m->get_layers(); + auto& comm = *m.get_comm(); + const auto mode = m.get_execution_mode(); + const auto& layers = m.get_layers(); + + // Return immediately if gradient check isn't currently needed + if (!m_modes.empty() && m_modes.count(mode) == 0) { return; } // Reset statistics and gradients - m->get_objective_function()->reset_statistics(mode); - for (auto&& met : m->get_metrics()) { + m.get_objective_function()->reset_statistics(mode); + for (auto&& met : m.get_metrics()) { met->reset_statistics(mode); } - for (auto&& w : m->get_weights()) { + for (auto&& w : m.get_weights()) { auto&& opt = w->get_optimizer(); if (opt != nullptr) { opt->clear_gradient(); } } // Load data in input layers - for (auto&& l : m->get_layers()) { + for (auto&& l : m.get_layers()) { if (dynamic_cast(l) != nullptr) { l->forward_prop(); } } // Compute objective function - const DataType objective = compute_objective_function(*m); + const DataType objective = compute_objective_function(m); // Choose finite difference step // Note: Consider a central difference scheme: @@ -110,23 +115,22 @@ void check_gradients::on_test_end(model *m) { // If step size is not specified, then we choose h so that // E_fl <= sqrt(epsilon) const DataType epsilon = std::pow(std::numeric_limits::epsilon(), 0.9); - DataType step_size = m_step_size; - if (m_step_size <= DataType(0)) { - step_size = std::fabs(objective) * std::sqrt(epsilon); - } + const DataType step_size = (m_step_size > DataType{0} ? 
+ m_step_size : + std::fabs(objective) * std::sqrt(epsilon)); DataType expected_error = (epsilon * objective / step_size + std::pow(step_size, 4) / 18); expected_error = std::pow(expected_error, 0.9); // Compute gradients - m->get_objective_function()->differentiate(); - m->get_objective_function()->compute_weight_regularization(); - for (int l = layers.size() - 1; l > 0; --l) { - layers[l]->back_prop(); + m.get_objective_function()->differentiate(); + m.get_objective_function()->compute_weight_regularization(); + for (El::Int i = layers.size()-1; i >= 0; --i) { + layers[i]->back_prop(); } // Print objective function value - if (comm->am_world_master()) { + if (comm.am_world_master()) { std::cout << "----------------------------------------------------------------\n" << "Gradient checking...\n" << " Objective function value = " << objective << "\n" @@ -134,11 +138,11 @@ void check_gradients::on_test_end(model *m) { << " Expected gradient error = " << expected_error << "\n"; } - for (weights *w : m->get_weights()) { + for (weights *w : m.get_weights()) { if (w->get_optimizer() == nullptr) { continue; } - if (comm->am_world_master()) { + if (comm.am_world_master()) { std::cout << "Checking " << w->get_name() << std::endl; } @@ -165,13 +169,13 @@ void check_gradients::on_test_end(model *m) { // Note: matrix entry is reset after computing objective // function values w->set_value(initial_weight + 2 * step_size, row, col); - const DataType f_2h = compute_objective_function(*m); + const DataType f_2h = compute_objective_function(m); w->set_value(initial_weight + step_size, row, col); - const DataType f_h = compute_objective_function(*m); + const DataType f_h = compute_objective_function(m); w->set_value(initial_weight - step_size, row, col); - const DataType f_nh = compute_objective_function(*m); + const DataType f_nh = compute_objective_function(m); w->set_value(initial_weight - 2 * step_size, row, col); - const DataType f_n2h = compute_objective_function(*m); + const DataType f_n2h = compute_objective_function(m); w->set_value(initial_weight, row, col); // Compute relative error in gradient. 
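For reference, the values f_2h, f_h, f_nh, and f_n2h computed above feed the standard fourth-order central-difference stencil, and the default step size mirrors the epsilon chosen at the top of the function. A minimal standalone sketch (illustrative only, not LBANN code; f stands for any scalar objective):

#include <cmath>
#include <functional>
#include <limits>

// Fourth-order central difference: truncation error scales as h^4.
double numerical_derivative(const std::function<double(double)>& f,
                            double x, double h) {
  return (-f(x + 2*h) + 8*f(x + h) - 8*f(x - h) + f(x - 2*h)) / (12*h);
}

// Default step size when none is specified, following the callback:
// machine epsilon is raised to 0.9 to stay conservative about
// floating-point error in the objective.
double default_step_size(double objective) {
  const double epsilon = std::pow(std::numeric_limits<double>::epsilon(), 0.9);
  return std::fabs(objective) * std::sqrt(epsilon);
}

The higher-order stencil lets the check use a relatively large h, keeping floating-point cancellation error small while the h^4 truncation term (the step_size^4 / 18 contribution above) stays below the expected gradient error that the callback prints.
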
@@ -198,7 +202,8 @@ void check_gradients::on_test_end(model *m) { << " Error = " << error << std::endl << " Relative error = " << relative_error << std::endl; if (m_error_on_failure) { - throw lbann_exception("callback_check_gradients: found large error in gradient"); + LBANN_ERROR("gradient checking found large difference between " + "analytical and numerical gradients"); } } else if (m_verbose) { std::cout << " " << w->get_name() << ", " @@ -215,21 +220,21 @@ void check_gradients::on_test_end(model *m) { } } - if (comm->am_world_master()) { + if (comm.am_world_master()) { std::cout << "----------------------------------------------------------------\n"; } // Clean up /// @todo tym: I'm not sure if data readers are properly reset - for (auto&& l : m->get_layers()) { + for (auto&& l : m.get_layers()) { auto&& input = dynamic_cast(l); if (input != nullptr) { auto&& reader = input->get_data_reader(mode); reader->set_initial_position(); } } - m->get_objective_function()->reset_statistics(mode); - for (auto&& met : m->get_metrics()) { + m.get_objective_function()->reset_statistics(mode); + for (auto&& met : m.get_metrics()) { met->reset_statistics(mode); } @@ -241,9 +246,12 @@ build_check_gradients_callback_from_pbuf( const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.step_size(), - params.verbose(), - params.error_on_failure()); + const auto& modes = + parse_set(params.execution_modes()); + return make_unique(modes, + params.step_size(), + params.verbose(), + params.error_on_failure()); } } // namespace callback diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index eda029bff41..f9909233908 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -230,6 +230,7 @@ message Callback { double step_size = 1; bool verbose = 2; bool error_on_failure = 3; // Throw error if gradient check fails + string execution_modes = 4; // Default: all modes } message CallbackCheckMetric { From b887653d78809acdd5526a5f32b958bec75d5820 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Wed, 14 Aug 2019 10:58:50 -0700 Subject: [PATCH 234/634] remove LayerDefault message as its semantically not quite right --- src/proto/factories/weights_factory.cpp | 2 -- src/proto/weights.proto | 3 --- 2 files changed, 5 deletions(-) diff --git a/src/proto/factories/weights_factory.cpp b/src/proto/factories/weights_factory.cpp index 80e86952177..b6b6dd1fcf7 100644 --- a/src/proto/factories/weights_factory.cpp +++ b/src/proto/factories/weights_factory.cpp @@ -51,8 +51,6 @@ using factory_type = lbann::generic_factory< void register_default_builders(factory_type& factory) { - factory.register_builder("LayerDefault", - [](MessageT const&) { return nullptr; }); factory.register_builder("ConstantInitializer", build_constant_initializer_from_pbuf); factory.register_builder("ValueInitializer", diff --git a/src/proto/weights.proto b/src/proto/weights.proto index a2d3b804322..bd09244e825 100644 --- a/src/proto/weights.proto +++ b/src/proto/weights.proto @@ -38,7 +38,6 @@ message Weights { message Initializer { oneof initializer_type { - LayerDefault layer_default = 1; ConstantInitializer constant_initializer = 20; ValueInitializer value_initializer = 21; UniformInitializer uniform_initializer = 22; @@ -52,8 +51,6 @@ message Initializer { } // Weight initializers - message LayerDefault {} - message ConstantInitializer { double value = 1; } From a512bf3cb20eec5f5a598a1d5f8571c0cc0b63aa Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 14 Aug 2019 14:43:30 -0700 Subject: [PATCH 235/634] Entry-wise batchnorm (#1167) * CPU implementation of entry-wise batchnorm. Compiles, but have not tested whether it runs. * Fixing error in CPU entry-wise batchnorm backprop. Gradient checking now passes. * GPU implementation of entry-wise batchnorm. Not sure if this compiles. * Debugging GPU entry-wise batchnorm. Compiles, runs, and passes gradient checking. * Documentation suggestions from @ndryden. --- .../lbann/layers/regularizers/CMakeLists.txt | 1 + .../entrywise_batch_normalization.hpp | 222 +++++++ include/lbann/lbann.hpp | 1 + src/layers/regularizers/CMakeLists.txt | 2 + .../entrywise_batch_normalization.cpp | 425 ++++++++++++ .../entrywise_batch_normalization.cu | 613 ++++++++++++++++++ src/proto/factories/layer_factory.cpp | 5 + src/proto/layers.proto | 6 + 8 files changed, 1275 insertions(+) create mode 100644 include/lbann/layers/regularizers/entrywise_batch_normalization.hpp create mode 100644 src/layers/regularizers/entrywise_batch_normalization.cpp create mode 100644 src/layers/regularizers/entrywise_batch_normalization.cu diff --git a/include/lbann/layers/regularizers/CMakeLists.txt b/include/lbann/layers/regularizers/CMakeLists.txt index cd27df13645..726676d7426 100644 --- a/include/lbann/layers/regularizers/CMakeLists.txt +++ b/include/lbann/layers/regularizers/CMakeLists.txt @@ -2,6 +2,7 @@ set_full_path(THIS_DIR_HEADERS batch_normalization.hpp dropout.hpp + entrywise_batch_normalization.hpp local_response_normalization.hpp regularizer.hpp selu_dropout.hpp diff --git a/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp new file mode 100644 index 00000000000..81536a1f01d --- /dev/null +++ b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp @@ -0,0 +1,222 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. 
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED +#define LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/memory.hpp" + +namespace lbann { + +/** @brief Entry-wise batch normalization. + * + * Each input entry is normalized across the mini-batch to have zero + * mean and unit standard deviation. This uses the standard approach + * of maintaining the running mean and standard deviation (with + * exponential decay) for use at test time. See: + * + * Sergey Ioffe and Christian Szegedy. "Batch Normalization: + * Accelerating Deep Network Training by Reducing Internal Covariate + * Shift." In International Conference on Machine Learning, + * pp. 448-456. 2015. + */ +template <data_layout Layout, El::Device Device> +class entrywise_batch_normalization_layer : public Layer { +public: + + entrywise_batch_normalization_layer(lbann_comm* comm, + DataType decay=0.9, + DataType epsilon=1e-5) + : Layer(comm), m_decay(decay), m_epsilon(epsilon) {} + + entrywise_batch_normalization_layer(const entrywise_batch_normalization_layer& other) + : Layer(other), + m_decay(other.m_decay), + m_epsilon(other.m_epsilon), + m_batch_statistics(other.m_batch_statistics ? + other.m_batch_statistics->Copy() : + nullptr), + m_batch_statistics_gradient(other.m_batch_statistics_gradient ? + other.m_batch_statistics_gradient->Copy() : + nullptr) {} + + entrywise_batch_normalization_layer& operator=(const entrywise_batch_normalization_layer& other) { + Layer::operator=(other); + m_decay = other.m_decay; + m_epsilon = other.m_epsilon; + m_batch_statistics.reset(other.m_batch_statistics ? + other.m_batch_statistics->Copy() : + nullptr); + m_batch_statistics_gradient.reset(other.m_batch_statistics_gradient ? 
+ other.m_batch_statistics_gradient->Copy() : + nullptr); + return *this; + } + + entrywise_batch_normalization_layer* copy() const override { return new entrywise_batch_normalization_layer(*this); } + std::string get_type() const override { return "entry-wise batch normalization"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + + description get_description() const override { + auto desc = Layer::get_description(); + desc.add("Decay", m_decay); + desc.add("Epsilon", m_epsilon); + return desc; + } + +protected: + + void setup_matrices(const El::Grid& grid) override { + Layer::setup_matrices(grid); + auto dist = get_prev_activations().DistData(); + dist.rowDist = El::STAR; + m_batch_statistics.reset(AbsDistMat::Instantiate(dist)); + m_batch_statistics_gradient.reset(AbsDistMat::Instantiate(dist)); + } + + void setup_data() override { + Layer::setup_data(); + + // Initialize output dimensions + set_output_dims(get_input_dims()); + const auto output_dims = get_output_dims(); + const auto output_size = get_output_size(); + + // Initialize default weights if none are provided + if (this->m_weights.size() > 2) { + std::stringstream err; + err << "attempted to setup layer \"" << m_name << "\" " + << "with an invalid number of weights " + << "(found " << this->m_weights.size() << ", expected 2)"; + LBANN_ERROR(err.str()); + } + this->m_weights.resize(2, nullptr); + if (this->m_weights[0] == nullptr) { + this->m_weights[0] = new weights(get_comm()); + this->m_weights[0]->set_name(get_name() + "_running_mean"); + auto init = make_unique(DataType{0}); + this->m_weights[0]->set_initializer(std::move(init)); + this->m_model->add_weights(this->m_weights[0]); + } + if (this->m_weights[1] == nullptr) { + this->m_weights[1] = new weights(get_comm()); + this->m_weights[1]->set_name(get_name() + "_running_variance"); + auto init = make_unique(DataType{1}); + this->m_weights[1]->set_initializer(std::move(init)); + this->m_model->add_weights(this->m_weights[1]); + } + + // Setup weights + auto dist = get_prev_activations().DistData(); + dist.rowDist = El::STAR; + for (auto* w : this->m_weights) { + w->set_dims(output_dims); + w->set_matrix_distribution(dist); + } + + // Initialize matrices + m_batch_statistics->AlignWith(dist); + m_batch_statistics->Resize(output_size, 2); + m_batch_statistics_gradient->AlignWith(dist); + m_batch_statistics_gradient->Resize(output_size, 2); + + } + + void fp_setup_outputs(El::Int mini_batch_size) override { + Layer::fp_setup_outputs(mini_batch_size); + const auto& input = get_prev_activations(); + const auto input_size = get_input_size(); + + // Make sure batch statistics tensor is aligned with input tensor + m_batch_statistics->Empty(false); + m_batch_statistics->AlignWith(input); + m_batch_statistics->Resize(input_size, 2); + +#if 0 /// @todo See https://github.com/LLNL/lbann/issues/1123 + + // Check that weights tensors is aligned with input tensor + /// @todo Realign tensors if misaligned + bool aligned = true; + try { + const auto& running_mean = m_weights[0]->get_values(); + const auto& running_var = m_weights[1]->get_values(); + aligned = (input.ColAlign() == running_mean.ColAlign() + && input.RowAlign() == running_mean.RowAlign() + && input.ColAlign() == running_var.ColAlign() + && input.RowAlign() == running_var.RowAlign()); + } + catch (const exception& e) { + // An exception is thrown if you try accessing weights values + // before they are initialized. 
We don't care if this case is + // aligned, so it's safe to ignore. + } + if (!aligned) { + std::ostringstream err; + err << this->get_type() << " layer \"" << this->get_name() << "\" " + << "has misaligned input and weights matrices"; + LBANN_ERROR(err.str()); + } + +#endif // 0 + + } + + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { + Layer::bp_setup_gradient_wrt_inputs(mini_batch_size); + m_batch_statistics_gradient->Empty(false); + m_batch_statistics_gradient->AlignWith(get_prev_activations()); + m_batch_statistics_gradient->Resize(get_input_size(), 2); + } + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Decay rate for the running statistics. */ + DataType m_decay; + /** Small number to avoid division by zero. */ + DataType m_epsilon; + + /** @brief Current mini-batch statistics. + * + * These are fused for performance when doing non-local batchnorm. + */ + std::unique_ptr m_batch_statistics; + /** @brief Gradients w.r.t. current mini-batch statistics. + * + * These are fused for performance when doing non-local batchnorm. + */ + std::unique_ptr m_batch_statistics_gradient; + +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index d41fe933c90..d13a3f5ea47 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -95,6 +95,7 @@ #include "lbann/layers/regularizers/dropout.hpp" #include "lbann/layers/regularizers/selu_dropout.hpp" #include "lbann/layers/regularizers/batch_normalization.hpp" +#include "lbann/layers/regularizers/entrywise_batch_normalization.hpp" /// Input layer #include "lbann/layers/io/input/input_layer.hpp" diff --git a/src/layers/regularizers/CMakeLists.txt b/src/layers/regularizers/CMakeLists.txt index a85f2e3d740..f82d39fd12a 100644 --- a/src/layers/regularizers/CMakeLists.txt +++ b/src/layers/regularizers/CMakeLists.txt @@ -1,12 +1,14 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES batch_normalization.cpp + entrywise_batch_normalization.cpp ) if (LBANN_HAS_CUDA) # Add the CUDA source files for this directory set_full_path(THIS_DIR_CU_SOURCES batch_normalization.cu + entrywise_batch_normalization.cu ) endif () diff --git a/src/layers/regularizers/entrywise_batch_normalization.cpp b/src/layers/regularizers/entrywise_batch_normalization.cpp new file mode 100644 index 00000000000..655b940cdc1 --- /dev/null +++ b/src/layers/regularizers/entrywise_batch_normalization.cpp @@ -0,0 +1,425 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/regularizers/entrywise_batch_normalization.hpp" + +namespace lbann { + +namespace { + +// Block size for loops +// Note: x86 cache lines are 64B +constexpr El::Int _bsize = 64 / sizeof(DataType); +constexpr El::Int bsize = _bsize > 1 ? _bsize : 1; + +/** + * mean = sum(x_i) / n + * + * var = ( sum(x_i^2)/n - mean^2 ) * n/(n-1) + */ +void compute_batch_statistics(lbann_comm& comm, + DataType decay, + const AbsDistMat& input, + AbsDistMat& batch_statistics, + AbsDistMat& running_mean, + AbsDistMat& running_var) { + + // Local matrices + const auto& local_input = dynamic_cast(input.LockedMatrix()); + auto& local_batch_statistics = dynamic_cast(batch_statistics.Matrix()); + auto local_batch_mean = El::View(local_batch_statistics, El::ALL, El::IR(0)); + auto local_batch_var = El::View(local_batch_statistics, El::ALL, El::IR(1)); + auto& local_running_mean = dynamic_cast(running_mean.Matrix()); + auto& local_running_var = dynamic_cast(running_var.Matrix()); + + // Dimensions + const El::Int local_height = local_input.Height(); + const El::Int local_width = local_input.Width(); + + // Compute local sums + El::Zero(batch_statistics); + LBANN_OMP_PARALLEL_FOR + for (El::Int row_start = 0; row_start < local_height; row_start += bsize) { + const El::Int row_end = std::min(row_start + bsize, local_height); + const El::Int col_start = 0; + const El::Int col_end = local_width; + for (El::Int col = col_start; col < col_end; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& x = local_input(row, col); + local_batch_mean(row, 0) += x; + local_batch_var(row, 0) += x * x; + } + } + } + + // Accumulate sums between processes + /// @todo Local statistics + /// @todo Arbitrary group sizes + comm.allreduce(batch_statistics, + batch_statistics.RedundantComm(), + El::mpi::SUM); + const size_t statistics_count = input.Width(); + + // Compute mini-batch statistics from sums + if (statistics_count <= 1) { + // local_mean already has correct values + El::Fill(local_batch_var, DataType{1}); + } else { + LBANN_OMP_PARALLEL_FOR + for (El::Int row = 0; row < local_height; ++row) { + auto& mean = local_batch_mean(row, 0); + auto& var = local_batch_var(row, 0); + auto& _running_mean = local_running_mean(row, 0); + auto& _running_var = local_running_var(row, 0); + const auto sum = local_batch_mean(row, 0); + const auto sqsum = local_batch_var(row, 0); + mean = sum / statistics_count; + const auto sqmean = sqsum / statistics_count; + var = (sqmean - mean * mean) * statistics_count / (statistics_count - 1); + _running_mean = decay * _running_mean + (DataType{1} - decay) * mean; + _running_var = decay * _running_var + (DataType{1} - decay) * var; + } + } + +} + +/** + * y_i = (x_i - mean) / sqrt(var + epsilon) + */ +void apply_batchnorm(DataType epsilon, + const CPUMat& local_input, + CPUMat& local_output, + const CPUMat& local_mean, + const CPUMat& local_var) { + const El::Int local_height = local_input.Height(); + const El::Int local_width = local_input.Width(); + 
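// Note: the loop below walks rows in blocks of bsize entries (one 64B + // cache line, per the constant above) so each OpenMP thread works on + // whole cache lines, and the inverse standard deviations are + // precomputed per block to hoist the sqrt out of the column loop. + 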
LBANN_OMP_PARALLEL_FOR + for (El::Int row_start = 0; row_start < local_height; row_start += bsize) { + const El::Int row_end = std::min(row_start + bsize, local_height); + const El::Int col_start = 0; + const El::Int col_end = local_width; + DataType _inv_stdev[bsize]; + for (El::Int row = row_start; row < row_end; ++row) { + const auto& var = local_var(row, 0); + _inv_stdev[row-row_start] = 1 / std::sqrt(var + epsilon); + } + for (El::Int col = col_start; col < col_end; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& mean = local_mean(row, 0); + const auto& inv_stdev = _inv_stdev[row - row_start]; + const auto& x = local_input(row, col); + auto& y = local_output(row, col); + y = (x - mean) * inv_stdev; + } + } + } +} + +void fp_impl(lbann_comm& comm, + DataType decay, + DataType epsilon, + bool is_training, + const AbsDistMat& input, + AbsDistMat& output, + AbsDistMat& batch_statistics, + AbsDistMat& running_mean, + AbsDistMat& running_var) { + + // Local matrices + const auto& local_input = dynamic_cast(input.LockedMatrix()); + auto& local_output = dynamic_cast(output.Matrix()); + + // Batchnorm has different behavior for training and inference + if (is_training) { + + // For training, normalize with batch statistics + compute_batch_statistics(comm, + decay, + input, + batch_statistics, + running_mean, + running_var); + const auto& local_batch_statistics + = dynamic_cast(batch_statistics.LockedMatrix()); + const auto local_batch_mean = El::LockedView(local_batch_statistics, + El::ALL, El::IR(0)); + const auto local_batch_var = El::LockedView(local_batch_statistics, + El::ALL, El::IR(1)); + apply_batchnorm(epsilon, + local_input, + local_output, + local_batch_mean, + local_batch_var); + + } + else { + + // For inference, normalize with running statistics + const auto& local_running_mean = dynamic_cast(running_mean.LockedMatrix()); + const auto& local_running_var = dynamic_cast(running_var.LockedMatrix()); + apply_batchnorm(epsilon, + local_input, + local_output, + local_running_mean, + local_running_var); + + } + +} + +/** @brief Backprop for training. + * + * Assumes forward prop uses mini-batch statistics. In other words, + * statistics are dependent on input. + */ +void bp_training_impl(lbann_comm& comm, + DataType epsilon, + const AbsDistMat& input, + const AbsDistMat& gradient_wrt_output, + AbsDistMat& gradient_wrt_input, + const AbsDistMat& statistics, + AbsDistMat& gradient_wrt_statistics) { + + // Local matrices + const auto& local_input = dynamic_cast(input.LockedMatrix()); + const auto& local_gradient_wrt_output = dynamic_cast(gradient_wrt_output.LockedMatrix()); + auto& local_gradient_wrt_input = dynamic_cast(gradient_wrt_input.Matrix()); + const auto& local_statistics = dynamic_cast(statistics.LockedMatrix()); + const auto local_mean = El::LockedView(local_statistics, El::ALL, El::IR(0)); + const auto local_var = El::LockedView(local_statistics, El::ALL, El::IR(1)); + auto& local_gradient_wrt_statistics = dynamic_cast(gradient_wrt_statistics.Matrix()); + auto local_gradient_wrt_mean = El::View(local_gradient_wrt_statistics, El::ALL, El::IR(0)); + auto local_gradient_wrt_var = El::View(local_gradient_wrt_statistics, El::ALL, El::IR(1)); + + // Dimensions + const El::Int local_height = local_gradient_wrt_input.Height(); + const El::Int local_width = local_gradient_wrt_input.Width(); + + // Count for statistics + // Note: Output is constant if statistics count is <=1, so error + // signal is zero. 
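+ // (With a single sample, compute_batch_statistics leaves the mean equal + // to the lone value and sets var = 1, so the normalized output is + // identically zero regardless of the input.)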
+ /// @todo Local statistics + /// @todo Arbitrary group sizes + const size_t statistics_count = input.Width(); + if (statistics_count <= 1) { + El::Zero(local_gradient_wrt_input); + return; + } + + // Compute local gradient w.r.t. batch statistics + // dL/dmean = - sum(dL/dy_i) / sqrt(var+epsilon) + // dL/dvar = - sum(dL/dy_i * (x_i-mean)) * (var+epsilon)^(-3/2) / 2 + El::Zero(gradient_wrt_statistics); + LBANN_OMP_PARALLEL_FOR + for (El::Int row_start = 0; row_start < local_height; row_start += bsize) { + const El::Int row_end = std::min(row_start + bsize, local_height); + const El::Int col_start = 0; + const El::Int col_end = local_width; + DataType _inv_stdev[bsize]; + for (El::Int row = row_start; row < row_end; ++row) { + const auto& var = local_var(row, 0); + _inv_stdev[row-row_start] = 1 / std::sqrt(var + epsilon); + } + for (El::Int col = col_start; col < col_end; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& mean = local_mean(row, 0); + const auto& inv_stdev = _inv_stdev[row - row_start]; + const auto& x = local_input(row, col); + const auto& dy = local_gradient_wrt_output(row, col); + auto& dmean = local_gradient_wrt_mean(row, 0); + auto& dvar = local_gradient_wrt_var(row, 0); + dmean += - dy * inv_stdev; + dvar += - dy * (x - mean) * inv_stdev*inv_stdev*inv_stdev / 2; + } + } + } + + // Accumulate gradient w.r.t. statistics across processes + /// @todo Local statistics + /// @todo Arbitrary group sizes + comm.allreduce(gradient_wrt_statistics, + gradient_wrt_statistics.RedundantComm(), + El::mpi::SUM); + + // Compute gradient w.r.t. input + // dL/dx_i = ( dL/dy_i / sqrt(var+epsilon) + // + dL/dmean / n + // + dL/dvar * (x_i - mean) * 2/(n-1) ) + const DataType inv_stats_count = DataType{1} / statistics_count; + const DataType inv_stats_countm1 = DataType{1} / (statistics_count - 1); + LBANN_OMP_PARALLEL_FOR + for (El::Int row_start = 0; row_start < local_height; row_start += bsize) { + const El::Int row_end = std::min(row_start + bsize, local_height); + const El::Int col_start = 0; + const El::Int col_end = local_width; + DataType _inv_stdev[bsize]; + for (El::Int row = row_start; row < row_end; ++row) { + const auto& var = local_var(row, 0); + _inv_stdev[row-row_start] = 1 / std::sqrt(var + epsilon); + } + for (El::Int col = col_start; col < col_end; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& mean = local_mean(row, 0); + const auto& inv_stdev = _inv_stdev[row - row_start]; + const auto& x = local_input(row, col); + const auto& dy = local_gradient_wrt_output(row, col); + auto& dx = local_gradient_wrt_input(row, col); + auto& dmean = local_gradient_wrt_mean(row, 0); + auto& dvar = local_gradient_wrt_var(row, 0); + dx = (dy * inv_stdev + + dmean * inv_stats_count + + dvar * (x - mean) * 2 * inv_stats_countm1); + } + } + } + +} + +/** @brief Backprop for inference. + * + * Computes gradient w.r.t. input when the model is performing + * inference, e.g. in validation or testing mode. In this case, + * forward prop uses running statistics, which are independent of + * input. 
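+ * The chain rule therefore reduces to dL/dx_i = dL/dy_i / sqrt(var+epsilon), + * with no terms flowing through the batch statistics.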
+ */ +void bp_inference_impl(DataType epsilon, + const AbsDistMat& gradient_wrt_output, + AbsDistMat& gradient_wrt_input, + const AbsDistMat& running_var) { + + // Local matrices + const auto& local_gradient_wrt_output = dynamic_cast(gradient_wrt_output.LockedMatrix()); + auto& local_gradient_wrt_input = dynamic_cast(gradient_wrt_input.Matrix()); + const auto& local_running_var = dynamic_cast(running_var.LockedMatrix()); + + // Compute gradient w.r.t. input + // dL/dx_i = dL/dy_i / sqrt(var+epsilon) + const El::Int local_height = local_gradient_wrt_input.Height(); + const El::Int local_width = local_gradient_wrt_input.Width(); + LBANN_OMP_PARALLEL_FOR + for (El::Int row_start = 0; row_start < local_height; row_start += bsize) { + const El::Int row_end = std::min(row_start + bsize, local_height); + const El::Int col_start = 0; + const El::Int col_end = local_width; + DataType _inv_stdev[bsize]; + for (El::Int row = row_start; row < row_end; ++row) { + const auto& var = local_running_var(row, 0); + _inv_stdev[row-row_start] = 1 / std::sqrt(var + epsilon); + } + for (El::Int col = col_start; col < col_end; ++col) { + for (El::Int row = row_start; row < row_end; ++row) { + const auto& inv_stdev = _inv_stdev[row - row_start]; + const auto& dy = local_gradient_wrt_output(row, col); + auto& dx = local_gradient_wrt_input(row, col); + dx = dy * inv_stdev; + } + } + } + +} + +void bp_impl(lbann_comm& comm, + DataType epsilon, + bool is_training, + const AbsDistMat& input, + const AbsDistMat& gradient_wrt_output, + AbsDistMat& gradient_wrt_input, + const AbsDistMat& batch_statistics, + AbsDistMat& gradient_wrt_batch_statistics, + const AbsDistMat& running_var) { + + // Batchnorm has different behavior for training and inference + if (is_training) { + bp_training_impl(comm, + epsilon, + input, + gradient_wrt_output, + gradient_wrt_input, + batch_statistics, + gradient_wrt_batch_statistics); + } + else { + bp_inference_impl(epsilon, + gradient_wrt_output, + gradient_wrt_input, + running_var); + } + +} + +} // namespace + +// Template instantiation +template <> +void entrywise_batch_normalization_layer::fp_compute() { + fp_impl(*get_comm(), + m_decay, + m_epsilon, + m_model->get_execution_mode() == execution_mode::training, + get_prev_activations(), + get_activations(), + *m_batch_statistics, + m_weights[0]->get_values(), + m_weights[1]->get_values()); +} +template <> +void entrywise_batch_normalization_layer::fp_compute() { + fp_impl(*get_comm(), + m_decay, + m_epsilon, + m_model->get_execution_mode() == execution_mode::training, + get_prev_activations(), + get_activations(), + *m_batch_statistics, + m_weights[0]->get_values(), + m_weights[1]->get_values()); +} +template <> +void entrywise_batch_normalization_layer::bp_compute() { + bp_impl(*get_comm(), + m_epsilon, + m_model->get_execution_mode() == execution_mode::training, + get_prev_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_batch_statistics, + *m_batch_statistics_gradient, + m_weights[1]->get_values()); +} +template <> +void entrywise_batch_normalization_layer::bp_compute() { + bp_impl(*get_comm(), + m_epsilon, + m_model->get_execution_mode() == execution_mode::training, + get_prev_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_batch_statistics, + *m_batch_statistics_gradient, + m_weights[1]->get_values()); +} + +} // namespace lbann diff --git a/src/layers/regularizers/entrywise_batch_normalization.cu b/src/layers/regularizers/entrywise_batch_normalization.cu new file mode 100644 index 
00000000000..d4109a6f87f --- /dev/null +++ b/src/layers/regularizers/entrywise_batch_normalization.cu @@ -0,0 +1,613 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/layers/regularizers/entrywise_batch_normalization.hpp" +#include "lbann/utils/cuda.hpp" + +namespace lbann { + +namespace { + +/** + * On input, sums and sqsums are assumed to be filled with zeros. + * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (height / bsize) x 1 x 1 + */ +__global__ void row_sums_kernel(size_t height, + size_t width, + const DataType* __restrict__ vals, + size_t vals_ldim, + DataType* __restrict__ sums, + DataType* __restrict__ sqsums) { + const size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t nthreads = blockDim.x * gridDim.x; + for (size_t row = gid; row < height; row += nthreads) { + auto& sum = sums[row]; + auto& sqsum = sqsums[row]; + for (size_t col = 0; col < width; ++col) { + const auto& x = vals[row + col * vals_ldim]; + sum += x; + sqsum += x * x; + } + } +} + +/** + * On input, batch_mean and batch_var are assumed to contain sums and + * squares of sums, respectively. 
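+ * (That is, each row holds sum(x_i) and sum(x_i^2) over the mini-batch; + * the kernel rewrites them in place using the one-pass identity + * var = ( sum(x_i^2)/n - mean^2 ) * n/(n-1), + * which equals the unbiased estimate sum((x_i - mean)^2) / (n-1).)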
+ * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (size / bsize) x 1 x 1 + */ +__global__ void compute_statistics_kernel(size_t size, + size_t statistics_count, + DataType decay, + DataType* __restrict__ batch_mean, + DataType* __restrict__ batch_var, + DataType* __restrict__ running_mean, + DataType* __restrict__ running_var) { + const size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t nthreads = blockDim.x * gridDim.x; + for (size_t i = gid; i < size; i += nthreads) { + auto& mean = batch_mean[i]; + auto& var = batch_var[i]; + auto& _running_mean = running_mean[i]; + auto& _running_var = running_var[i]; + const auto sum = batch_mean[i]; + const auto sqsum = batch_var[i]; + mean = sum / statistics_count; + const auto sqmean = sqsum / statistics_count; + var = (sqmean - mean * mean) * statistics_count / (statistics_count - 1); + _running_mean = decay * _running_mean + (DataType{1} - decay) * mean; + _running_var = decay * _running_var + (DataType{1} - decay) * var; + } +} + +/** + * mean = sum(x_i) / n + * + * var = ( sum(x_i^2)/n - mean^2 ) * n/(n-1) + */ +void compute_batch_statistics(lbann_comm& comm, + DataType decay, + const AbsDistMat& input, + AbsDistMat& batch_statistics, + AbsDistMat& running_mean, + AbsDistMat& running_var) { + + // Local matrices + const auto& local_input = dynamic_cast(input.LockedMatrix()); + auto& local_batch_statistics = dynamic_cast(batch_statistics.Matrix()); + auto local_batch_mean = El::View(local_batch_statistics, El::ALL, El::IR(0)); + auto local_batch_var = El::View(local_batch_statistics, El::ALL, El::IR(1)); + auto& local_running_mean = dynamic_cast(running_mean.Matrix()); + auto& local_running_var = dynamic_cast(running_var.Matrix()); + + // Dimensions + const size_t local_height = local_input.Height(); + const size_t local_width = local_input.Width(); + + // Compute local sums + El::Zero(batch_statistics); + if (local_height > 0) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_height + block_size - 1) / block_size; + row_sums_kernel + <<>>( + local_height, + local_width, + local_input.LockedBuffer(), + local_input.LDim(), + local_batch_mean.Buffer(), + local_batch_var.Buffer()); + } + + // Accumulate sums between processes + /// @todo Local statistics + /// @todo Arbitrary group sizes + comm.allreduce(batch_statistics, + batch_statistics.RedundantComm(), + El::mpi::SUM); + const size_t statistics_count = input.Width(); + + // Compute mini-batch statistics from sums + if (statistics_count <= 1) { + // local_mean already has correct values + El::Fill(local_batch_var, DataType{1}); + } else { + if (local_height > 0) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_height + block_size - 1) / block_size; + compute_statistics_kernel + <<>>( + local_height, + statistics_count, + decay, + local_batch_mean.Buffer(), + local_batch_var.Buffer(), + local_running_mean.Buffer(), + local_running_var.Buffer()); + } + } + +} + +/** + * Block dimensions: bsizex x bsizey x 1 + * + * Grid dimensions: (height / bsizex) x (width / bsizey) x 1 + */ +__global__ void batchnorm_kernel(size_t height, + size_t width, + DataType epsilon, + const DataType* __restrict__ input, + size_t input_ldim, + DataType* __restrict__ output, + size_t output_ldim, + const DataType* __restrict__ mean, + const DataType* __restrict__ var) { + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t gidy 
= threadIdx.y + blockIdx.y * blockDim.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nthreadsy = blockDim.y * gridDim.y; + for (size_t row = gidx; row < height; row += nthreadsx) { + const auto& _mean = mean[row]; + const auto& _var = var[row]; + const auto inv_stdev = cuda::rsqrt(_var + epsilon); + for (size_t col = gidy; col < width; col += nthreadsy) { + const auto& x = input[row + col*input_ldim]; + auto& y = output[row + col*output_ldim]; + y = (x - _mean) * inv_stdev; + } + } +} + +/** + * y_i = (x_i - mean) / sqrt(var + epsilon) + */ +void apply_batchnorm(DataType epsilon, + const GPUMat& local_input, + GPUMat& local_output, + const GPUMat& local_mean, + const GPUMat& local_var) { + if (!local_input.IsEmpty()) { + const size_t local_height = local_input.Height(); + const size_t local_width = local_input.Width(); + constexpr size_t block_size_x = 256; + constexpr size_t block_size_y = 1; + dim3 block_dims, grid_dims; + block_dims.x = block_size_x; + block_dims.y = block_size_y; + grid_dims.x = (local_height + block_size_x - 1) / block_size_x; + grid_dims.y = (local_width + block_size_y - 1) / block_size_y; + batchnorm_kernel + <<>>( + local_height, + local_width, + epsilon, + local_input.LockedBuffer(), + local_input.LDim(), + local_output.Buffer(), + local_output.LDim(), + local_mean.LockedBuffer(), + local_var.LockedBuffer()); + } +} + +void fp_impl(lbann_comm& comm, + DataType decay, + DataType epsilon, + bool is_training, + const AbsDistMat& input, + AbsDistMat& output, + AbsDistMat& batch_statistics, + AbsDistMat& running_mean, + AbsDistMat& running_var) { + + // Local matrices + const auto& local_input = dynamic_cast(input.LockedMatrix()); + auto& local_output = dynamic_cast(output.Matrix()); + + // Batchnorm has different behavior for training and inference + if (is_training) { + + // For training, normalize with batch statistics + compute_batch_statistics(comm, + decay, + input, + batch_statistics, + running_mean, + running_var); + const auto& local_batch_statistics + = dynamic_cast(batch_statistics.LockedMatrix()); + const auto local_batch_mean = El::LockedView(local_batch_statistics, + El::ALL, El::IR(0)); + const auto local_batch_var = El::LockedView(local_batch_statistics, + El::ALL, El::IR(1)); + apply_batchnorm(epsilon, + local_input, + local_output, + local_batch_mean, + local_batch_var); + + } + else { + + // For inference, normalize with running statistics + const auto& local_running_mean = dynamic_cast(running_mean.LockedMatrix()); + const auto& local_running_var = dynamic_cast(running_var.LockedMatrix()); + apply_batchnorm(epsilon, + local_input, + local_output, + local_running_mean, + local_running_var); + + } + +} + +/** + * On input, gradient_wrt_mean and gradient_wrt_var are assumed to be + * filled with zeros. 
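+ * (bp_training_impl below clears them with El::Zero before launching this + * kernel.)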
+ * + * dL/dmean = - sum(dL/dy_i) / sqrt(var+epsilon) + * + * dL/dvar = - sum(dL/dy_i * (x_i-mean)) * (var+epsilon)^(-3/2) / 2 + * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (height / bsize) x 1 x 1 + */ +__global__ void bp_training_stats_gradient_kernel(size_t height, + size_t width, + DataType epsilon, + const DataType* __restrict__ input, + size_t input_ldim, + const DataType* __restrict__ gradient_wrt_output, + size_t gradient_wrt_output_ldim, + const DataType* __restrict__ mean, + const DataType* __restrict__ var, + DataType* __restrict__ gradient_wrt_mean, + DataType* __restrict__ gradient_wrt_var) { + const size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + const size_t nthreads = blockDim.x * gridDim.x; + for (size_t row = gid; row < height; row += nthreads) { + const auto& _mean = mean[row]; + const auto& _var = var[row]; + const auto inv_stdev = cuda::rsqrt(_var + epsilon); + auto& dmean = gradient_wrt_mean[row]; + auto& dvar = gradient_wrt_var[row]; + for (size_t col = 0; col < width; ++col) { + const auto& x = input[row + col * input_ldim]; + const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim]; + dmean += - dy * inv_stdev; + dvar += - dy * (x - _mean) * inv_stdev*inv_stdev*inv_stdev / 2; + } + } +} + +/** + * dL/dx_i = ( dL/dy_i / sqrt(var+epsilon) + * + dL/dmean / n + * + dL/dvar * (x_i - mean) * 2/(n-1) ) + * + * Block dimensions: bsizex x bsizey x 1 + * + * Grid dimensions: (height / bsizex) x (width / bsizey) x 1 + */ +__global__ void bp_training_error_signal_kernel(size_t height, + size_t width, + DataType epsilon, + size_t statistics_count, + const DataType* __restrict__ input, + size_t input_ldim, + const DataType* __restrict__ gradient_wrt_output, + size_t gradient_wrt_output_ldim, + DataType* __restrict__ gradient_wrt_input, + size_t gradient_wrt_input_ldim, + const DataType* __restrict__ mean, + const DataType* __restrict__ var, + const DataType* __restrict__ gradient_wrt_mean, + const DataType* __restrict__ gradient_wrt_var) { + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nthreadsy = blockDim.y * gridDim.y; + for (size_t row = gidx; row < height; row += nthreadsx) { + const auto& _mean = mean[row]; + const auto& _var = var[row]; + const auto& dmean = gradient_wrt_mean[row]; + const auto& dvar = gradient_wrt_var[row]; + const auto inv_stdev = cuda::rsqrt(_var + epsilon); + for (size_t col = gidy; col < width; col += nthreadsy) { + const auto& x = input[row + col * input_ldim]; + const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim]; + auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_ldim]; + dx = (dy * inv_stdev + + dmean / statistics_count + + dvar * (x - _mean) * 2 / (statistics_count - 1)); + } + } +} + +/** @brief Backprop for training. + * + * Assumes forward prop uses mini-batch statistics. In other words, + * statistics are dependent on input. 
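+ * The error signal therefore includes the dL/dmean and dL/dvar + * contributions accumulated by the kernels above, in addition to the + * direct dL/dy_i / sqrt(var+epsilon) term.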
+ */
+void bp_training_impl(lbann_comm& comm,
+                      DataType epsilon,
+                      const AbsDistMat& input,
+                      const AbsDistMat& gradient_wrt_output,
+                      AbsDistMat& gradient_wrt_input,
+                      const AbsDistMat& statistics,
+                      AbsDistMat& gradient_wrt_statistics) {
+
+  // Local matrices
+  const auto& local_input = dynamic_cast<const GPUMat&>(input.LockedMatrix());
+  const auto& local_gradient_wrt_output = dynamic_cast<const GPUMat&>(gradient_wrt_output.LockedMatrix());
+  auto& local_gradient_wrt_input = dynamic_cast<GPUMat&>(gradient_wrt_input.Matrix());
+  const auto& local_statistics = dynamic_cast<const GPUMat&>(statistics.LockedMatrix());
+  const auto local_mean = El::LockedView(local_statistics, El::ALL, El::IR(0));
+  const auto local_var = El::LockedView(local_statistics, El::ALL, El::IR(1));
+  auto& local_gradient_wrt_statistics = dynamic_cast<GPUMat&>(gradient_wrt_statistics.Matrix());
+  auto local_gradient_wrt_mean = El::View(local_gradient_wrt_statistics, El::ALL, El::IR(0));
+  auto local_gradient_wrt_var = El::View(local_gradient_wrt_statistics, El::ALL, El::IR(1));
+
+  // Dimensions
+  const size_t local_height = local_gradient_wrt_input.Height();
+  const size_t local_width = local_gradient_wrt_input.Width();
+
+  // Count for statistics
+  // Note: Output is constant if statistics count is <=1, so error
+  // signal is zero.
+  /// @todo Local statistics
+  /// @todo Arbitrary group sizes
+  const size_t statistics_count = input.Width();
+  if (statistics_count <= 1) {
+    El::Zero(local_gradient_wrt_input);
+    return;
+  }
+
+  // Compute local gradient w.r.t. batch statistics
+  El::Zero(gradient_wrt_statistics);
+  if (local_height > 0) {
+    constexpr size_t block_size = 256;
+    dim3 block_dims, grid_dims;
+    block_dims.x = block_size;
+    grid_dims.x = (local_height + block_size - 1) / block_size;
+    bp_training_stats_gradient_kernel
+      <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>(
+        local_height,
+        local_width,
+        epsilon,
+        local_input.LockedBuffer(),
+        local_input.LDim(),
+        local_gradient_wrt_output.LockedBuffer(),
+        local_gradient_wrt_output.LDim(),
+        local_mean.LockedBuffer(),
+        local_var.LockedBuffer(),
+        local_gradient_wrt_mean.Buffer(),
+        local_gradient_wrt_var.Buffer());
+  }
+
+  // Accumulate gradient w.r.t. statistics across processes
+  /// @todo Local statistics
+  /// @todo Arbitrary group sizes
+  comm.allreduce(gradient_wrt_statistics,
+                 gradient_wrt_statistics.RedundantComm(),
+                 El::mpi::SUM);
+
+  // Compute gradient w.r.t. input
+  if (!local_input.IsEmpty()) {
+    const size_t local_height = local_input.Height();
+    const size_t local_width = local_input.Width();
+    constexpr size_t block_size_x = 256;
+    constexpr size_t block_size_y = 1;
+    dim3 block_dims, grid_dims;
+    block_dims.x = block_size_x;
+    block_dims.y = block_size_y;
+    grid_dims.x = (local_height + block_size_x - 1) / block_size_x;
+    grid_dims.y = (local_width + block_size_y - 1) / block_size_y;
+    bp_training_error_signal_kernel
+      <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>(
+        local_height,
+        local_width,
+        epsilon,
+        statistics_count,
+        local_input.LockedBuffer(),
+        local_input.LDim(),
+        local_gradient_wrt_output.LockedBuffer(),
+        local_gradient_wrt_output.LDim(),
+        local_gradient_wrt_input.Buffer(),
+        local_gradient_wrt_input.LDim(),
+        local_mean.LockedBuffer(),
+        local_var.LockedBuffer(),
+        local_gradient_wrt_mean.LockedBuffer(),
+        local_gradient_wrt_var.LockedBuffer());
+  }
+
+}
+
+/**
+ *  dL/dx_i = dL/dy_i / sqrt(var+epsilon)
+ *
+ *  Block dimensions: bsizex x bsizey x 1
+ *
+ *  Grid dimensions: (height / bsizex) x (width / bsizey) x 1
+ */
+__global__ void bp_inference_kernel(size_t height,
+                                    size_t width,
+                                    DataType epsilon,
+                                    const DataType* __restrict__ gradient_wrt_output,
+                                    size_t gradient_wrt_output_ldim,
+                                    DataType* __restrict__ gradient_wrt_input,
+                                    size_t gradient_wrt_input_ldim,
+                                    const DataType* __restrict__ running_var) {
+  const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
+  const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y;
+  const size_t nthreadsx = blockDim.x * gridDim.x;
+  const size_t nthreadsy = blockDim.y * gridDim.y;
+  for (size_t row = gidx; row < height; row += nthreadsx) {
+    const auto& var = running_var[row];
+    const auto inv_stdev = cuda::rsqrt(var + epsilon);
+    for (size_t col = gidy; col < width; col += nthreadsy) {
+      const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim];
+      auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_ldim];
+      dx = dy * inv_stdev;
+    }
+  }
+}
+
+/** @brief Backprop for inference.
+ *
+ *  Assumes forward prop uses running statistics. In other words,
+ *  statistics are independent of input.
+ */
+void bp_inference_impl(DataType epsilon,
+                       const AbsDistMat& gradient_wrt_output,
+                       AbsDistMat& gradient_wrt_input,
+                       const AbsDistMat& running_var) {
+
+  // Local matrices
+  const auto& local_gradient_wrt_output = dynamic_cast<const GPUMat&>(gradient_wrt_output.LockedMatrix());
+  auto& local_gradient_wrt_input = dynamic_cast<GPUMat&>(gradient_wrt_input.Matrix());
+  const auto& local_running_var = dynamic_cast<const GPUMat&>(running_var.LockedMatrix());
+
+  // Compute gradient w.r.t. input
+  if (!local_gradient_wrt_output.IsEmpty()) {
+    const size_t local_height = local_gradient_wrt_output.Height();
+    const size_t local_width = local_gradient_wrt_output.Width();
+    constexpr size_t block_size_x = 256;
+    constexpr size_t block_size_y = 1;
+    dim3 block_dims, grid_dims;
+    block_dims.x = block_size_x;
+    block_dims.y = block_size_y;
+    grid_dims.x = (local_height + block_size_x - 1) / block_size_x;
+    grid_dims.y = (local_width + block_size_y - 1) / block_size_y;
+    bp_inference_kernel
+      <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>(
+        local_height,
+        local_width,
+        epsilon,
+        local_gradient_wrt_output.LockedBuffer(),
+        local_gradient_wrt_output.LDim(),
+        local_gradient_wrt_input.Buffer(),
+        local_gradient_wrt_input.LDim(),
+        local_running_var.LockedBuffer());
+  }
+
+}
+
+void bp_impl(lbann_comm& comm,
+             DataType epsilon,
+             bool is_training,
+             const AbsDistMat& input,
+             const AbsDistMat& gradient_wrt_output,
+             AbsDistMat& gradient_wrt_input,
+             const AbsDistMat& batch_statistics,
+             AbsDistMat& gradient_wrt_batch_statistics,
+             const AbsDistMat& running_var) {
+
+  // Batchnorm has different behavior for training and inference
+  if (is_training) {
+    bp_training_impl(comm,
+                     epsilon,
+                     input,
+                     gradient_wrt_output,
+                     gradient_wrt_input,
+                     batch_statistics,
+                     gradient_wrt_batch_statistics);
+  }
+  else {
+    bp_inference_impl(epsilon,
+                      gradient_wrt_output,
+                      gradient_wrt_input,
+                      running_var);
+  }
+
+}
+
+} // namespace
+
+// Template instantiation
+template <>
+void entrywise_batch_normalization_layer<data_layout::DATA_PARALLEL, El::Device::GPU>::fp_compute() {
+  fp_impl(*get_comm(),
+          m_decay,
+          m_epsilon,
+          m_model->get_execution_mode() == execution_mode::training,
+          get_prev_activations(),
+          get_activations(),
+          *m_batch_statistics,
+          m_weights[0]->get_values(),
+          m_weights[1]->get_values());
+}
+template <>
+void entrywise_batch_normalization_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>::fp_compute() {
+  fp_impl(*get_comm(),
+          m_decay,
+          m_epsilon,
+          m_model->get_execution_mode() == execution_mode::training,
+          get_prev_activations(),
+          get_activations(),
+          *m_batch_statistics,
+          m_weights[0]->get_values(),
+          m_weights[1]->get_values());
+}
+template <>
+void entrywise_batch_normalization_layer<data_layout::DATA_PARALLEL, El::Device::GPU>::bp_compute() {
+  bp_impl(*get_comm(),
+          m_epsilon,
+          m_model->get_execution_mode() == execution_mode::training,
+          get_prev_activations(),
+          get_prev_error_signals(),
+          get_error_signals(),
+          *m_batch_statistics,
+          *m_batch_statistics_gradient,
+          m_weights[1]->get_values());
+}
+template <>
+void entrywise_batch_normalization_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>::bp_compute() {
+  bp_impl(*get_comm(),
+          m_epsilon,
+          m_model->get_execution_mode() == execution_mode::training,
+          get_prev_activations(),
+          get_prev_error_signals(),
+          get_error_signals(),
+          *m_batch_statistics,
+          *m_batch_statistics_gradient,
+          m_weights[1]->get_values());
+}
+
+} // namespace lbann
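The kernels above implement the standard batchnorm backward pass with an unbiased variance estimate. For reference, here is a standalone CPU sketch of the same arithmetic; it is an illustration for checking the formulas, not code from this patch. It assumes the usual derivation of dL/dmean and dL/dvar (the stats-gradient kernel itself is defined earlier in the file) and uses the corrected error-signal parenthesization from patch 238 later in this series; sizes and values are made up.

  // sanity_check_bn_bp.cpp, illustrative only
  #include <cmath>
  #include <cstdio>
  #include <vector>

  int main() {
    const size_t height = 2, width = 3;  // 2 entries, mini-batch of 3
    const double epsilon = 1e-5;
    // Column-major with leading dimension = height, as in the kernels
    const std::vector<double> x  = {1, 4, 2, 5, 3, 6};
    const std::vector<double> dy = {0.1, -0.2, 0.3, 0.0, -0.1, 0.2};

    // Per-entry mean and unbiased variance over the mini-batch
    std::vector<double> mean(height, 0), var(height, 0);
    for (size_t row = 0; row < height; ++row) {
      for (size_t col = 0; col < width; ++col) { mean[row] += x[row + col*height]; }
      mean[row] /= width;
      for (size_t col = 0; col < width; ++col) {
        const double d = x[row + col*height] - mean[row];
        var[row] += d * d;
      }
      var[row] /= (width - 1);
    }

    // Gradients w.r.t. statistics (standard batchnorm derivation)
    std::vector<double> dmean(height, 0), dvar(height, 0);
    for (size_t row = 0; row < height; ++row) {
      const double inv_stdev = 1 / std::sqrt(var[row] + epsilon);
      for (size_t col = 0; col < width; ++col) {
        const double d = dy[row + col*height];
        dmean[row] -= d * inv_stdev;
        dvar[row] -= d * (x[row + col*height] - mean[row])
                     * 0.5 * inv_stdev * inv_stdev * inv_stdev;
      }
    }

    // Error signal, matching bp_training_error_signal_kernel
    for (size_t row = 0; row < height; ++row) {
      const double inv_stdev = 1 / std::sqrt(var[row] + epsilon);
      for (size_t col = 0; col < width; ++col) {
        const double dx = dy[row + col*height] * inv_stdev
                          + dmean[row] / width
                          + dvar[row] * (x[row + col*height] - mean[row]) * 2 / (width - 1);
        std::printf("dx(%zu,%zu) = %g\n", row, col, dx);
      }
    }
    return 0;
  }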
diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp
index 02c7fe47164..74dc45a8488 100644
--- a/src/proto/factories/layer_factory.cpp
+++ b/src/proto/factories/layer_factory.cpp
@@ -66,6 +66,7 @@
 #include "lbann/layers/regularizers/local_response_normalization.hpp"
 #include "lbann/layers/regularizers/regularizer.hpp"
 #include "lbann/layers/regularizers/selu_dropout.hpp"
+#include "lbann/layers/regularizers/entrywise_batch_normalization.hpp"
 #include "lbann/layers/transform/bernoulli.hpp"
 #include "lbann/layers/transform/categorical_random.hpp"
 #include "lbann/layers/transform/concatenation.hpp"
@@ -571,6 +572,10 @@ std::unique_ptr<Layer> construct_layer(
       return lbann::make_unique<dropout<layout, Dev>>(comm, keep_prob);
     }
   }
+  if (proto_layer.has_entrywise_batch_normalization()) {
+    const auto& params = proto_layer.entrywise_batch_normalization();
+    return lbann::make_unique<entrywise_batch_normalization_layer<layout, Dev>>(comm, params.decay(), params.epsilon());
+  }
 
   // Math layers
   CONSTRUCT_LAYER(logical_not);
diff --git a/src/proto/layers.proto b/src/proto/layers.proto
index 417dd30ec91..6d4a28b16df 100644
--- a/src/proto/layers.proto
+++ b/src/proto/layers.proto
@@ -151,6 +151,7 @@ message Layer {
     LocalResponseNormalization local_response_normalization = 20;
     Dropout dropout = 21;
     SeluDropout selu_dropout = 229;
+    EntrywiseBatchNormalization entrywise_batch_normalization = 230;
 
     // Activation layers
     Elu elu = 200;
@@ -281,6 +282,11 @@
     int64 statistics_group_size = 6;
   }
 
+  message EntrywiseBatchNormalization {
+    double decay = 1;
+    double epsilon = 2;
+  }
+
   message SeluDropout {
     double keep_prob = 2; //default: 0.95
     double alpha = 3; //default: 1.6732632423543772848170429916717

From b79f442ecf0c81bcfc1b6a34a7584940307731ad Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Wed, 14 Aug 2019 17:18:12 -0700
Subject: [PATCH 236/634] Embedded Python session is destroyed during LBANN
 finalization. Removed Python session object.

---
 include/lbann/utils/python.hpp          |  71 ++++++------
 src/base.cpp                            |   6 ++
 src/data_readers/data_reader_python.cpp |  17 ++--
 src/utils/python.cpp                    | 128 +++++++++++++-----------
 4 files changed, 110 insertions(+), 112 deletions(-)

diff --git a/include/lbann/utils/python.hpp b/include/lbann/utils/python.hpp
index 1bab09a2346..0bcbaf34ff0 100644
--- a/include/lbann/utils/python.hpp
+++ b/include/lbann/utils/python.hpp
@@ -29,59 +29,44 @@
 #include "lbann/base.hpp"
 
 #ifdef LBANN_HAS_PYTHON
-#include 
+
 #include 
+#include 
+#include 
+
 namespace lbann {
 namespace python {
 
-/** @brief Singleton class to manage embedded Python session.
+/** @brief Start embedded Python session.
  *
- * This mostly manages the initialization and finalization of the
- * Python session. It is rarely necessary to interact with the
- * singleton instance directly.
+ * Does nothing if Python has already been started. This function is
+ * thread-safe.
  *
- * All static member functions are thread-safe.
+ * Be warned that restarting Python after it has been ended is a bad
+ * idea since any Python objects left over from the first session
+ * will be invalid in the second. Expect segfaults.
  */
-class session {
-public:
-
-  /** @brief Start embedded Python session if not already running.
-   *  @details Does nothing if Python has already been started.
-   */
-  static void start_once();
-
-  /** @brief Check if embedded Python session is running. */
-  static bool is_active() noexcept;
-
-  /** @brief Check if a Python error has occurred.
-   *
-   * Throws an exception if a Python error is detected.
-   *
-   * @param force_error Whether to force an exception to be thrown.
-   */
-  static void check_error(bool force_error = false);
-
-  /** @brief Get singleton instance.
-   *
-   * Initializes an embedded Python session the first time it is
-   * called.
-   */
-  static session& get();
-
-  ~session();
-
-private:
+void initialize();
 
-  /** @brief State on main Python thread. */
-  PyThreadState* m_thread_state = nullptr;
+/** @brief End embedded Python session.
+ *
+ * Does nothing if Python is not running. This function is
+ * thread-safe.
+ */
+void finalize();
 
-  // Lifetime functions
-  session();
-  session(const session&) = delete;
-  session& operator=(const session&) = delete;
+/** @brief Check if embedded Python session is running. */
+bool is_active();
 
-};
+/** @brief Check if a Python error has occurred.
+ *
+ * Throws an exception if a Python error is detected. The GIL is
+ * acquired internally.
+ *
+ * @param force_error Whether to force an exception to be thrown.
+ */
+void check_error(bool force_error = false);
 
 /** @brief RAII wrapper for Python GIL.
  *
@@ -90,6 +75,8 @@ class session {
  * time. Make sure to acquire the GIL before calling Python C API
 * functions. The GIL can be acquired recursively, i.e. you can
 * acquire the GIL even if you already control it.
+ *
+ * If a Python session is not running, one is started.
 */
 class global_interpreter_lock {
 public:
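In practice the reworked header boils down to: take a global_interpreter_lock before touching the CPython API, and call check_error() afterwards. A minimal usage sketch follows; this is an illustration and not code from the patch, the math/pi lookup is arbitrary, and it assumes an LBANN build with LBANN_HAS_PYTHON.

  #include "lbann/utils/python.hpp"
  #include <iostream>

  int main() {
    // Acquiring the GIL starts the embedded session on first use
    lbann::python::global_interpreter_lock gil;
    // python::object takes ownership of the returned references
    lbann::python::object math_module(PyImport_ImportModule("math"));
    lbann::python::object pi(PyObject_GetAttrString(math_module, "pi"));
    std::cout << "pi = " << PyFloat_AsDouble(pi) << "\n";
    lbann::python::check_error();  // throws if any call above failed
    return 0;
  }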
diff --git a/src/base.cpp b/src/base.cpp
index 643a42fe1be..89fe8abae0c 100644
--- a/src/base.cpp
+++ b/src/base.cpp
@@ -45,6 +45,9 @@
 #ifdef LBANN_HAS_CUDNN
 #include "lbann/utils/cudnn.hpp"
 #endif
+#ifdef LBANN_HAS_PYTHON
+#include "lbann/utils/python.hpp"
+#endif
 
 #include 
 #include 
@@ -94,6 +97,9 @@ world_comm_ptr initialize(int& argc, char**& argv, int seed) {
 
 void finalize(lbann_comm* comm) {
 #ifdef LBANN_HAS_CUDNN
   cudnn::destroy();
+#endif
+#ifdef LBANN_HAS_PYTHON
+  python::finalize();
 #endif
   if (comm != nullptr) {
     delete comm;
diff --git a/src/data_readers/data_reader_python.cpp b/src/data_readers/data_reader_python.cpp
index a0e7e73c858..2086cbc5757 100644
--- a/src/data_readers/data_reader_python.cpp
+++ b/src/data_readers/data_reader_python.cpp
@@ -42,14 +42,13 @@ python_reader::python_reader(std::string module,
   : generic_data_reader(true) {
 
   // Make sure Python is running and acquire GIL
-  python::session::start_once();
   python::global_interpreter_lock gil;
 
   // Import Python module for data
   if (!module_dir.empty()) {
     auto path = PySys_GetObject("path");  // Borrowed reference
     PyList_Append(path, python::object(module_dir));
-    python::session::check_error();
+    python::check_error();
   }
   python::object data_module = PyImport_ImportModule(module.c_str());
 
@@ -58,7 +57,7 @@ python_reader::python_reader(std::string module,
     = PyObject_GetAttrString(data_module, num_samples_function.c_str());
   python::object num = PyObject_CallObject(num_func, nullptr);
   m_num_samples = PyLong_AsLong(num);
-  python::session::check_error();
+  python::check_error();
 
   // Get sample dimensions
   python::object dims_func
@@ -69,7 +68,7 @@ python_reader::python_reader(std::string module,
     m_sample_dims.push_back(PyLong_AsLong(d));
     Py_DECREF(d);
   }
-  python::session::check_error();
+  python::check_error();
 
   // Get sample access function
   m_sample_function = PyObject_GetAttrString(data_module,
@@ -78,7 +77,7 @@ python_reader::python_reader(std::string module,
 }
 
 python_reader::~python_reader() {
-  if (python::session::is_active() && m_process_pool != nullptr) {
+  if (python::is_active() && m_process_pool != nullptr) {
     python::global_interpreter_lock gil;
     PyObject_CallMethod(m_process_pool, "terminate", nullptr);
     PyObject_CallMethod(m_process_pool, "join", nullptr);
@@ -216,14 +215,14 @@ void python_reader::setup(int num_io_threads,
   PyObject_SetAttrString(main_module,
                         sample_func_name.c_str(),
                         m_sample_function);
-  python::session::check_error();
+  python::check_error();
   const std::string shared_array_name
     = ("_DATA_READER_PYTHON_CPP_shared_memory_array"
       + std::to_string(instance_id));
   PyObject_SetAttrString(main_module,
                         shared_array_name.c_str(),
                         m_shared_memory_array);
-  python::session::check_error();
+  python::check_error();
 
   // Create wrapper around sample function
   // Note: We attempt accessing the sample with the buffer protocol
@@ -273,7 +272,7 @@ def @wrapper_func@(sample_index, array_offset):
                     std::regex("\\@datatype_typecode\\@"), datatype_typecode);
   PyRun_SimpleString(wrapper_func_def.c_str());
-  python::session::check_error();
+  python::check_error();
   m_sample_function_wrapper
     = PyObject_GetAttrString(main_module,
                             wrapper_func_name.c_str());
@@ -302,7 +301,7 @@ def @init_func@():
                   std::regex("\\@init_func\\@"), init_func_name);
   PyRun_SimpleString(init_func_def.c_str());
-  python::session::check_error();
+  python::check_error();
   python::object init_func = PyObject_GetAttrString(main_module,
                                                     init_func_name.c_str());
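The setup code above specializes embedded Python source by regex-replacing @token@ placeholders before handing the string to PyRun_SimpleString. Here is a self-contained sketch of just that templating step; the names are made up for illustration.

  #include <iostream>
  #include <regex>
  #include <string>

  int main() {
    // Python source with placeholder tokens, as in the wrapper above
    std::string src = R"(
  def @func@(i):
      return i * @scale@
  )";
    // Substitute concrete names/values before execution
    src = std::regex_replace(src, std::regex("\\@func\\@"), "scale_sample");
    src = std::regex_replace(src, std::regex("\\@scale\\@"), std::to_string(2));
    std::cout << src;  // ready to pass to PyRun_SimpleString
    return 0;
  }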
diff --git a/src/utils/python.cpp b/src/utils/python.cpp
index b191a1ef9f0..ad73ed992e1 100644
--- a/src/utils/python.cpp
+++ b/src/utils/python.cpp
@@ -32,19 +32,68 @@
 namespace lbann {
 namespace python {
 
-// ---------------------------------------------
-// session class
-// ---------------------------------------------
+namespace {
+
+/** @brief State on main Python thread after initialization. */
+PyThreadState* init_thread_state = nullptr;
 
-void session::start_once() { get(); }
+} // namespace
 
-bool session::is_active() noexcept { return Py_IsInitialized(); }
+void initialize() {
 
-void session::check_error(bool force_error) {
-  start_once();
+  // Thread-safe initialization with double-checked locking pattern
   if (!is_active()) {
-    LBANN_ERROR("embedded Python session has terminated unexpectedly");
+    static std::mutex m;
+    std::lock_guard<std::mutex> lock(m);
+    if (!is_active()) {
+
+      // Hack to display output from Python
+      // Note: Python outputs didn't appear because MPI intercepts
+      // stdout and stderr. See
+      // https://stackoverflow.com/questions/29352485/python-print-not-working-when-embedded-into-mpi-program
+      Py_UnbufferedStdioFlag = 1;
+
+      // Initialize Python session and release GIL
+      Py_Initialize();
+      PyEval_InitThreads();
+      init_thread_state = PyEval_SaveThread();
+      if (!is_active()) {
+        LBANN_ERROR("error initializing embedded Python session");
+      }
+
+    }
   }
+
+}
+
+void finalize() {
+
+  // Thread-safe finalization with double-checked locking pattern
+  if (is_active()) {
+    static std::mutex m;
+    std::lock_guard<std::mutex> lock(m);
+    if (is_active()) {
+
+      // Take GIL and finalize Python session
+      if (init_thread_state != nullptr) {
+        PyEval_RestoreThread(init_thread_state);
+        init_thread_state = nullptr;
+      }
+      Py_Finalize();
+
+      // Check that Python session has been finalized
+      if (is_active()) {
+        LBANN_WARNING("error finalizing embedded Python session");
+      }
+
+    }
   }
+
+}
+
+bool is_active() { return Py_IsInitialized(); }
+
+void check_error(bool force_error) {
   global_interpreter_lock gil;
   if (force_error || PyErr_Occurred()) {
@@ -92,60 +92,17 @@ void session::check_error(bool force_error) {
   }
 }
 
-session& session::get() {
-  // Initializing static local variables is thread-safe as of C++11
-  static session instance;
-  return instance;
-}
-
-session::session() {
-  if (!is_active()) {
-
-    // Hack to display output from Python
-    // Note: Python outputs didn't appear because MPI intercepts
-    // stdout and stderr. See
-    // https://stackoverflow.com/questions/29352485/python-print-not-working-when-embedded-into-mpi-program
-    Py_UnbufferedStdioFlag = 1;
-
-    // Initialize embedded Python session
-    Py_Initialize();
-    PyEval_InitThreads();
-
-    // Release GIL
-    m_thread_state = PyEval_SaveThread();
-
-  }
-  if (!is_active()) {
-    LBANN_ERROR("error initializing embedded Python session");
-  }
-}
-
-session::~session() {
-  if (is_active()) {
-    if (m_thread_state != nullptr) {
-      PyEval_RestoreThread(m_thread_state);
-    }
-    Py_Finalize();
-  }
-  if (is_active()) {
-    LBANN_WARNING("error finalizing embedded Python session");
-  }
-}
-
 // ---------------------------------------------
 // global_interpreter_lock class
 // ---------------------------------------------
 
 global_interpreter_lock::global_interpreter_lock() {
-  session::start_once();
-  if (!session::is_active()) {
-    LBANN_ERROR("embedded Python session has terminated unexpectedly");
-  }
+  initialize(); // Make sure Python is running
   m_gil_state = PyGILState_Ensure();
 }
 
 global_interpreter_lock::~global_interpreter_lock() {
-  if (session::is_active()) {
+  if (is_active()) {
     PyGILState_Release(m_gil_state);
   }
 }
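initialize() and finalize() above guard a non-idempotent global transition with the double-checked locking pattern: test, lock, re-test. A reduced standalone sketch of the idiom follows; note the hedge that, in general, the checked flag needs synchronization of its own, whereas the code above leans on Py_IsInitialized() being safe to call concurrently.

  #include <iostream>
  #include <mutex>

  namespace {
  bool g_initialized = false;  // stands in for Py_IsInitialized()
  bool is_ready() { return g_initialized; }
  }

  void init_once() {
    // Skip the mutex on the fast path, then re-test under the lock
    // before doing the real one-time work.
    if (!is_ready()) {
      static std::mutex m;
      std::lock_guard<std::mutex> lock(m);
      if (!is_ready()) {
        std::cout << "performing one-time initialization\n";
        g_initialized = true;
      }
    }
  }

  int main() {
    init_once();
    init_once();  // second call returns immediately
    return 0;
  }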
@@ -155,29 +161,29 @@ global_interpreter_lock::~global_interpreter_lock() {
 // ---------------------------------------------
 
 object::object(PyObject* ptr) : m_ptr(ptr) {
-  session::check_error();
+  check_error();
 }
 
 object::object(const std::string& val) {
   global_interpreter_lock gil;
   m_ptr = PyUnicode_FromStringAndSize(val.c_str(), val.size());
-  session::check_error();
+  check_error();
 }
 
 object::object(long val) {
   global_interpreter_lock gil;
   m_ptr = PyLong_FromLong(val);
-  session::check_error();
+  check_error();
 }
 
 object::object(double val) {
   global_interpreter_lock gil;
   m_ptr = PyFloat_FromDouble(val);
-  session::check_error();
+  check_error();
 }
 
 object::object(const object& other) : m_ptr(other.m_ptr) {
   global_interpreter_lock gil;
   m_ptr = other.m_ptr;
   Py_XINCREF(m_ptr);
-  session::check_error();
+  check_error();
 }
 
 object& object::operator=(const object& other) {
@@ -185,7 +191,7 @@ object& object::operator=(const object& other) {
   Py_XDECREF(m_ptr);
   m_ptr = other.m_ptr;
   Py_XINCREF(m_ptr);
-  session::check_error();
+  check_error();
   return *this;
 }
 
@@ -198,12 +204,12 @@ object& object::operator=(object&& other) {
   Py_XDECREF(m_ptr);
   m_ptr = other.m_ptr;
   other.m_ptr = nullptr;
-  session::check_error();
+  check_error();
   return *this;
 }
 
 object::~object() {
-  if (session::is_active()) {
+  if (is_active()) {
     global_interpreter_lock gil;
     Py_XDECREF(m_ptr);
   }
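Taken together, object and global_interpreter_lock give RAII management of CPython reference counts. A usage sketch mirroring the module_dir handling in data_reader_python.cpp above (illustrative, not code from the patch; the helper name is made up):

  #include "lbann/utils/python.hpp"
  #include <string>

  void append_to_sys_path(const std::string& dir) {
    lbann::python::global_interpreter_lock gil;
    auto* path = PySys_GetObject("path");  // borrowed reference, do not decref
    lbann::python::object dir_obj(dir);    // owns one reference to a Python str
    PyList_Append(path, dir_obj);          // implicit PyObject* conversion
    lbann::python::check_error();
  }  // ~object() drops the reference, taking the GIL again internally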
From 02bc717cc061e722a51764494cccffba2767c3cd Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson"
Date: Thu, 15 Aug 2019 14:42:17 -0700
Subject: [PATCH 237/634] add the summarizer to the model class in the python
 front-end

---
 python/lbann/model.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/lbann/model.py b/python/lbann/model.py
index 546c9049cdf..4a86dee82fc 100644
--- a/python/lbann/model.py
+++ b/python/lbann/model.py
@@ -10,7 +10,8 @@ class Model:
 
     def __init__(self, mini_batch_size, epochs,
                  layers=[], weights=[], objective_function=None,
-                 metrics=[], callbacks=[], random_seed=None):
+                 metrics=[], callbacks=[], random_seed=None,
+                 summary_dir=None):
 
         # Scalar fields
         self.mini_batch_size = mini_batch_size
@@ -19,7 +20,7 @@ class Model:
         self.num_parallel_readers = 0   # TODO: Make configurable
         self.procs_per_trainer = 0      # TODO: Make configurable
         self.random_seed = random_seed
-
+        self.summary_dir = summary_dir
         # Get connected layers
         self.layers = list(lbann.layer.traverse_layer_graph(layers))
 
@@ -52,6 +53,8 @@ class Model:
             model.procs_per_trainer = self.procs_per_trainer
         if self.random_seed is not None:
             model.random_seed = self.random_seed
+        if self.summary_dir is not None:
+            model.summarizer.dir = self.summary_dir
 
         # Add model components
         model.layer.extend([l.export_proto() for l in self.layers])

From 809a3319702569bb4e4a938fd4ceab132fa6e7d6 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Thu, 15 Aug 2019 15:50:23 -0700
Subject: [PATCH 238/634] Fixing incorrect parentheses in entry-wise batchnorm.

---
 src/layers/regularizers/entrywise_batch_normalization.cpp | 2 +-
 src/layers/regularizers/entrywise_batch_normalization.cu  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/layers/regularizers/entrywise_batch_normalization.cpp b/src/layers/regularizers/entrywise_batch_normalization.cpp
index 655b940cdc1..f288d4b848b 100644
--- a/src/layers/regularizers/entrywise_batch_normalization.cpp
+++ b/src/layers/regularizers/entrywise_batch_normalization.cpp
@@ -291,7 +291,7 @@ void bp_training_impl(lbann_comm& comm,
       auto& dvar = local_gradient_wrt_var(row, 0);
       dx = (dy * inv_stdev
             + dmean * inv_stats_count
-            + dvar * (x - mean)) * 2 * inv_stats_countm1;
+            + dvar * (x - mean) * 2 * inv_stats_countm1);
     }
   }
 }
diff --git a/src/layers/regularizers/entrywise_batch_normalization.cu b/src/layers/regularizers/entrywise_batch_normalization.cu
index d4109a6f87f..79108fd3321 100644
--- a/src/layers/regularizers/entrywise_batch_normalization.cu
+++ b/src/layers/regularizers/entrywise_batch_normalization.cu
@@ -355,7 +355,7 @@ __global__ void bp_training_error_signal_kernel(size_t height,
       auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_ldim];
       dx = (dy * inv_stdev
             + dmean / statistics_count
-            + dvar * (x - _mean)) * 2 / (statistics_count - 1);
+            + dvar * (x - _mean) * 2 / (statistics_count - 1));
     }
   }
 }

From 5646c39ecdcb2c6c49a2e3becbd035177257c173 Mon Sep 17 00:00:00 2001
From: Tom Benson <30674819+benson31@users.noreply.github.com>
Date: Thu, 15 Aug 2019 19:04:55 -0700
Subject: [PATCH 239/634] expose options for offline builds of all superbuild
 packages (#1173)

Enable offline builds

---
 superbuild/CMakeLists.txt            |  2 ++
 superbuild/aluminum/CMakeLists.txt   | 19 ++++++++++++++++---
 superbuild/cereal/CMakeLists.txt     | 19 ++++++++++++++++---
 superbuild/cnpy/CMakeLists.txt       | 20 ++++++++++++++++----
 superbuild/conduit/CMakeLists.txt    | 19 ++++++++++++++++---
 superbuild/cub/CMakeLists.txt        | 21 +++++++++++++++++----
 superbuild/hdf5/CMakeLists.txt       | 19 ++++++++++++++++---
 superbuild/hydrogen/CMakeLists.txt   | 19 ++++++++++++++++---
 superbuild/jpeg-turbo/CMakeLists.txt | 19 ++++++++++++++++---
 superbuild/openblas/CMakeLists.txt   | 18 ++++++++++++++++--
 superbuild/opencv/CMakeLists.txt     | 19 ++++++++++++++++---
 superbuild/protobuf/CMakeLists.txt   | 19 ++++++++++++++++---
 12 files changed, 179 insertions(+), 34 deletions(-)

diff --git a/superbuild/CMakeLists.txt b/superbuild/CMakeLists.txt
index 7fb7133edd4..2481f6175e1 100644
--- a/superbuild/CMakeLists.txt
+++ b/superbuild/CMakeLists.txt
@@ -58,6 +58,8 @@ option(LBANN_SB_BUILD_LBANN "Pull and build LBANN from Github" OFF)
 #
 # Add the TPL subdirectories
 #
+set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY")
+set(_GIT_TAG_TAG "GIT_TAG")
 include(LBANNSuperBuildCreateCMakeArguments)
 
 if (LBANN_SB_BUILD_ALUMINUM)
diff --git a/superbuild/aluminum/CMakeLists.txt b/superbuild/aluminum/CMakeLists.txt
index d88928dcc59..a7b37c02672 100644
--- a/superbuild/aluminum/CMakeLists.txt
+++ b/superbuild/aluminum/CMakeLists.txt
@@ -44,14 +44,27 @@ create_cmake_arguments(
   EXTRA_REMOVE_PREFIXES "LBANN_SB_FWD_ALUMINUM" "LBANN_SB_FWD_Aluminum"
   VARIABLES ${ALUMINUM_VARIABLES})
 
+if (ALUMINUM_CUSTOM_SOURCE_DIR)
+  set(ALUMINUM_SOURCE_DIR "${ALUMINUM_CUSTOM_SOURCE_DIR}")
+  set(ALUMINUM_URL "")
+  set(ALUMINUM_TAG "")
+  set(_GIT_REPOSITORY_TAG)
+  set(_GIT_TAG_TAG)
+  message(STATUS "Using ALUMINUM source in: ${ALUMINUM_SOURCE_DIR}")
+else ()
+  set(ALUMINUM_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src")
+  set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY")
+  set(_GIT_TAG_TAG "GIT_TAG")
+endif ()
+
 include(ExternalProject)
 ExternalProject_Add(ALUMINUM
   PREFIX ${CMAKE_CURRENT_BINARY_DIR}
   TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp
   STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp
-  GIT_REPOSITORY ${ALUMINUM_URL}
-  GIT_TAG ${ALUMINUM_TAG}
-  SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src
+  ${_GIT_REPOSITORY_TAG} ${ALUMINUM_URL}
+  ${_GIT_TAG_TAG} ${ALUMINUM_TAG}
+  SOURCE_DIR ${ALUMINUM_SOURCE_DIR}
   BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build
   INSTALL_DIR ${ALUMINUM_CMAKE_INSTALL_PREFIX}
   USES_TERMINAL_BUILD 1
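With these changes, pointing a superbuild package at a pre-cloned checkout skips the git fetch entirely, which is what makes offline builds possible. A hypothetical configure line (paths are placeholders, not from the patch) would look like: cmake -D LBANN_SB_BUILD_ALUMINUM=ON -D ALUMINUM_CUSTOM_SOURCE_DIR=/path/to/aluminum /path/to/lbann/superbuild. When the *_CUSTOM_SOURCE_DIR variable is unset, the GIT_REPOSITORY/GIT_TAG arguments are forwarded to ExternalProject_Add exactly as before.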
diff --git a/superbuild/cereal/CMakeLists.txt b/superbuild/cereal/CMakeLists.txt
index 5f1b924c600..40ce193d16d 100644
--- a/superbuild/cereal/CMakeLists.txt
+++ b/superbuild/cereal/CMakeLists.txt
@@ -19,6 +19,19 @@ set(CEREAL_TAG "master" CACHE STRING "The git tag or hash to checkout for CEREAL
 set(CEREAL_CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}"
   CACHE PATH "The installation location of CEREAL.")
 
+if (CEREAL_CUSTOM_SOURCE_DIR)
+  set(CEREAL_SOURCE_DIR "${CEREAL_CUSTOM_SOURCE_DIR}")
+  set(CEREAL_URL "")
+  set(CEREAL_TAG "")
+  set(_GIT_REPOSITORY_TAG)
+  set(_GIT_TAG_TAG)
+  message(STATUS "Using CEREAL source in: ${CEREAL_SOURCE_DIR}")
+else ()
+  set(CEREAL_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src")
+  set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY")
+  set(_GIT_TAG_TAG "GIT_TAG")
+endif ()
+
 # Handle the install of CEREAL
 include(ExternalProject)
@@ -26,9 +39,9 @@ ExternalProject_Add(CEREAL
   PREFIX ${CMAKE_CURRENT_BINARY_DIR}
   TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp
   STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp
-  GIT_REPOSITORY ${CEREAL_URL}
-  GIT_TAG ${CEREAL_TAG}
-  SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src
+  ${_GIT_REPOSITORY_TAG} ${CEREAL_URL}
+  ${_GIT_TAG_TAG} ${CEREAL_TAG}
+  SOURCE_DIR ${CEREAL_SOURCE_DIR}
   BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build
   INSTALL_DIR ${CEREAL_CMAKE_INSTALL_PREFIX}
   USES_TERMINAL_BUILD 1
diff --git a/superbuild/cnpy/CMakeLists.txt b/superbuild/cnpy/CMakeLists.txt
index 400ef006cea..7a53e78f537 100644
--- a/superbuild/cnpy/CMakeLists.txt
+++ b/superbuild/cnpy/CMakeLists.txt
@@ -31,16 +31,28
@@ set(CNPY_CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" set(CNPY_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING "The build type for CNPY.") +if (CNPY_CUSTOM_SOURCE_DIR) + set(CNPY_SOURCE_DIR "${CNPY_CUSTOM_SOURCE_DIR}") + set(CNPY_URL "") + set(CNPY_TAG "") + set(_GIT_REPOSITORY_TAG) + set(_GIT_TAG_TAG) + message(STATUS "Using CNPY source in: ${CNPY_SOURCE_DIR}") +else () + set(CNPY_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") + set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY") + set(_GIT_TAG_TAG "GIT_TAG") +endif () + # Now add the external project include(ExternalProject) - ExternalProject_Add(CNPY PREFIX ${CMAKE_CURRENT_BINARY_DIR} TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp - GIT_REPOSITORY ${CNPY_URL} - GIT_TAG ${CNPY_TAG} - SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src + ${_GIT_REPOSITORY_TAG} ${CNPY_URL} + ${_GIT_TAG_TAG} ${CNPY_TAG} + SOURCE_DIR ${CNPY_SOURCE_DIR} BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${CNPY_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 diff --git a/superbuild/conduit/CMakeLists.txt b/superbuild/conduit/CMakeLists.txt index 0c01bf48009..307a5c31bd1 100644 --- a/superbuild/conduit/CMakeLists.txt +++ b/superbuild/conduit/CMakeLists.txt @@ -68,14 +68,27 @@ if (CONDUIT_ENABLE_FORTRAN) "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}") endif () +if (CONDUIT_CUSTOM_SOURCE_DIR) + set(CONDUIT_SOURCE_DIR "${CONDUIT_CUSTOM_SOURCE_DIR}") + set(CONDUIT_URL "") + set(CONDUIT_TAG "") + set(_GIT_REPOSITORY_TAG) + set(_GIT_TAG_TAG) + message(STATUS "Using CONDUIT source in: ${CONDUIT_SOURCE_DIR}") +else () + set(CONDUIT_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") + set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY") + set(_GIT_TAG_TAG "GIT_TAG") +endif () + include(ExternalProject) ExternalProject_Add(CONDUIT PREFIX ${CMAKE_CURRENT_BINARY_DIR} TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp - GIT_REPOSITORY ${CONDUIT_URL} - GIT_TAG ${CONDUIT_TAG} - SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src + ${_GIT_REPOSITORY_TAG} ${CONDUIT_URL} + ${_GIT_TAG_TAG} ${CONDUIT_TAG} + SOURCE_DIR ${CONDUIT_SOURCE_DIR} SOURCE_SUBDIR src BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${CONDUIT_CMAKE_INSTALL_PREFIX} diff --git a/superbuild/cub/CMakeLists.txt b/superbuild/cub/CMakeLists.txt index 84af8a4ec71..e3f4ef9ee06 100644 --- a/superbuild/cub/CMakeLists.txt +++ b/superbuild/cub/CMakeLists.txt @@ -18,18 +18,31 @@ set(CUB_CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" # Handle the install of CUB include(ExternalProject) +if (CUB_CUSTOM_SOURCE_DIR) + set(CUB_SOURCE_DIR "${CUB_CUSTOM_SOURCE_DIR}") + set(CUB_URL "") + set(CUB_TAG "") + set(_GIT_REPOSITORY_TAG) + set(_GIT_TAG_TAG) + message(STATUS "Using CUB source in: ${CUB_SOURCE_DIR}") +else () + set(CUB_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") + set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY") + set(_GIT_TAG_TAG "GIT_TAG") +endif () + ExternalProject_Add(CUB PREFIX ${CMAKE_CURRENT_BINARY_DIR} TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp - GIT_REPOSITORY ${CUB_URL} - GIT_TAG ${CUB_TAG} - SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src + ${_GIT_REPOSITORY_TAG} ${CUB_URL} + ${_GIT_TAG_TAG} ${CUB_TAG} + SOURCE_DIR ${CUB_SOURCE_DIR} BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/CUBCMakeLists.txt - ${CMAKE_CURRENT_BINARY_DIR}/src/CMakeLists.txt && + ${CUB_SOURCE_DIR}/CMakeLists.txt && patch -p1 < ${LBANN_SRC_DIR}/external/cub/cub_enable_alloc_free_logging.patch 
INSTALL_DIR ${CUB_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 diff --git a/superbuild/hdf5/CMakeLists.txt b/superbuild/hdf5/CMakeLists.txt index 3bdd08d1d4a..641cacdf4ea 100644 --- a/superbuild/hdf5/CMakeLists.txt +++ b/superbuild/hdf5/CMakeLists.txt @@ -82,14 +82,27 @@ if (HDF5_BUILD_FORTRAN) "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}") endif () +if (HDF5_CUSTOM_SOURCE_DIR) + set(HDF5_SOURCE_DIR "${HDF5_CUSTOM_SOURCE_DIR}") + set(HDF5_URL "") + set(HDF5_TAG "") + set(_GIT_REPOSITORY_TAG) + set(_GIT_TAG_TAG) + message(STATUS "Using HDF5 source in: ${HDF5_SOURCE_DIR}") +else () + set(HDF5_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") + set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY") + set(_GIT_TAG_TAG "GIT_TAG") +endif () + include(ExternalProject) ExternalProject_Add(HDF5 PREFIX ${CMAKE_CURRENT_BINARY_DIR} TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp - GIT_REPOSITORY ${HDF5_URL} - GIT_TAG ${HDF5_TAG} - SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src + ${_GIT_REPOSITORY_TAG} ${HDF5_URL} + ${_GIT_TAG_TAG} ${HDF5_TAG} + SOURCE_DIR ${HDF5_SOURCE_DIR} BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${HDF5_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 diff --git a/superbuild/hydrogen/CMakeLists.txt b/superbuild/hydrogen/CMakeLists.txt index 3f6d816c7a7..d68e68965a7 100644 --- a/superbuild/hydrogen/CMakeLists.txt +++ b/superbuild/hydrogen/CMakeLists.txt @@ -89,15 +89,28 @@ endif () set(HYDROGEN_TAG "hydrogen" CACHE STRING "The git tag or hash to checkout for Hydrogen") +if (HYDROGEN_CUSTOM_SOURCE_DIR) + set(HYDROGEN_SOURCE_DIR "${HYDROGEN_CUSTOM_SOURCE_DIR}") + set(HYDROGEN_URL "") + set(HYDROGEN_TAG "") + set(_GIT_REPOSITORY_TAG) + set(_GIT_TAG_TAG) + message(STATUS "Using HYDROGEN source in: ${HYDROGEN_SOURCE_DIR}") +else () + set(HYDROGEN_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") + set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY") + set(_GIT_TAG_TAG "GIT_TAG") +endif () + include(ExternalProject) ExternalProject_Add(HYDROGEN PREFIX ${CMAKE_CURRENT_BINARY_DIR} TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp - GIT_REPOSITORY ${HYDROGEN_URL} - GIT_TAG ${HYDROGEN_TAG} + ${_GIT_REPOSITORY_TAG} ${HYDROGEN_URL} + ${_GIT_TAG_TAG} ${HYDROGEN_TAG} + SOURCE_DIR ${HYDROGEN_SOURCE_DIR} ${_hydrogen_depends_tag} ${_HYDROGEN_DEPENDS} - SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${HYDROGEN_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 diff --git a/superbuild/jpeg-turbo/CMakeLists.txt b/superbuild/jpeg-turbo/CMakeLists.txt index e069a21a400..9971400e95e 100644 --- a/superbuild/jpeg-turbo/CMakeLists.txt +++ b/superbuild/jpeg-turbo/CMakeLists.txt @@ -23,15 +23,28 @@ set(JPEG-TURBO_TAG "master" set (JPEG-TURBO_CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" CACHE PATH "The installation location of LIBJPEG-TURBO.") +if (JPEG_TURBO_CUSTOM_SOURCE_DIR) + set(JPEG_TURBO_SOURCE_DIR "${JPEG_TURBO_CUSTOM_SOURCE_DIR}") + set(JPEG-TURBO_URL "") + set(JPEG-TURBO_TAG "") + set(_GIT_REPOSITORY_TAG) + set(_GIT_TAG_TAG) + message(STATUS "Using JPEG-TURBO source in: ${JPEG_TURBO_SOURCE_DIR}") +else () + set(JPEG_TURBO_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") + set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY") + set(_GIT_TAG_TAG "GIT_TAG") +endif () + # Handle the install of LIBJPEG-TURBO include(ExternalProject) ExternalProject_Add(JPEG-TURBO PREFIX ${CMAKE_CURRENT_BINARY_DIR} TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp - GIT_REPOSITORY ${JPEG-TURBO_URL} - GIT_TAG 
${JPEG-TURBO_TAG} - SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src + ${_GIT_REPOSITORY_TAG} ${JPEG-TURBO_URL} + ${_GIT_TAG_TAG} ${JPEG-TURBO_TAG} + SOURCE_DIR ${JPEG_TURBO_SOURCE_DIR} BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${JPEG-TURBO_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 diff --git a/superbuild/openblas/CMakeLists.txt b/superbuild/openblas/CMakeLists.txt index 6b305efb86b..9e0ba294218 100644 --- a/superbuild/openblas/CMakeLists.txt +++ b/superbuild/openblas/CMakeLists.txt @@ -64,11 +64,25 @@ set(OPENBLAS_ARCH_COMMAND "${_TMP_OPENBLAS_ARCH_COMMAND}" # instead. find_program(GNU_MAKE_PROGRAM make) +if (OPENBLAS_CUSTOM_SOURCE_DIR) + set(OPENBLAS_SOURCE_DIR "${OPENBLAS_CUSTOM_SOURCE_DIR}") + set(OPENBLAS_URL "") + set(OPENBLAS_TAG "") + set(_GIT_REPOSITORY_TAG) + set(_GIT_TAG_TAG) + message(STATUS "Using OPENBLAS source in: ${OPENBLAS_SOURCE_DIR}") +else () + set(OPENBLAS_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") + set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY") + set(_GIT_TAG_TAG "GIT_TAG") +endif () + include (ExternalProject) ExternalProject_Add(OPENBLAS PREFIX ${CMAKE_CURRENT_BINARY_DIR} - GIT_REPOSITORY ${OPENBLAS_URL} - GIT_TAG ${OPENBLAS_TAG} + ${_GIT_REPOSITORY_TAG} ${OPENBLAS_URL} + ${_GIT_TAG_TAG} ${OPENBLAS_TAG} + SOURCE_DIR ${OPENBLAS_SOURCE_DIR} BUILD_IN_SOURCE 1 INSTALL_DIR ${OPENBLAS_CMAKE_INSTALL_PREFIX} CONFIGURE_COMMAND "" diff --git a/superbuild/opencv/CMakeLists.txt b/superbuild/opencv/CMakeLists.txt index 493318c3333..7d1b4ede210 100644 --- a/superbuild/opencv/CMakeLists.txt +++ b/superbuild/opencv/CMakeLists.txt @@ -44,15 +44,28 @@ endif () set(OPENCV_TAG "4.1.0" CACHE STRING "The git tag or hash to checkout for OpenCV") +if (OPENCV_CUSTOM_SOURCE_DIR) + set(OPENCV_SOURCE_DIR "${OPENCV_CUSTOM_SOURCE_DIR}") + set(OPENCV_URL "") + set(OPENCV_TAG "") + set(_GIT_REPOSITORY_TAG) + set(_GIT_TAG_TAG) + message(STATUS "Using OPENCV source in: ${OPENCV_SOURCE_DIR}") +else () + set(OPENCV_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") + set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY") + set(_GIT_TAG_TAG "GIT_TAG") +endif () + include(ExternalProject) ExternalProject_Add(OPENCV PREFIX ${CMAKE_CURRENT_BINARY_DIR} TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp - GIT_REPOSITORY ${OPENCV_URL} - GIT_TAG ${OPENCV_TAG} + ${_GIT_REPOSITORY_TAG} ${OPENCV_URL} + ${_GIT_TAG_TAG} ${OPENCV_TAG} + SOURCE_DIR ${OPENCV_SOURCE_DIR} ${_opencv_depends_tag} ${_OPENCV_DEPENDS} - SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build INSTALL_DIR ${OPENCV_CMAKE_INSTALL_PREFIX} USES_TERMINAL_BUILD 1 diff --git a/superbuild/protobuf/CMakeLists.txt b/superbuild/protobuf/CMakeLists.txt index 9b9656b87b9..3bfba417c4a 100644 --- a/superbuild/protobuf/CMakeLists.txt +++ b/superbuild/protobuf/CMakeLists.txt @@ -41,14 +41,27 @@ set(PROTOBUF_CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" set(PROTOBUF_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING "The build type for PROTOBUF.") +if (PROTOBUF_CUSTOM_SOURCE_DIR) + set(PROTOBUF_SOURCE_DIR "${PROTOBUF_CUSTOM_SOURCE_DIR}") + set(PROTOBUF_URL "") + set(PROTOBUF_TAG "") + set(_GIT_REPOSITORY_TAG) + set(_GIT_TAG_TAG) + message(STATUS "Using PROTOBUF source in: ${PROTOBUF_SOURCE_DIR}") +else () + set(PROTOBUF_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") + set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY") + set(_GIT_TAG_TAG "GIT_TAG") +endif () + include(ExternalProject) ExternalProject_Add(PROTOBUF PREFIX "${CMAKE_CURRENT_BINARY_DIR}" TMP_DIR "${CMAKE_CURRENT_BINARY_DIR}/tmp" STAMP_DIR 
"${CMAKE_CURRENT_BINARY_DIR}/stamp" - GIT_REPOSITORY ${PROTOBUF_URL} - GIT_TAG ${PROTOBUF_TAG} - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src" + ${_GIT_REPOSITORY_TAG} ${PROTOBUF_URL} + ${_GIT_TAG_TAG} ${PROTOBUF_TAG} + SOURCE_DIR ${PROTOBUF_SOURCE_DIR} SOURCE_SUBDIR cmake BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/build" INSTALL_DIR "${PROTOBUF_CMAKE_INSTALL_PREFIX}" From e51bc7725c27f5481240ceb429a6f2a2470dd4aa Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 16 Aug 2019 09:51:00 -0700 Subject: [PATCH 240/634] add tmp and stamp dirs for openblas --- superbuild/openblas/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/superbuild/openblas/CMakeLists.txt b/superbuild/openblas/CMakeLists.txt index 9e0ba294218..c0494e454c8 100644 --- a/superbuild/openblas/CMakeLists.txt +++ b/superbuild/openblas/CMakeLists.txt @@ -83,8 +83,10 @@ ExternalProject_Add(OPENBLAS ${_GIT_REPOSITORY_TAG} ${OPENBLAS_URL} ${_GIT_TAG_TAG} ${OPENBLAS_TAG} SOURCE_DIR ${OPENBLAS_SOURCE_DIR} - BUILD_IN_SOURCE 1 + TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp + STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp INSTALL_DIR ${OPENBLAS_CMAKE_INSTALL_PREFIX} + BUILD_IN_SOURCE 1 CONFIGURE_COMMAND "" UPDATE_COMMAND "" USES_TERMINAL_BUILD 1 From b47a13ca898239efae2364dd0da9789cb09684e1 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Fri, 16 Aug 2019 11:46:06 -0700 Subject: [PATCH 241/634] reworking to make it faster --- model_zoo/jag_utils/select_samples.cpp | 135 +++++++++++++++---------- 1 file changed, 83 insertions(+), 52 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index 527ab8be1b2..e4994973f1f 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -16,85 +16,57 @@ using namespace std; using namespace lbann; -#undef SANITY -#define SANITY +//================================================================================================= +// sanity check the cmd line +void check_cmd_line(); +std::string help_msg(); + +void read_mapping_file(std::string &mapping_fn, unordered_map> &sample_mapping, unordered_map> &sample_mapping_v, unordered_map& string_to_index); + +//================================================================================================= int main(int argc, char **argv) { int random_seed = lbann_default_random_seed; world_comm_ptr comm = initialize(argc, argv, random_seed); - bool master = comm->am_world_master(); - int rank, np; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &np); if (np!= 1) { - if (master) { - LBANN_ERROR("please run with a single processor"); - } + LBANN_ERROR("please run with a single processor"); } options *opts = options::get(); opts->init(argc, argv); - std::stringstream err; - - // sanity check the cmd line - if (! 
(opts->has_string("index_fn") && opts->has_string("sample_mapping_fn") - && opts->has_int("num_samples") && opts->has_int("random_seed") - && opts->has_string("output_fn"))) { - if (master) { - LBANN_ERROR("usage: select_samples --index_fn= --sample_mapping_fn= --num_samples= --output_fn= --random_seed=\n\n"); - } - exit(9); + // check for proper invocation, print help message + if (opts->has_bool("h") || opts->has_bool("help") || argc == 1) { + std::cout << help_msg(); + MPI_Finalize(); + exit(0); } + check_cmd_line(); + // get all required options const std::string index_fn = opts->get_string("index_fn"); - const std::string mapping_fn = opts->get_string("sample_mapping_fn"); - const std::string output_fn = opts->get_string("output_fn"); - size_t num_samples = opts->get_int("num_samples"); + const std::string mapping_fn = opts->get_string("mapping_fn"); + const std::string output_dir = opts->get_string("output_dir"); + const std::string output_base = opts->get_string("output_base_fn"); + size_t num_samples = opts->get_int("num_samples_per_list"); + size_t num_lists = opts->get_int("num_lists"); int seed = opts->get_int("random_seed"); - //========================================================================== // read previously computed mapping: sample_id (string) -> local_index - //========================================================================== - cerr << "reading sample mapping\n"; // maps filename to { sample_ids } unordered_map> sample_mapping; unordered_map> sample_mapping_v; // maps a sampleID to a local idex unordered_map string_to_index; - ifstream in(mapping_fn.c_str()); - string filename; - string sample_id; - string line; - size_t n = 0; - while (getline(in, line)) { - if (!line.size()) { - break; - } - stringstream s(line); - s >> filename; - ++n; - int hh = 0; - while (s >> sample_id) { - sample_mapping[filename].insert(sample_id); - sample_mapping_v[filename].push_back(sample_id); - if (string_to_index.find(sample_id) != string_to_index.end()) { - err << "duplicate sample_ID: " << sample_id << " in file: " << filename; - LBANN_ERROR(err.str()); - } - string_to_index[sample_id] = hh++; - } - } - in.close(); - cerr << "num lines processed: " << n << "\n"; + read_mapping_file(mapping_fn, sample_mapping, sample_mapping_v, string_to_index); //========================================================================== - // master builds two maps: > maps a filename to the + // build two maps: > maps a filename to the // set of indices (not sample_ids; that comes later!) that are to be // included and excluded - if (master) { // your job, should you decide to accept it, is to fill in these maps std::unordered_map> index_map_keep; @@ -283,7 +255,66 @@ int main(int argc, char **argv) { out << total_good << " " << total_bad << " " << num_include_files << "\n" << base_dir << "\n" << sout.str(); - } + MPI_Finalize(); return EXIT_SUCCESS; } + +// sanity check the cmd line +void check_cmd_line() { + options *opts = options::get(); + std::stringstream err; + if (! 
(opts->has_string("index_fn") && opts->has_string("mapping_fn") + && opts->has_int("num_samples_per_list") && && opts->has_int("num_lists") + && opts->has_int("random_seed") + && opts->has_string("output_dir") && opts->has_string("output_base_fn"))) { + std::cout << help_message(); + MPI_Finalize(); + exit(0); + } +} + +std::string help_msg() { + std::stringstream err; + err << "usage: select_samples --index_fn= --sample_mapping_fn= --num_samples_per_list= --num_lists --output_dir= --output_base_name= --random_seed=\n\n"; + err << "example invocation:\n"; + err << "select_samples \n"; + err << " --index_fn=/p/gpfs1/brainusr/datasets/10MJAG/1M_B/index.txt\n"; + err << " --mapping_fn=/p/gpfs1/brainusr/datasets/10MJAG/1M_B/id_mapping.txt\n"; + err << " --num_samples_per_list=1000\n"; + err << " --num_lists=4\n"; + err << " --output_dir=/p/gpfs1/brainusr/datasets/10MJAG/1M_B\n"; + err << " --output_base_fn=my_samples.txt\n"; + err << " --random_seed=42\n"; + err << "\n\n"; + return err.str(); +} + +void read_mapping_file(std::string &mapping_fn, unordered_map> &sample_mapping, unordered_map> &sample_mapping_v, unordered_map& string_to_index) { + cerr << "reading sample mapping\n"; + ifstream in(mapping_fn.c_str()); + string filename; + string sample_id; + string line; + size_t n = 0; + while (getline(in, line)) { + if (!line.size()) { + break; + } + stringstream s(line); + s >> filename; + ++n; + int hh = 0; + while (s >> sample_id) { + sample_mapping[filename].insert(sample_id); + sample_mapping_v[filename].push_back(sample_id); + if (string_to_index.find(sample_id) != string_to_index.end()) { + err << "duplicate sample_ID: " << sample_id << " in file: " << filename; + LBANN_ERROR(err.str()); + } + string_to_index[sample_id] = hh++; + } + } + in.close(); + cerr << "FINISHED reading sample mapping: num lines processed: " << n << "\n"; +} From 260aaec6fc132ffa0e8edbd88164653863987b3f Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Sat, 17 Aug 2019 07:50:53 -0700 Subject: [PATCH 242/634] compiles (after I added a hack to sample_list_impl), but not yet tested. I believe all code is in place. 
--- model_zoo/jag_utils/select_samples.cpp | 544 ++++++++++++++----------- 1 file changed, 314 insertions(+), 230 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index e4994973f1f..1d4bd841a87 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -16,266 +16,130 @@ using namespace std; using namespace lbann; -//================================================================================================= +//============================================================================ // sanity check the cmd line void check_cmd_line(); -std::string help_msg(); +// returns the help message +string help_msg(); -void read_mapping_file(std::string &mapping_fn, unordered_map> &sample_mapping, unordered_map> &sample_mapping_v, unordered_map& string_to_index); +// tests that there are sufficient samples to build the lists +void sanity_test_request(); -//================================================================================================= -int main(int argc, char **argv) { - int random_seed = lbann_default_random_seed; - world_comm_ptr comm = initialize(argc, argv, random_seed); - MPI_Comm_size(MPI_COMM_WORLD, &np); +void read_mapping_file( + unordered_map> &sample_mapping, + unordered_map> &sample_mapping_v, + unordered_map& string_to_index); - if (np!= 1) { - LBANN_ERROR("please run with a single processor"); - } +void build_index_maps( + unordered_map> &sample_mapping, + unordered_map> &index_map_keep, + unordered_map> &index_map_exclude, + unordered_map &string_to_index, + unordered_map &filename_data); - options *opts = options::get(); - opts->init(argc, argv); +void divide_selected_samples( + const unordered_map> &index_map_keep, + vector>> &sets); - // check for proper invocation, print help message - if (opts->has_bool("h") || opts->has_bool("help") || argc == 1) { - std::cout << help_msg(); - MPI_Finalize(); - exit(0); - } - check_cmd_line(); - - // get all required options - const std::string index_fn = opts->get_string("index_fn"); - const std::string mapping_fn = opts->get_string("mapping_fn"); - const std::string output_dir = opts->get_string("output_dir"); - const std::string output_base = opts->get_string("output_base_fn"); - size_t num_samples = opts->get_int("num_samples_per_list"); - size_t num_lists = opts->get_int("num_lists"); - int seed = opts->get_int("random_seed"); - - // read previously computed mapping: sample_id (string) -> local_index - // maps filename to { sample_ids } - unordered_map> sample_mapping; - unordered_map> sample_mapping_v; - // maps a sampleID to a local idex - unordered_map string_to_index; - - read_mapping_file(mapping_fn, sample_mapping, sample_mapping_v, string_to_index); - - //========================================================================== - // build two maps: > maps a filename to the - // set of indices (not sample_ids; that comes later!) 
that are to be - // included and excluded - - // your job, should you decide to accept it, is to fill in these maps - std::unordered_map> index_map_keep; - std::unordered_map> index_map_exclude; - - //open input file - in.open(index_fn); - if (!in) { - err << "failed to open " << index_fn << " for reading\n"; - LBANN_ERROR(err.str()); - MPI_Abort(MPI_COMM_WORLD, -1); - } - getline(in, line); - if (line != "CONDUIT_HDF5_EXCLUSION") { - LBANN_ERROR("error: 1st line in index file must contain: CONDUIT_HDF5_EXCLUSION\n"); - } +//todo: some of these should be const +void write_sample_list( + int n, + vector>> &subsets, + unordered_map> &sample_mapping_v, + std::unordered_map &filename_data); - int num_valid, num_invalid, num_files; - in >> num_valid >> num_invalid >> num_files; - getline(in, line); //discard newline - string base_dir; - getline(in, base_dir); - cerr << "input index file contains " << num_valid << " valid samples\n"; - - // generate random indices; note that these are global indices - cerr << "generating random indicess ...\n"; - unordered_set random_indices; - srandom(seed); - while (true) { - int v = random() % num_valid; - random_indices.insert(v); - if (random_indices.size() == num_samples) { - break; - } - } +//============================================================================ +int main(int argc, char **argv) { + int random_seed = lbann_default_random_seed; + world_comm_ptr comm = initialize(argc, argv, random_seed); + int np = comm->get_procs_in_world(); - // loop over each entry from in input index file; determine which, if any, - // local indices will be added to the INCLUSION index - int first = 0; - size_t good, bad; - num_files = 0; - string fn; - std::unordered_map data; - while (! in.eof()) { - line = ""; - getline(in, line); - if (!line.size()) { - break; - } - ++num_files; - if (num_files % 1000 == 0) cerr << num_files/1000 << "K input lines processed\n"; - stringstream s(line); - s >> fn >> good >> bad; - data[fn] = line; - const int total = good+bad; - index_map_exclude[fn]; - index_map_keep[fn]; - while (s >> sample_id) { - if (sample_mapping[fn].find(sample_id) == sample_mapping[fn].end()) { - LBANN_ERROR("failed to find " + sample_id + " in sample_mapping"); - } - index_map_exclude[fn].insert(string_to_index[sample_id]); - } - if (index_map_exclude[fn].size() != bad) { - err << "exclude.size(): " << index_map_exclude[fn].size() << " should be: " << bad << " but isn't\n"; - LBANN_ERROR(err.str()); - } + try { - int local_valid_index = 0; - for (int local_index=0; local_indexinit(argc, argv); - //===================================================================== - // write EXCLUSION file - //===================================================================== - //open output file and write 1st header line - const std::string name1 = output_fn + "_bar"; - std::cerr << "\nWRITING output file: " << name1 << "\n"; - std::ofstream out(name1.c_str()); - if (!out) { - err << "failed to open " << name1 << " for writing\n"; - LBANN_ERROR(err.str()); + // check for proper invocation, print help message + if (opts->get_bool("h") || opts->get_bool("help") || argc == 1) { + cout << help_msg(); + return EXIT_FAILURE; } - out<< "CONDUIT_HDF5_EXCLUSION\n"; - - std::stringstream sout; - size_t total_good = 0; - size_t total_bad = 0; - size_t num_include_files = 0; - - for (auto t : index_map_exclude) { - filename = t.first; - if (data.find(filename) == data.end()) { - err << "data.find(" << filename << ") failed\n"; - for (auto tt : data) { - err << tt.first << 
"\n"; - } - LBANN_ERROR(err.str()); - } - // get total samples for the current file - std::stringstream s5(data[filename]); - s5 >> fn >> good >> bad; - size_t total = good+bad; - - const std::unordered_set &exclude_me = t.second; - int excluded = exclude_me.size(); - int included = total - excluded; - if (included) { - ++num_include_files; - total_good += included; - total_bad += excluded; - sout << filename << " " << included << " " << excluded; - for (auto t3 : exclude_me) { - sout << " " << sample_mapping_v[fn][t3]; - } - sout << "\n"; - } + // sanity checks + check_cmd_line(); + + // ensure we have enough samples to fullfill the requirements + sanity_test_request(); + + // maps filename to { sample_ids } + unordered_map> sample_mapping; + // maps filename to [ sample_ids ] + unordered_map> sample_mapping_v; + // maps a sampleID to a local idex + unordered_map string_to_index; + // note: the above mappings contain sample IDs for all samples, + // whether successful or failed + + read_mapping_file(sample_mapping, sample_mapping_v, string_to_index); + + unordered_map> index_map_keep; + unordered_map> index_map_exclude; + std::unordered_map filename_data; + build_index_maps(sample_mapping, index_map_keep, index_map_exclude, string_to_index, filename_data); + + // divide the selected samples into num_list sets + int num_lists = opts->get_int("num_lists"); + vector>> subsets(num_lists); + divide_selected_samples(index_map_keep, subsets); + + const string output_dir = opts->get_string("output_dir"); + const string output_base = opts->get_string("output_base_fn"); + for (int n=0; n> fn >> good >> bad; - size_t total = good+bad; - const std::unordered_set &include_me = t.second; - int included = include_me.size(); - int excluded = total - included; - - if (included) { - ++num_include_files; - total_good += included; - total_bad += excluded; - sout << filename << " " << included << " " << excluded; - for (auto t3 : include_me) { - sout << " " << sample_mapping_v[fn][t3]; - } - sout << "\n"; + } catch (lbann::exception& e) { + if (options::get()->get_bool("stack_trace_to_file")) { + ostringstream ss("stack_trace"); + const auto& rank = get_rank_in_world(); + if (rank >= 0) { + ss << "_rank" << rank; } + ss << ".txt"; + ofstream fs(ss.str()); + e.print_report(fs); } + El::ReportException(e); + return EXIT_FAILURE; + } catch (std::exception& e) { + El::ReportException(e); + return EXIT_FAILURE; + } - out << total_good << " " << total_bad << " " << num_include_files - << "\n" << base_dir << "\n" << sout.str(); - MPI_Finalize(); return EXIT_SUCCESS; } // sanity check the cmd line void check_cmd_line() { options *opts = options::get(); - std::stringstream err; + stringstream err; if (! 
(opts->has_string("index_fn") && opts->has_string("mapping_fn") - && opts->has_int("num_samples_per_list") && && opts->has_int("num_lists") + && opts->has_int("num_samples_per_list") && opts->has_int("num_lists") && opts->has_int("random_seed") && opts->has_string("output_dir") && opts->has_string("output_base_fn"))) { - std::cout << help_message(); - MPI_Finalize(); + cout << help_msg(); exit(0); } } -std::string help_msg() { - std::stringstream err; +string help_msg() { + stringstream err; err << "usage: select_samples --index_fn= --sample_mapping_fn= --num_samples_per_list= --num_lists --output_dir= --output_base_name= --random_seed=\n\n"; err << "example invocation:\n"; err << "select_samples \n"; @@ -290,8 +154,9 @@ std::string help_msg() { return err.str(); } -void read_mapping_file(std::string &mapping_fn, unordered_map> &sample_mapping, unordered_map> &sample_mapping_v, unordered_map& string_to_index) { - cerr << "reading sample mapping\n"; +void read_mapping_file(unordered_map> &sample_mapping, unordered_map> &sample_mapping_v, unordered_map& string_to_index) { + cerr << "starting read_mapping_file\n"; + const string mapping_fn = options::get()->get_string("mapping_fn"); ifstream in(mapping_fn.c_str()); string filename; string sample_id; @@ -309,12 +174,231 @@ void read_mapping_file(std::string &mapping_fn, unordered_map> maps a filename to the +// set of indices (not sample_ids; that comes later!) that are to be +// included and excluded +void build_index_maps( + unordered_map> &sample_mapping, + unordered_map> &index_map_keep, + unordered_map> &index_map_exclude, + unordered_map& string_to_index, + unordered_map &data) { + + cout << "starting build_index_maps\n"; + + int samples_per_list = options::get()->get_int("num_samples_per_list"); + int num_lists = options::get()->get_int("num_lists"); + size_t num_samples = samples_per_list * num_lists; + + //open input file + const string index_fn = options::get()->get_string("index_fn").c_str(); + ifstream in(index_fn.c_str()); + if (!in) { + LBANN_ERROR("failed to open " + index_fn + " for reading"); + } + + string line; + getline(in, line); + if (line != "CONDUIT_HDF5_EXCLUSION") { + LBANN_ERROR("error: 1st line in index file must contain: CONDUIT_HDF5_EXCLUSION\n"); + } + + int num_valid, num_invalid, num_files; + in >> num_valid >> num_invalid >> num_files; + getline(in, line); //discard newline + string base_dir; + getline(in, base_dir); + cerr << "input index file contains " << num_valid << " valid samples\n"; + + cerr << "generating random indicess ...\n"; + unordered_set random_indices; + srandom(options::get()->get_int("seed")); + while (true) { + int v = random() % num_valid; + random_indices.insert(v); + if (random_indices.size() == num_samples) { + break; + } + } + + // loop over each entry from in input index file; determine which, if any, + // local indices will be added to the INCLUSION index + int first = 0; + size_t good, bad; + num_files = 0; + string fn; + while (! 
in.eof()) { + line = ""; + getline(in, line); + if (!line.size()) { + break; + } + ++num_files; + if (num_files % 1000 == 0) cerr << num_files/1000 << "K input lines processed\n"; + stringstream s(line); + s >> fn >> good >> bad; + data[fn] = line; + const int total = good+bad; + index_map_exclude[fn]; + index_map_keep[fn]; + string sample_id; + while (s >> sample_id) { + if (sample_mapping[fn].find(sample_id) == sample_mapping[fn].end()) { + LBANN_ERROR("failed to find " + to_string(sample_id) + " in sample_mapping"); + } + index_map_exclude[fn].insert(string_to_index[sample_id]); + } + if (index_map_exclude[fn].size() != bad) { + LBANN_ERROR("exclude.size(): " + to_string(index_map_exclude[fn].size()) + " should be: " + to_string(bad) + " but isn't\n"); + } + + int local_valid_index = 0; + for (int local_index=0; local_indexget_string("index_fn").c_str(); + ifstream in(index_fn.c_str()); + if (!in) { + LBANN_ERROR("failed to open " + index_fn + " for reading"); + } + + string line; + getline(in, line); + if (line != "CONDUIT_HDF5_EXCLUSION") { + LBANN_ERROR("error: 1st line in index file must contain: CONDUIT_HDF5_EXCLUSION\n"); + } + + int num_valid, num_invalid, num_files; + in >> num_valid >> num_invalid >> num_files; + int samples_per_list = options::get()->get_int("num_samples_per_list"); + int num_lists = options::get()->get_int("num_lists"); + int num_samples = samples_per_list * num_lists; + if (num_samples > num_valid) { + LBANN_ERROR("you requested a total of " + to_string(num_samples) + " samples, but only " + to_string("num_valid") + " are available"); + } +} + +void divide_selected_samples( + const unordered_map> &index_map_keep, + vector>> &sets) { + size_t samples_per_list = options::get()->get_int("num_samples_per_list"); + size_t which = 0; + size_t count = 0; + size_t total = 0; + for (auto &it : index_map_keep) { + const string &filename = it.first; + const unordered_set &sample_ids = it.second; + for (auto &it2 : sample_ids) { + sets[which][filename].insert(it2); + ++total; + ++count; + if (count == samples_per_list) { + count = 0; + ++which; + } + } + } + + if (which != sets.size()) { + LBANN_ERROR("which != sets.size()"); + } + if (total != samples_per_list * sets.size()) { + LBANN_ERROR("samples_per_list * sets.size()"); + } +} + +void write_sample_list( + int n, + vector>> &subsets, + unordered_map> &sample_mapping_v, + std::unordered_map &filename_data) { + const string dir = options::get()->get_string("output_dir"); + const string fn = options::get()->get_string("output_base_fn"); + stringstream s; + s << dir << '/' << "t_" << n << '_' << fn; + ofstream out(s.str().c_str()); + if (!out) { + LBANN_ERROR("failed to open " + s.str() + " for writing"); + } + cout << "WRITING output file: " << s.str() << endl; + + out << "CONDUIT_HDF5_INCLUSION\n"; + stringstream s2; + size_t total_good = 0; + size_t total_bad = 0; + size_t num_include_files = 0; + stringstream sout; + for (auto &t : subsets[n]) { + const string &filename = t.first; + if (filename_data.find(filename) == filename_data.end()) { + #if 0 + err << "filename_data.find(" << filename << ") failed\n"; + for (auto tt : filename_data) { + err << tt.first << "\n"; + } + LBANN_ERROR(err.str()); + #endif + LBANN_ERROR("filename_data.find(" + filename + ") failed"); + } + + // get total samples for the current file + stringstream s5(filename_data[filename]); + int good, bad; + string fn_discard; + s5 >> fn_discard >> good >> bad; + size_t total = good+bad; + const unordered_set &include_me = t.second; + int 
included = include_me.size(); + int excluded = total - included; + + if (included) { + ++num_include_files; + total_good += included; + total_bad += excluded; + sout << filename << " " << included << " " << excluded; + for (auto t3 : include_me) { + sout << " " << sample_mapping_v[fn][t3]; + } + sout << "\n"; + } + } + + const string index_fn = options::get()->get_string("index_fn").c_str(); + ifstream in(index_fn.c_str()); + string line; + getline(in, line); + int num_valid, num_invalid, num_files; + in >> num_valid >> num_invalid >> num_files; + getline(in, line); //discard newline + string base_dir; + getline(in, base_dir); + + out << total_good << " " << total_bad << " " << num_include_files + << "\n" << base_dir << "\n" << sout.str(); } From f916248db5819e29fa1b848e686be5ec8299542f Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Sat, 17 Aug 2019 10:50:16 -0700 Subject: [PATCH 243/634] working version. --- model_zoo/jag_utils/select_samples.cpp | 64 ++++++++++++++++---------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index 1d4bd841a87..d007c0f4eeb 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -73,6 +73,15 @@ int main(int argc, char **argv) { // sanity checks check_cmd_line(); + // check that output directory exists and is writable + const string d = opts->get_string("output_dir") + "/ok_to_erase_me"; + ofstream testing(d.c_str()); + if (!testing) { + LBANN_ERROR("the output directory \"" + opts->get_string("output_dir") + "\" either doesn't exist or is not writable"); + } + testing.close(); + remove(d.c_str()); + // ensure we have enough samples to fullfill the requirements sanity_test_request(); @@ -142,20 +151,23 @@ string help_msg() { stringstream err; err << "usage: select_samples --index_fn= --sample_mapping_fn= --num_samples_per_list= --num_lists --output_dir= --output_base_name= --random_seed=\n\n"; err << "example invocation:\n"; - err << "select_samples \n"; - err << " --index_fn=/p/gpfs1/brainusr/datasets/10MJAG/1M_B/index.txt\n"; - err << " --mapping_fn=/p/gpfs1/brainusr/datasets/10MJAG/1M_B/id_mapping.txt\n"; - err << " --num_samples_per_list=1000\n"; - err << " --num_lists=4\n"; - err << " --output_dir=/p/gpfs1/brainusr/datasets/10MJAG/1M_B\n"; - err << " --output_base_fn=my_samples.txt\n"; + err << "select_samples \\\n"; + err << " --index_fn=/p/gpfs1/brainusr/datasets/100M/index.txt \\\n"; + err << " --mapping_fn=/p/gpfs1/brainusr/datasets/100M/id_mapping.txt \\\n"; + err << " --num_samples_per_list=100000 \\\n"; + err << " --num_lists=640 \\\n"; + err << " --output_dir=/p/gpfs1/brainusr/datasets/100M/1M_B \\\n"; + err << " --output_base_fn=my_samples.txt \\\n"; err << " --random_seed=42\n"; err << "\n\n"; + err << "NOTE: output directory must exist prior to running this code\n"; + return err.str(); } void read_mapping_file(unordered_map> &sample_mapping, unordered_map> &sample_mapping_v, unordered_map& string_to_index) { cerr << "starting read_mapping_file\n"; + double tm1 = get_time(); const string mapping_fn = options::get()->get_string("mapping_fn"); ifstream in(mapping_fn.c_str()); string filename; @@ -180,7 +192,8 @@ void read_mapping_file(unordered_map> &sample_mapp } } in.close(); - cerr << " FINISHED reading sample mapping: num lines processed: " << n << "\n"; + double tm2 = get_time() - tm1; + cerr << " FINISHED reading sample mapping: num lines processed: " << n << "; time: " << tm2 << "\n"; } // build 
two maps: > maps a filename to the @@ -194,6 +207,7 @@ void build_index_maps( unordered_map<string, string> &data) { cout << "starting build_index_maps\n"; + double tm1 = get_time(); int samples_per_list = options::get()->get_int("num_samples_per_list"); int num_lists = options::get()->get_int("num_lists"); @@ -217,11 +231,13 @@ void build_index_maps( getline(in, line); //discard newline string base_dir; getline(in, base_dir); + options::get()->set_option("base_dir", base_dir); cerr << "input index file contains " << num_valid << " valid samples\n"; - cerr << "generating random indicess ...\n"; + cerr << "generating random indices ...\n"; + double tm2 = get_time(); unordered_set<int> random_indices; - srandom(options::get()->get_int("seed")); + srandom(options::get()->get_int("random_seed")); while (true) { int v = random() % num_valid; random_indices.insert(v); @@ -229,6 +245,7 @@ void build_index_maps( break; } } + cerr << " FINISHED generating random indices; time: " << get_time() - tm2 << endl; // loop over each entry from in input index file; determine which, if any, // local indices will be added to the INCLUSION index @@ -278,7 +295,7 @@ void build_index_maps( if (index_map_exclude.size() != index_map_keep.size()) { LBANN_ERROR("index_map_exclude.size() != index_map_keep.size()"); } - cout << " FINISHEDbuild_index_maps\n"; + cout << " FINISHED build_index_maps; time: " << get_time() - tm1 << endl; } void sanity_test_request() { @@ -341,7 +358,7 @@ void write_sample_list( const string dir = options::get()->get_string("output_dir"); const string fn = options::get()->get_string("output_base_fn"); stringstream s; - s << dir << '/' << "t_" << n << '_' << fn; + s << dir << '/' << "t" << n << '_' << fn; ofstream out(s.str().c_str()); if (!out) { LBANN_ERROR("failed to open " + s.str() + " for writing"); @@ -370,9 +387,10 @@ void write_sample_list( // get total samples for the current file stringstream s5(filename_data[filename]); int good, bad; - string fn_discard; - s5 >> fn_discard >> good >> bad; + string fn2; + s5 >> fn2 >> good >> bad; size_t total = good+bad; + const unordered_set<int> &include_me = t.second; int included = include_me.size(); int excluded = total - included; @@ -382,23 +400,19 @@ void write_sample_list( sout << filename << " " << included << " " << excluded; - for (auto t3 : include_me) { - sout << " " << sample_mapping_v[fn][t3]; + for (auto &t3 : include_me) { + if (sample_mapping_v.find(fn2) == sample_mapping_v.end()) { + LBANN_ERROR("failed to find the key: " + fn2 + " in sample_mapping_v map"); + } + sout << " " << sample_mapping_v[fn2][t3]; } sout << "\n"; } } - const string index_fn = options::get()->get_string("index_fn").c_str(); - ifstream in(index_fn.c_str()); - string line; - getline(in, line); - int num_valid, num_invalid, num_files; - in >> num_valid >> num_invalid >> num_files; - getline(in, line); //discard newline - string base_dir; - getline(in, base_dir); + const string base_dir = options::get()->get_string("base_dir"); out << total_good << " " << total_bad << " " << num_include_files << "\n" << base_dir << "\n" << sout.str(); + out.close(); }
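Patch 243 parses the three-part header of a CONDUIT_HDF5_EXCLUSION index file in both sanity_test_request() and build_index_maps(), and caches base_dir in the options store so that write_sample_list() no longer re-reads the file. The layout both readers assume can be summarized in a small sketch; the struct and function names below are illustrative, not part of the patch.

#include <fstream>
#include <stdexcept>
#include <string>

// Header layout: line 1 is the literal marker, line 2 holds three counts,
// line 3 is the base directory that the listed sample filenames are relative to.
struct index_header {
  int num_valid;         // samples that may be selected
  int num_invalid;       // failed samples, always excluded
  int num_files;         // number of sample files listed in the index
  std::string base_dir;  // common prefix for the listed filenames
};

index_header parse_index_header(std::ifstream &in) {
  index_header h;
  std::string line;
  getline(in, line);
  if (line != "CONDUIT_HDF5_EXCLUSION") {
    throw std::runtime_error("1st line must contain: CONDUIT_HDF5_EXCLUSION");
  }
  in >> h.num_valid >> h.num_invalid >> h.num_files;
  getline(in, line); // discard the newline left behind by operator>>
  getline(in, h.base_dir);
  return h;
}

From 5732e4dedeb6d60b83bbf1d8fb0da78e97eaf4c2 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Sat, 17 Aug 2019 13:18:12 -0700 Subject: [PATCH 244/634] occurrences of to_string(..) -> std::to_string(..). Also fixed a couple of to_string("a string").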
This resolves namespace collision between lbann::to_string and std::to_string --- model_zoo/jag_utils/select_samples.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index d007c0f4eeb..cf7b40af967 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -186,7 +186,7 @@ void read_mapping_file(unordered_map> &sample_mapp sample_mapping[filename].insert(sample_id); sample_mapping_v[filename].push_back(sample_id); if (string_to_index.find(sample_id) != string_to_index.end()) { - LBANN_ERROR("duplicate sample_ID: " + to_string(sample_id) + " in file: " + filename); + LBANN_ERROR("duplicate sample_ID: " + sample_id + " in file: " + filename); } string_to_index[sample_id] = hh++; } @@ -270,12 +270,12 @@ void build_index_maps( string sample_id; while (s >> sample_id) { if (sample_mapping[fn].find(sample_id) == sample_mapping[fn].end()) { - LBANN_ERROR("failed to find " + to_string(sample_id) + " in sample_mapping"); + LBANN_ERROR("failed to find " + sample_id + " in sample_mapping"); } index_map_exclude[fn].insert(string_to_index[sample_id]); } if (index_map_exclude[fn].size() != bad) { - LBANN_ERROR("exclude.size(): " + to_string(index_map_exclude[fn].size()) + " should be: " + to_string(bad) + " but isn't\n"); + LBANN_ERROR("exclude.size(): " + std::to_string(index_map_exclude[fn].size()) + " should be: " + std::to_string(bad) + " but isn't\n"); } int local_valid_index = 0; @@ -317,7 +317,7 @@ void sanity_test_request() { int num_lists = options::get()->get_int("num_lists"); int num_samples = samples_per_list * num_lists; if (num_samples > num_valid) { - LBANN_ERROR("you requested a total of " + to_string(num_samples) + " samples, but only " + to_string("num_valid") + " are available"); + LBANN_ERROR("you requested a total of " + std::to_string(num_samples) + " samples, but only " + std::to_string(num_valid) + " are available"); } } From 65fe6d3422da49f8a4274b1bc7529613a12968ac Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Sat, 17 Aug 2019 13:24:50 -0700 Subject: [PATCH 245/634] removed unneeded includes: conduit headers, and also lbann.hpp (which is just stupid, since it includes just about everything) --- model_zoo/jag_utils/select_samples.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index cf7b40af967..ef8d7f40e7d 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -7,11 +7,11 @@ #include #include #include -#include "lbann/lbann.hpp" -#include "conduit/conduit.hpp" -#include "conduit/conduit_relay.hpp" -#include "conduit/conduit_relay_io_hdf5.hpp" -#include "lbann/lbann.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/timer.hpp" +#include "lbann/utils/lbann_library.hpp" +#include "lbann/comm.hpp" using namespace std; using namespace lbann; From 7017ab56edfb393aa3cca4332516a7148ba026ec Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Sat, 17 Aug 2019 16:21:51 -0700 Subject: [PATCH 246/634] Refactored and added comments - Made some data structures in function calls const, to clearly indicate they are inputs - Added a few comments to (hopefully) more clearly indicate what the code is doing - Moved the does-output-directory-exist check from main into a function --- model_zoo/jag_utils/select_samples.cpp | 77 ++++++++++++++++---------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index ef8d7f40e7d..55964d11fa5 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -17,37 +17,45 @@ using namespace std; using namespace lbann; //============================================================================ -// sanity check the cmd line +// sanity checks the cmd line void check_cmd_line(); // returns the help message string help_msg(); +// tests that the output dir exists and is writable +void test_output_dir(); + // tests that there are sufficient samples to build the lists +// (i.e, num_lists*num_samples_per_list must not be greater than +// the total number of (successful) samples void sanity_test_request(); +// constructs various mappings from the mapping file void read_mapping_file( unordered_map> &sample_mapping, unordered_map> &sample_mapping_v, unordered_map& string_to_index); +// constructs various mappings from the index file void build_index_maps( - unordered_map> &sample_mapping, unordered_map> &index_map_keep, unordered_map> &index_map_exclude, unordered_map &string_to_index, unordered_map &filename_data); +// partition the sample IDs in index_map_keep into n sets; +// on entry, sets.size() = num_lists void divide_selected_samples( const unordered_map> &index_map_keep, vector>> &sets); -//todo: some of these should be const +// write the n-th sample list to file void write_sample_list( int n, - vector>> &subsets, - unordered_map> &sample_mapping_v, - std::unordered_map &filename_data); + const vector>> &subsets, + const unordered_map> &sample_mapping_v, + const std::unordered_map &filename_data); //============================================================================ int main(int argc, char **argv) { @@ -57,7 +65,7 @@ int main(int argc, char **argv) { try { - if (np!= 1) { + if (np != 1) { LBANN_ERROR("please run with a single processor"); } @@ -70,17 +78,11 @@ int main(int argc, char **argv) { return EXIT_FAILURE; } - // sanity checks + // check for proper invocation check_cmd_line(); // check that output directory exists and is writable - const string d = opts->get_string("output_dir") + "/ok_to_erase_me"; - ofstream testing(d.c_str()); - if (!testing) { - LBANN_ERROR("the output directory \"" + opts->get_string("output_dir") + "\" either doesn't exist or is not writable"); - } - testing.close(); - remove(d.c_str()); + test_output_dir(); // ensure we have enough samples to fullfill the requirements sanity_test_request(); @@ -99,15 +101,16 @@ int main(int argc, char **argv) { unordered_map> index_map_keep; unordered_map> index_map_exclude; std::unordered_map filename_data; - build_index_maps(sample_mapping, index_map_keep, index_map_exclude, string_to_index, filename_data); + build_index_maps(index_map_keep, index_map_exclude, string_to_index, filename_data); - // divide the selected samples into num_list sets + // partition the randomly selected samples into "num_lists" sets int num_lists = opts->get_int("num_lists"); vector>> subsets(num_lists); 
divide_selected_samples(index_map_keep, subsets); - const string output_dir = opts->get_string("output_dir"); - const string output_base = opts->get_string("output_base_fn"); + // write the sample lists +// const string output_dir = opts->get_string("output_dir"); +// const string output_base = opts->get_string("output_base_fn"); for (int n=0; n> &sample_mapp // set of indices (not sample_ids; that comes later!) that are to be // included and excluded void build_index_maps( - unordered_map> &sample_mapping, unordered_map> &index_map_keep, unordered_map> &index_map_exclude, unordered_map& string_to_index, @@ -268,10 +270,8 @@ void build_index_maps( index_map_exclude[fn]; index_map_keep[fn]; string sample_id; + while (s >> sample_id) { - if (sample_mapping[fn].find(sample_id) == sample_mapping[fn].end()) { - LBANN_ERROR("failed to find " + sample_id + " in sample_mapping"); - } index_map_exclude[fn].insert(string_to_index[sample_id]); } if (index_map_exclude[fn].size() != bad) { @@ -352,9 +352,9 @@ void divide_selected_samples( void write_sample_list( int n, - vector>> &subsets, - unordered_map> &sample_mapping_v, - std::unordered_map &filename_data) { + const vector>> &subsets, + const unordered_map> &sample_mapping_v, + const std::unordered_map &filename_data) { const string dir = options::get()->get_string("output_dir"); const string fn = options::get()->get_string("output_base_fn"); stringstream s; @@ -385,7 +385,11 @@ void write_sample_list( } // get total samples for the current file - stringstream s5(filename_data[filename]); + std::unordered_map::const_iterator t4 = filename_data.find(filename); + if (t4 == filename_data.end()) { + LBANN_ERROR("t4 == filename_data.end()"); + } + stringstream s5(t4->second); int good, bad; string fn2; s5 >> fn2 >> good >> bad; @@ -404,7 +408,14 @@ void write_sample_list( if (sample_mapping_v.find(fn2) == sample_mapping_v.end()) { LBANN_ERROR("failed to find the key: " + fn2 + " in sample_mapping_v map"); } - sout << " " << sample_mapping_v[fn2][t3]; + unordered_map>::const_iterator t5 = sample_mapping_v.find(fn2); + if (t5 == sample_mapping_v.end()) { + LBANN_ERROR("t5 == sample_mapping_v.end()"); + } + if (static_cast(t3) >= t5->second.size()) { + LBANN_ERROR("t3 >= t5->second.size()"); + } + sout << " " << t5->second[t3]; } sout << "\n"; } @@ -416,3 +427,13 @@ void write_sample_list( << "\n" << base_dir << "\n" << sout.str(); out.close(); } + +void test_output_dir() { + const string d = options::get()->get_string("output_dir") + "/ok_to_erase_me"; + ofstream testing(d.c_str()); + if (!testing) { + LBANN_ERROR("the output directory \"" + options::get()->get_string("output_dir") + "\" either doesn't exist or is not writable"); + } + testing.close(); + remove(d.c_str()); +} From a3b2d94bb95cd488f6c49dd8f0933b889e3453df Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Sat, 17 Aug 2019 16:38:23 -0700 Subject: [PATCH 247/634] Revised comments, and changed variable "data" to "filename_data" for consistency --- model_zoo/jag_utils/select_samples.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index 55964d11fa5..47c51db1ceb 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -87,18 +87,18 @@ int main(int argc, char **argv) { // ensure we have enough samples to fullfill the requirements sanity_test_request(); - // maps filename to { sample_ids } + // maps a sample_id filename to the set of sample IDs unordered_map> sample_mapping; - // maps filename to [ sample_ids ] + // maps a sample_id filename to a list of sample IDs unordered_map> sample_mapping_v; // maps a sampleID to a local idex unordered_map string_to_index; - // note: the above mappings contain sample IDs for all samples, - // whether successful or failed read_mapping_file(sample_mapping, sample_mapping_v, string_to_index); + // maps a samole_id filename to a set of randomly selected sample_ids unordered_map> index_map_keep; + // maps a samole_id filename to the set of sample_ids that have not been randomly selscted unordered_map> index_map_exclude; std::unordered_map filename_data; build_index_maps(index_map_keep, index_map_exclude, string_to_index, filename_data); @@ -206,7 +206,7 @@ void build_index_maps( unordered_map> &index_map_keep, unordered_map> &index_map_exclude, unordered_map& string_to_index, - unordered_map &data) { + unordered_map &filename_data) { cout << "starting build_index_maps\n"; double tm1 = get_time(); @@ -265,7 +265,7 @@ void build_index_maps( if (num_files % 1000 == 0) cerr << num_files/1000 << "K input lines processed\n"; stringstream s(line); s >> fn >> good >> bad; - data[fn] = line; + filename_data[fn] = line; const int total = good+bad; index_map_exclude[fn]; index_map_keep[fn]; From 27fdf65e10d9eadee9cc5f3a22a385f466f284d1 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Sat, 17 Aug 2019 17:04:11 -0700 Subject: [PATCH 248/634] Modified CMakeLists.txt to install some drivers in model_zoo/jag_utils in the install/bin directory --- model_zoo/jag_utils/CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/model_zoo/jag_utils/CMakeLists.txt b/model_zoo/jag_utils/CMakeLists.txt index d39b17d2099..794ee8e61b8 100644 --- a/model_zoo/jag_utils/CMakeLists.txt +++ b/model_zoo/jag_utils/CMakeLists.txt @@ -65,3 +65,13 @@ add_executable( convert-bin convert.cpp ) target_link_libraries(convert-bin lbann ) set_target_properties(convert-bin PROPERTIES OUTPUT_NAME convert) + +# Install the binaries +install( + TARGETS select_samples-bin build_sample_id_mapping-bin build_index-bin + EXPORT LBANNTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + ) From 0ba55c45a1b5e75c4051404583fe612d651ca239 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Sun, 18 Aug 2019 16:28:30 -0700 Subject: [PATCH 249/634] Removed "using namespace lbann"; added "using lbann::options There are very few other classes/functions that this driver uses from the lbann library; for those that are used, I prefaced them with lbann:: I prefer to retain "using namespace std" since, IMHO, it makes constructs like "vector>>" for more readable. 
An alternative is to write something like: using thingy_t = std::vector<unordered_map<string, unordered_set<int>>>; but when I do that I'm always forgetting, "um just what is thingy_t again?" --- model_zoo/jag_utils/select_samples.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index 47c51db1ceb..71f26bb7345 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -14,7 +14,7 @@ #include "lbann/comm.hpp" using namespace std; -using namespace lbann; +using lbann::options; //============================================================================ // sanity checks the cmd line @@ -59,8 +59,8 @@ void write_sample_list( //============================================================================ int main(int argc, char **argv) { - int random_seed = lbann_default_random_seed; - world_comm_ptr comm = initialize(argc, argv, random_seed); + int random_seed = lbann::lbann_default_random_seed; + lbann::world_comm_ptr comm = lbann::initialize(argc, argv, random_seed); int np = comm->get_procs_in_world(); try { @@ -118,7 +118,7 @@ int main(int argc, char **argv) { } catch (lbann::exception& e) { if (options::get()->get_bool("stack_trace_to_file")) { ostringstream ss("stack_trace"); - const auto& rank = get_rank_in_world(); + const auto& rank = lbann::get_rank_in_world(); if (rank >= 0) { ss << "_rank" << rank; } @@ -170,7 +170,7 @@ string help_msg() { void read_mapping_file(unordered_map<string, unordered_set<string>> &sample_mapping, unordered_map<string, vector<string>> &sample_mapping_v, unordered_map<string, int>& string_to_index) { cerr << "starting read_mapping_file\n"; - double tm1 = get_time(); + double tm1 = lbann::get_time(); const string mapping_fn = options::get()->get_string("mapping_fn"); ifstream in(mapping_fn.c_str()); string filename; @@ -195,7 +195,7 @@ void read_mapping_file(unordered_map> &sample_mapp } in.close(); - double tm2 = get_time() - tm1; + double tm2 = lbann::get_time() - tm1; cerr << " FINISHED reading sample mapping: num lines processed: " << n << "; time: " << tm2 << "\n"; } @@ -209,7 +209,7 @@ void build_index_maps( unordered_map<string, string> &filename_data) { cout << "starting build_index_maps\n"; - double tm1 = get_time(); + double tm1 = lbann::get_time(); int samples_per_list = options::get()->get_int("num_samples_per_list"); int num_lists = options::get()->get_int("num_lists"); @@ -237,7 +237,7 @@ void build_index_maps( cerr << "input index file contains " << num_valid << " valid samples\n"; cerr << "generating random indices ...\n"; - double tm2 = get_time(); + double tm2 = lbann::get_time(); unordered_set<int> random_indices; srandom(options::get()->get_int("random_seed")); while (true) { @@ -247,7 +247,7 @@ void build_index_maps( break; } } - cerr << " FINISHED generating random indices; time: " << get_time() - tm2 << endl; + cerr << " FINISHED generating random indices; time: " << lbann::get_time() - tm2 << endl; // loop over each entry from in input index file; determine which, if any, // local indices will be added to the INCLUSION index @@ -295,7 +295,7 @@ void build_index_maps( if (index_map_exclude.size() != index_map_keep.size()) { LBANN_ERROR("index_map_exclude.size() != index_map_keep.size()"); } - cout << " FINISHED build_index_maps; time: " << get_time() - tm1 << endl; + cout << " FINISHED build_index_maps; time: " << lbann::get_time() - tm1 << endl; } void sanity_test_request() {
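The trade-off the commit message describes can be made concrete. In the sketch below (illustrative only; sample_sets_t and make_subsets are made-up names, not from the patch), a using-declaration imports exactly one name, while the alias shortens the container type at the cost of the indirection the author mentions:

#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "lbann/utils/options.hpp"  // assumed location of lbann::options

using lbann::options;  // one name only; lbann::to_string stays qualified,
                       // so the std::to_string collision fixed in patch 244
                       // cannot recur

// The alias alternative from the message above.
using sample_sets_t =
    std::vector<std::unordered_map<std::string, std::unordered_set<int>>>;

sample_sets_t make_subsets(int num_lists) {
  return sample_sets_t(num_lists);  // same as the spelled-out declaration
}

From d7363622d1fd59e2911d06f77a4e6d94c41d91d1 Mon Sep 17 00:00:00 2001 From: "David A.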
Hysom" Date: Mon, 19 Aug 2019 09:11:53 -0700 Subject: [PATCH 250/634] Changed all print statements from cerr to cout. Revised the cmd line sanity check to print out any missing cmd line parameters --- model_zoo/jag_utils/select_samples.cpp | 81 ++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 10 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index 71f26bb7345..e016714ecbf 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -12,6 +12,9 @@ #include "lbann/utils/timer.hpp" #include "lbann/utils/lbann_library.hpp" #include "lbann/comm.hpp" +#include +#include +#include using namespace std; using lbann::options; @@ -23,7 +26,8 @@ void check_cmd_line(); // returns the help message string help_msg(); -// tests that the output dir exists and is writable +// tests that the output dir exists and is writable, +// and creates it if otherwise void test_output_dir(); // tests that there are sufficient samples to build the lists @@ -81,7 +85,8 @@ int main(int argc, char **argv) { // check for proper invocation check_cmd_line(); - // check that output directory exists and is writable + // check that output directory exists and is writable, + // and creates it if otherwise test_output_dir(); // ensure we have enough samples to fullfill the requirements @@ -109,12 +114,12 @@ int main(int argc, char **argv) { divide_selected_samples(index_map_keep, subsets); // write the sample lists -// const string output_dir = opts->get_string("output_dir"); -// const string output_base = opts->get_string("output_base_fn"); for (int n=0; nget_bool("stack_trace_to_file")) { ostringstream ss("stack_trace"); @@ -146,6 +151,28 @@ void check_cmd_line() { && opts->has_int("random_seed") && opts->has_string("output_dir") && opts->has_string("output_base_fn"))) { cout << help_msg(); + if (!opts->has_string("index_fn")) { + cout << "missing --index_fn= \n"; + } + if (!opts->has_string("mapping_fn")) { + cout << "missing --mapping_fn= \n"; + } + if (!opts->has_string("num_samples_per_list")) { + cout << "missing --num_samples_per_list= \n"; + } + if (!opts->has_string("num_lists")) { + cout << "missing --num_lists= \n"; + } + if (!opts->has_string("random_seed")) { + cout << "missing --random_seed= \n"; + } + if (!opts->has_string("output_dir")) { + cout << "missing --output_dir= \n"; + } + if (!opts->has_string("output_base_fn")) { + cout << "missing --output_base_fn= \n"; + } + cout << "\n"; exit(0); } } @@ -169,7 +196,7 @@ string help_msg() { } void read_mapping_file(unordered_map> &sample_mapping, unordered_map> &sample_mapping_v, unordered_map& string_to_index) { - cerr << "starting read_mapping_file\n"; + cout << "starting read_mapping_file\n"; double tm1 = lbann::get_time(); const string mapping_fn = options::get()->get_string("mapping_fn"); ifstream in(mapping_fn.c_str()); @@ -196,7 +223,7 @@ void read_mapping_file(unordered_map> &sample_mapp } in.close(); double tm2 = lbann::get_time() - tm1; - cerr << " FINISHED reading sample mapping: num lines processed: " << n << "; time: " << tm2 << "\n"; + cout << " FINISHED reading sample mapping: num lines processed: " << n << "; time: " << tm2 << "\n"; } // build two maps: > maps a filename to the @@ -234,9 +261,9 @@ void build_index_maps( string base_dir; getline(in, base_dir); options::get()->set_option("base_dir", base_dir); - cerr << "input index file contains " << num_valid << " valid samples\n"; + cout << "input index file contains " << num_valid << " valid 
samples\n"; - cerr << "generating random indices ...\n"; + cout << "generating random indices ...\n"; double tm2 = lbann::get_time(); unordered_set random_indices; srandom(options::get()->get_int("random_seed")); @@ -247,7 +274,9 @@ void build_index_maps( break; } } - cerr << " FINISHED generating random indices; time: " << lbann::get_time() - tm2 << endl; + cout << " FINISHED generating random indices; time: " << lbann::get_time() - tm2 << endl; + cout << "selecting samples based on random indices\n"; + double tm3 = lbann::get_time(); // loop over each entry from in input index file; determine which, if any, // local indices will be added to the INCLUSION index @@ -262,7 +291,7 @@ void build_index_maps( break; } ++num_files; - if (num_files % 1000 == 0) cerr << num_files/1000 << "K input lines processed\n"; + if (num_files % 1000 == 0) cout << num_files/1000 << "K input lines processed\n"; stringstream s(line); s >> fn >> good >> bad; filename_data[fn] = line; @@ -291,6 +320,7 @@ void build_index_maps( } first += good; } + cout << "FINISHED selecting samples based on random indices; time: " << lbann::get_time() - tm3 << endl; if (index_map_exclude.size() != index_map_keep.size()) { LBANN_ERROR("index_map_exclude.size() != index_map_keep.size()"); @@ -429,6 +459,37 @@ void write_sample_list( } void test_output_dir() { +/* + const string dir = options::get()->get_string("output_dir"); + struct stat buf; + int err = stat(dir.c_str(), &buf); + if (!err) { + cout << "output directory " << dir << " exists; next will test if it's writable" << endl; + + const string test_fn = options::get()->get_string("output_dir") + "/ok_to_eraseme"; + ofstream testing(test_fn.c_str()); + if (!testing) { + LBANN_ERROR("the output directory " << dir << " exists, but is not writable"); + } else { + testing.close(); + remove(d.c_str()); + // good to go! + return; + } + } + + // output dir doesn't exist, so attempt to create it + + + + + + cout << "stat for dir: " << dir << " is: " << err << endl; + err = stat("/", &buf); + cout << "stat for dir: /; " << err << endl; + exit(0); + +*/ const string d = options::get()->get_string("output_dir") + "/ok_to_erase_me"; ofstream testing(d.c_str()); if (!testing) { From f387cc969e94b6f742bc297b325f2910edfbca29 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Mon, 19 Aug 2019 16:33:21 -0700 Subject: [PATCH 251/634] silence gcc warning by initializing a variable --- src/data_store/data_store_conduit.cpp | 33 +++++++++++++-------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index b9b0e323e6b..be9c516b045 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -205,7 +205,7 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: const std::vector &names2 = rhs.m_data[i]["data"][names[0]].child_names(); for (auto t : names2) { n2[names[0]][t] = rhs.m_data[i]["data"][names[0]][t]; - } + } build_node_for_sending(n2, m_data[i]); } else { m_data[i] = rhs.m_data[i]; @@ -392,9 +392,9 @@ void data_store_conduit::exchange_data_by_super_node(size_t current_pos, size_t m_output << "m_minibatch_data.size(): " << m_minibatch_data.size() << "; indices: "; for (auto t : m_minibatch_data) { m_output << t.first << " "; - } + } m_output << std::endl; - } + } } void data_store_conduit::set_preloaded_conduit_node(int data_id, conduit::Node &node) { @@ -418,12 +418,12 @@ void data_store_conduit::set_preloaded_conduit_node(int data_id, conduit::Node & if (m_output) { m_output << "set_preloaded_conduit_node: " << data_id << " for super_node mode\n"; } - } else { + } else { if (m_output) { m_output << "set_preloaded_conduit_node: " << data_id << " is already in m_data\n"; } } - } + } } void data_store_conduit::error_check_compacted_node(const conduit::Node &nd, int data_id) { @@ -537,7 +537,7 @@ const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { m_output << "failed to find data_id: " << data_id << " in m_minibatch_data; my m_minibatch_data indices: "; for (auto t : m_minibatch_data) { m_output << t.first << " "; - } + } m_output << std::endl; } } @@ -591,7 +591,7 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s } /// exchange sample sizes if they are non-uniform (imagenet); - /// this will only be called once, during the first call to + /// this will only be called once, during the first call to /// exchange_data_by_sample at the beginning of the 2nd epoch, /// or during the first call th exchange_data_by_sample() during /// the first epoch if preloading @@ -831,7 +831,7 @@ void data_store_conduit::purge_unused_samples(const std::vector& indices) { void data_store_conduit::compact_nodes() { if (m_super_node) { return; - } + } for(auto&& j : *m_shuffled_indices) { if(m_data.find(j) != m_data.end()){ if(! 
(m_data[j].is_contiguous() && m_data[j].is_compact()) ) { @@ -1010,8 +1010,8 @@ bool data_store_conduit::has_conduit_node(int data_id) const { return t != m_data.end(); } -void data_store_conduit::set_shuffled_indices(const std::vector *indices) { - m_shuffled_indices = indices; +void data_store_conduit::set_shuffled_indices(const std::vector *indices) { + m_shuffled_indices = indices; } void data_store_conduit::exchange_sample_sizes() { @@ -1055,7 +1055,7 @@ void data_store_conduit::exchange_sample_sizes() { m_have_sample_sizes = true; } -void data_store_conduit::set_preload() { +void data_store_conduit::set_preload() { m_preload = true; } @@ -1172,7 +1172,7 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map } m_comm->trainer_barrier(); - int shm_fd; + int shm_fd = -1; if (node_id == 0) { shm_fd = shm_open(m_seg_name.c_str(), O_CREAT | O_RDWR | O_EXCL, 0666); @@ -1193,7 +1193,7 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map if (sanity != 0) { LBANN_ERROR("msync failed"); } - } + } m_comm->barrier(m_comm->get_node_comm()); @@ -1221,7 +1221,7 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map } void data_store_conduit::preload_local_cache() { - std::unordered_map file_sizes; + std::unordered_map file_sizes; std::vector> indices; double tm1 = get_time(); @@ -1233,7 +1233,7 @@ void data_store_conduit::preload_local_cache() { //that P_j will read from disk, and subsequently bcast to all others // //file_sizes maps an index to its file size - + if (m_world_master) std::cout << "calling allocate_shared_segment" << std::endl; allocate_shared_segment(file_sizes, indices); if (m_world_master) std::cout << " allocate_shared_segment time: " << (get_time()-tm1) << std::endl; @@ -1308,7 +1308,7 @@ void data_store_conduit::build_conduit_nodes(std::unordered_map &siz } void data_store_conduit::fillin_shared_images(const std::vector &images, size_t offset) { - memcpy(m_mem_seg+offset, reinterpret_cast(images.data()), images.size()); + memcpy(m_mem_seg+offset, reinterpret_cast(images.data()), images.size()); } void data_store_conduit::exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices) { @@ -1343,4 +1343,3 @@ void data_store_conduit::exchange_images(std::vector &work, std::unordered } // namespace lbann - From 8643d83d8643616affea50c3978c1572c2a3715c Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Tue, 20 Aug 2019 09:48:17 -0700 Subject: [PATCH 252/634] Modified to create output directories if they don't exist --- model_zoo/jag_utils/select_samples.cpp | 93 ++++++++++++++++---------- 1 file changed, 58 insertions(+), 35 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index e016714ecbf..b6f914972da 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -138,7 +138,6 @@ int main(int argc, char **argv) { return EXIT_FAILURE; } - return EXIT_SUCCESS; } @@ -458,43 +457,67 @@ void write_sample_list( out.close(); } +bool file_exists(const char *path) { + struct stat s; + int err = stat(path, &s); + if (err == -1) { + return false; + } + return true; +} + +void make_dir(char *cpath) { + cout << " path doesn't exist: " << strerror(errno) << endl; + cout << " attempting to create path\n"; + int err = mkdir(cpath, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IXUSR | S_IXGRP); + if (err) { + free(cpath); + LBANN_ERROR("mkdir failed for \"", cpath, "\"; please create this directory yourself, then rerun this program"); + cout << " mkdir failed: " << strerror(errno) << endl; + } else { + cout << " SUCCESS!\n"; + cout << " attempting to change permissions\n"; + err = chmod(cpath, S_ISGID | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IXUSR | S_IXGRP); + if (err) { + cout << " mkdir failed: " << strerror(errno) << endl; + } else { + cout << " SUCCESS!\n"; + } + } +} + void test_output_dir() { -/* + cout << "\nChecking if output diretory path exists;\n" + " if not, we'll attempt to create it.\n"; const string dir = options::get()->get_string("output_dir"); - struct stat buf; - int err = stat(dir.c_str(), &buf); - if (!err) { - cout << "output directory " << dir << " exists; next will test if it's writable" << endl; - - const string test_fn = options::get()->get_string("output_dir") + "/ok_to_eraseme"; - ofstream testing(test_fn.c_str()); - if (!testing) { - LBANN_ERROR("the output directory " << dir << " exists, but is not writable"); + char *cpath = strdup(dir.c_str()); + char *pp = cpath; + if (pp[0] == '/') { + ++pp; + } + char *sp; + int status = 0; + while (status == 0 && (sp = strchr(pp, '/')) != 0) { + if (sp != pp) { + *sp = '\0'; + cout << cpath << endl; + if (file_exists(cpath)) { + cout << " path exists\n"; + } else { + make_dir(cpath); + } + *sp = '/'; + } + pp = sp+1; + } + if (status == 0) { + cout << cpath << endl; + if (file_exists(cpath)) { + cout << " path exists\n"; } else { - testing.close(); - remove(d.c_str()); - // good to go! - return; + make_dir(cpath); } } - - // output dir doesn't exist, so attempt to create it - - - - - - cout << "stat for dir: " << dir << " is: " << err << endl; - err = stat("/", &buf); - cout << "stat for dir: /; " << err << endl; - exit(0); - -*/ - const string d = options::get()->get_string("output_dir") + "/ok_to_erase_me"; - ofstream testing(d.c_str()); - if (!testing) { - LBANN_ERROR("the output directory \"" + options::get()->get_string("output_dir") + "\" either doesn't exist or is not writable"); - } - testing.close(); - remove(d.c_str()); + free(cpath); + cout << endl; } From 5918194d0ed53d9a9f0e7a5d730373cf999662d6 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Tue, 20 Aug 2019 09:52:28 -0700 Subject: [PATCH 253/634] changed LBANN_ERROR occurrances to variadic format --- model_zoo/jag_utils/select_samples.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index b6f914972da..9874d281852 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -215,7 +215,7 @@ void read_mapping_file(unordered_map> &sample_mapp sample_mapping[filename].insert(sample_id); sample_mapping_v[filename].push_back(sample_id); if (string_to_index.find(sample_id) != string_to_index.end()) { - LBANN_ERROR("duplicate sample_ID: " + sample_id + " in file: " + filename); + LBANN_ERROR("duplicate sample_ID: ", sample_id, " in file: ", filename); } string_to_index[sample_id] = hh++; } @@ -245,7 +245,7 @@ void build_index_maps( const string index_fn = options::get()->get_string("index_fn").c_str(); ifstream in(index_fn.c_str()); if (!in) { - LBANN_ERROR("failed to open " + index_fn + " for reading"); + LBANN_ERROR("failed to open ", index_fn, " for reading"); } string line; @@ -303,7 +303,7 @@ void build_index_maps( index_map_exclude[fn].insert(string_to_index[sample_id]); } if (index_map_exclude[fn].size() != bad) { - LBANN_ERROR("exclude.size(): " + std::to_string(index_map_exclude[fn].size()) + " should be: " + std::to_string(bad) + " but isn't\n"); + LBANN_ERROR("exclude.size(): ", index_map_exclude[fn].size(), " should be: ", bad, " but isn't\n"); } int local_valid_index = 0; @@ -331,7 +331,7 @@ void sanity_test_request() { const string index_fn = options::get()->get_string("index_fn").c_str(); ifstream in(index_fn.c_str()); if (!in) { - LBANN_ERROR("failed to open " + index_fn + " for reading"); + LBANN_ERROR("failed to open ", index_fn, " for reading"); } string line; @@ -346,7 +346,7 @@ void sanity_test_request() { int num_lists = options::get()->get_int("num_lists"); int num_samples = samples_per_list * num_lists; if (num_samples > num_valid) { - LBANN_ERROR("you requested a total of " + std::to_string(num_samples) + " samples, but only " + std::to_string(num_valid) + " are available"); + LBANN_ERROR("you requested a total of ", num_samples, " samples, but only ", num_valid, " are available"); } } @@ -390,7 +390,7 @@ void write_sample_list( s << dir << '/' << "t" << n << '_' << fn; ofstream out(s.str().c_str()); if (!out) { - LBANN_ERROR("failed to open " + s.str() + " for writing"); + LBANN_ERROR("failed to open ", s.str(), " for writing"); } cout << "WRITING output file: " << s.str() << endl; @@ -410,7 +410,7 @@ void write_sample_list( } LBANN_ERROR(err.str()); #endif - LBANN_ERROR("filename_data.find(" + filename + ") failed"); + LBANN_ERROR("filename_data.find(", filename, ") failed"); } // get total samples for the current file @@ -435,7 +435,7 @@ void write_sample_list( sout << filename << " " << included << " " << excluded; for (auto &t3 : include_me) { if (sample_mapping_v.find(fn2) == sample_mapping_v.end()) { - LBANN_ERROR("failed to find the key: " + fn2 + " in sample_mapping_v map"); + LBANN_ERROR("failed to find the key: ", fn2, " in sample_mapping_v map"); } unordered_map>::const_iterator t5 = sample_mapping_v.find(fn2); if (t5 == sample_mapping_v.end()) { From 9ade9a4c085d9961eb43d9acdc7d194649b3c406 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Tue, 20 Aug 2019 10:16:10 -0700 Subject: [PATCH 254/634] install select_samples; simplify CMake targets --- model_zoo/jag_utils/CMakeLists.txt | 126 ++++++++++++++--------------- 1 file changed, 59 insertions(+), 67 deletions(-) diff --git a/model_zoo/jag_utils/CMakeLists.txt b/model_zoo/jag_utils/CMakeLists.txt index d39b17d2099..a02d1498cbf 100644 --- a/model_zoo/jag_utils/CMakeLists.txt +++ b/model_zoo/jag_utils/CMakeLists.txt @@ -1,67 +1,59 @@ - add_executable( build_index-bin build_index.cpp ) - target_link_libraries(build_index-bin lbann ) - set_target_properties(build_index-bin PROPERTIES OUTPUT_NAME build_index) - - add_executable( extract_random_samples-bin extract_random_samples.cpp ) - target_link_libraries(extract_random_samples-bin lbann ) - set_target_properties(extract_random_samples-bin PROPERTIES OUTPUT_NAME extract_random_samples) - - add_executable( dump_bundle-bin dump_bundle.cpp ) - target_link_libraries(dump_bundle-bin lbann ) - set_target_properties(dump_bundle-bin PROPERTIES OUTPUT_NAME dump_bundle) - - add_executable( check_images-bin check_images.cpp ) - target_link_libraries(check_images-bin lbann ) - set_target_properties(check_images-bin PROPERTIES OUTPUT_NAME check_images) - - add_executable( detect_corruption-bin detect_corruption.cpp ) - target_link_libraries(detect_corruption-bin lbann ) - set_target_properties(detect_corruption-bin PROPERTIES OUTPUT_NAME detect_corruption) - - add_executable( load_bundle2raw-bin load_bundle2raw.cpp ) - target_link_libraries(load_bundle2raw-bin lbann ) - set_target_properties(load_bundle2raw-bin PROPERTIES OUTPUT_NAME load_bundle2raw) - - add_executable( compute_min_max_images-bin compute_min_max_images.cpp ) - target_link_libraries(compute_min_max_images-bin lbann ) - set_target_properties(compute_min_max_images-bin PROPERTIES OUTPUT_NAME compute_min_max_images) - - add_executable( compute_per_channel_image_avg_min_max-bin compute_per_channel_image_avg_min_max.cpp ) - target_link_libraries(compute_per_channel_image_avg_min_max-bin lbann ) - set_target_properties(compute_per_channel_image_avg_min_max-bin PROPERTIES OUTPUT_NAME compute_per_channel_image_avg_min_max) - - add_executable( load_balance-bin load_balance.cpp ) - target_link_libraries(load_balance-bin lbann ) - set_target_properties(load_balance-bin PROPERTIES OUTPUT_NAME load_balance) - - add_executable( check_for_duplicate_samples-bin check_for_duplicate_samples.cpp ) - target_link_libraries(check_for_duplicate_samples-bin lbann ) - set_target_properties(check_for_duplicate_samples-bin PROPERTIES OUTPUT_NAME check_for_duplicate_samples) - - add_executable( test_conduit_hdf5-bin test_conduit_hdf5.cpp ) - target_link_libraries(test_conduit_hdf5-bin lbann ) - set_target_properties(test_conduit_hdf5-bin PROPERTIES OUTPUT_NAME test_conduit_hdf5) - - add_executable( select_samples-bin select_samples.cpp ) - target_link_libraries(select_samples-bin lbann ) - set_target_properties(select_samples-bin PROPERTIES OUTPUT_NAME select_samples) - - add_executable( build_sample_id_mapping-bin build_sample_id_mapping.cpp ) - target_link_libraries(build_sample_id_mapping-bin lbann ) - set_target_properties(build_sample_id_mapping-bin PROPERTIES OUTPUT_NAME build_sample_id_mapping) - - add_executable( generate_corrupt_samples-bin generate_corrupt_samples.cpp ) - target_link_libraries(generate_corrupt_samples-bin lbann ) - set_target_properties(generate_corrupt_samples-bin PROPERTIES OUTPUT_NAME generate_corrupt_samples) - - add_executable( 
compute_hydra_normalization-bin compute_hydra_normalization.cpp ) - target_link_libraries(compute_hydra_normalization-bin lbann ) - set_target_properties(compute_hydra_normalization-bin PROPERTIES OUTPUT_NAME compute_hydra_normalization) - - add_executable( test_reading_speed-bin test_reading_speed.cpp ) - target_link_libraries(test_reading_speed-bin lbann ) - set_target_properties(test_reading_speed-bin PROPERTIES OUTPUT_NAME test_reading_speed) - - add_executable( convert-bin convert.cpp ) - target_link_libraries(convert-bin lbann ) - set_target_properties(convert-bin PROPERTIES OUTPUT_NAME convert) +add_executable(build_index build_index.cpp) +target_link_libraries(build_index lbann) + +add_executable(extract_random_samples extract_random_samples.cpp) +target_link_libraries(extract_random_samples lbann) + +add_executable(dump_bundle dump_bundle.cpp) +target_link_libraries(dump_bundle lbann) + +add_executable(check_images check_images.cpp) +target_link_libraries(check_images lbann) + +add_executable(detect_corruption detect_corruption.cpp) +target_link_libraries(detect_corruption lbann) + +add_executable(load_bundle2raw load_bundle2raw.cpp) +target_link_libraries(load_bundle2raw lbann) + +add_executable(compute_min_max_images compute_min_max_images.cpp) +target_link_libraries(compute_min_max_images lbann) + +add_executable(compute_per_channel_image_avg_min_max compute_per_channel_image_avg_min_max.cpp) +target_link_libraries(compute_per_channel_image_avg_min_max lbann) + +add_executable(load_balance load_balance.cpp) +target_link_libraries(load_balance lbann) + +add_executable(check_for_duplicate_samples check_for_duplicate_samples.cpp) +target_link_libraries(check_for_duplicate_samples lbann) + +add_executable(test_conduit_hdf5 test_conduit_hdf5.cpp) +target_link_libraries(test_conduit_hdf5 lbann) + +add_executable(select_samples select_samples.cpp) +target_link_libraries(select_samples lbann) + +add_executable(build_sample_id_mapping build_sample_id_mapping.cpp) +target_link_libraries(build_sample_id_mapping lbann) + +add_executable(generate_corrupt_samples generate_corrupt_samples.cpp) +target_link_libraries(generate_corrupt_samples lbann) + +add_executable(compute_hydra_normalization compute_hydra_normalization.cpp) +target_link_libraries(compute_hydra_normalization lbann) + +add_executable(test_reading_speed test_reading_speed.cpp) +target_link_libraries(test_reading_speed lbann) + +add_executable(convert convert.cpp) +target_link_libraries(convert lbann) + +install( + TARGETS select_samples + EXPORT LBANNTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + ) From a9c7ee84e315131b03fe42234cfe66b8ab3426c9 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Tue, 20 Aug 2019 10:18:35 -0700 Subject: [PATCH 255/634] use keyworded target_link_libraries --- model_zoo/jag_utils/CMakeLists.txt | 34 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/model_zoo/jag_utils/CMakeLists.txt b/model_zoo/jag_utils/CMakeLists.txt index a02d1498cbf..021ff208cc6 100644 --- a/model_zoo/jag_utils/CMakeLists.txt +++ b/model_zoo/jag_utils/CMakeLists.txt @@ -1,53 +1,53 @@ add_executable(build_index build_index.cpp) -target_link_libraries(build_index lbann) +target_link_libraries(build_index PRIVATE lbann) add_executable(extract_random_samples extract_random_samples.cpp) -target_link_libraries(extract_random_samples lbann) +target_link_libraries(extract_random_samples PRIVATE lbann) add_executable(dump_bundle dump_bundle.cpp) -target_link_libraries(dump_bundle lbann) +target_link_libraries(dump_bundle PRIVATE lbann) add_executable(check_images check_images.cpp) -target_link_libraries(check_images lbann) +target_link_libraries(check_images PRIVATE lbann) add_executable(detect_corruption detect_corruption.cpp) -target_link_libraries(detect_corruption lbann) +target_link_libraries(detect_corruption PRIVATE lbann) add_executable(load_bundle2raw load_bundle2raw.cpp) -target_link_libraries(load_bundle2raw lbann) +target_link_libraries(load_bundle2raw PRIVATE lbann) add_executable(compute_min_max_images compute_min_max_images.cpp) -target_link_libraries(compute_min_max_images lbann) +target_link_libraries(compute_min_max_images PRIVATE lbann) add_executable(compute_per_channel_image_avg_min_max compute_per_channel_image_avg_min_max.cpp) -target_link_libraries(compute_per_channel_image_avg_min_max lbann) +target_link_libraries(compute_per_channel_image_avg_min_max PRIVATE lbann) add_executable(load_balance load_balance.cpp) -target_link_libraries(load_balance lbann) +target_link_libraries(load_balance PRIVATE lbann) add_executable(check_for_duplicate_samples check_for_duplicate_samples.cpp) -target_link_libraries(check_for_duplicate_samples lbann) +target_link_libraries(check_for_duplicate_samples PRIVATE lbann) add_executable(test_conduit_hdf5 test_conduit_hdf5.cpp) -target_link_libraries(test_conduit_hdf5 lbann) +target_link_libraries(test_conduit_hdf5 PRIVATE lbann) add_executable(select_samples select_samples.cpp) -target_link_libraries(select_samples lbann) +target_link_libraries(select_samples PRIVATE lbann) add_executable(build_sample_id_mapping build_sample_id_mapping.cpp) -target_link_libraries(build_sample_id_mapping lbann) +target_link_libraries(build_sample_id_mapping PRIVATE lbann) add_executable(generate_corrupt_samples generate_corrupt_samples.cpp) -target_link_libraries(generate_corrupt_samples lbann) +target_link_libraries(generate_corrupt_samples PRIVATE lbann) add_executable(compute_hydra_normalization compute_hydra_normalization.cpp) -target_link_libraries(compute_hydra_normalization lbann) +target_link_libraries(compute_hydra_normalization PRIVATE lbann) add_executable(test_reading_speed test_reading_speed.cpp) -target_link_libraries(test_reading_speed lbann) +target_link_libraries(test_reading_speed PRIVATE lbann) add_executable(convert convert.cpp) -target_link_libraries(convert lbann) +target_link_libraries(convert PRIVATE lbann) install( TARGETS select_samples From 9d767e89cfe11b38228d04818a9902fd7e3b67a2 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Tue, 20 Aug 2019 10:54:17 -0700 Subject: [PATCH 256/634] Modified to write bar file The bar file is an exclusion file; it contains lists of all sampless that are not in any of the other sample lists. --- model_zoo/jag_utils/select_samples.cpp | 79 ++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 10 deletions(-) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index 9874d281852..e0244aeac35 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -61,6 +61,10 @@ void write_sample_list( const unordered_map> &sample_mapping_v, const std::unordered_map &filename_data); +void write_exclusion_file( + const unordered_map> index_map_exclude, + const unordered_map> &sample_mapping_v, + const unordered_map &filename_data); //============================================================================ int main(int argc, char **argv) { int random_seed = lbann::lbann_default_random_seed; @@ -113,6 +117,8 @@ int main(int argc, char **argv) { vector>> subsets(num_lists); divide_selected_samples(index_map_keep, subsets); + write_exclusion_file(index_map_exclude, sample_mapping_v, filename_data); + // write the sample lists for (int n=0; n::const_iterator t4 = filename_data.find(filename); @@ -521,3 +517,66 @@ void test_output_dir() { free(cpath); cout << endl; } + + +void write_exclusion_file( + const unordered_map> index_map_exclude, + const unordered_map> &sample_mapping_v, + const unordered_map &filename_data +) { + const string dir = options::get()->get_string("output_dir"); + const string base_fn = options::get()->get_string("output_base_fn"); + stringstream s; + s << dir << '/' << "t_" << '_' << base_fn << "_bar"; + std::cerr << "\nWRITING output bar file: " << s.str() << "\n"; + std::ofstream out(s.str().c_str()); + if (!out) { + LBANN_ERROR("failed to open ", s.str(), " for writing\n"); + } + out<< "CONDUIT_HDF5_EXCLUSION\n"; + + std::stringstream sout; + size_t total_good = 0; + size_t total_bad = 0; + size_t num_include_files = 0; + + string fn; + int good; + int bad; + for (auto t : index_map_exclude) { + const string &filename = t.first; + + // get total samples for the current file + std::unordered_map::const_iterator t4 = filename_data.find(filename); + if (t4 == filename_data.end()) { + LBANN_ERROR("t4 == filename_data.end()"); + } + + std::stringstream s5(t4->second); + s5 >> fn >> good >> bad; + size_t total = good+bad; + + const std::unordered_set &exclude_me = t.second; + int excluded = exclude_me.size(); + int included = total - excluded; + if (included) { + ++num_include_files; + total_good += included; + total_bad += excluded; + sout << filename << " " << included << " " << excluded; + for (auto t3 : exclude_me) { + unordered_map>::const_iterator t5 = sample_mapping_v.find(fn); + if (t5 == sample_mapping_v.end()) { + LBANN_ERROR("t5 == sample_mapping_v.end())"); + } + sout << " " << t5->second[t3]; + } + sout << "\n"; + } + } + + const string base_dir = options::get()->get_string("base_dir"); + out << total_good << " " << total_bad << " " << num_include_files << "\n" + << base_dir << "\n" << sout.str(); + out.close(); +} From 8a1e50ed7c4c118a603f084a410e84650db5d228 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Tue, 20 Aug 2019 10:57:30 -0700 Subject: [PATCH 257/634] Removed build_trainer_lists.py build_trainer_lists.py was a wrapper that called ../select_samples once for each sample list. 
We now invoke select_samples directly; that code has been improved to be much faster. --- .../jag_utils/python/build_trainer_lists.py | 108 ------------------ 1 file changed, 108 deletions(-) delete mode 100644 model_zoo/jag_utils/python/build_trainer_lists.py diff --git a/model_zoo/jag_utils/python/build_trainer_lists.py b/model_zoo/jag_utils/python/build_trainer_lists.py deleted file mode 100644 index ac4ee05535f..00000000000 --- a/model_zoo/jag_utils/python/build_trainer_lists.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python -import os -import subprocess -import sys -import random -import time - - -def runme(cmd) : - print 'about to run system call:', cmd - t = cmd.split() - r = subprocess.check_call(t) - - -if len(sys.argv) < 8: - print '\nusage:', sys.argv[0], 'index_fn sample_mapping_fn num_samples num_lists output_dir output_base_name random_seed [HOST]' - print 'function: creates "num_lists" sample lists from index_fn;' - print ' each list will contain "num_samples." Each list is printed' - print ' to a separate file' - print - print 'if your environment doesn\'t contain HOST (e.g: $echo $HOST pascal83) then you' - print 'can specify HOST as the final cmd line param' - print - print 'example invocation, lassen:' - print ' $ build_trainer_lists.py /p/gpfs1/brainusr/datasets/10MJAG/1M_B/index.txt /p/gpfs1/brainusr/datasets/10MJAG/1M_B/id_mapping.txt 10000 4 /p/gpfs1/brainusr/datasets/10MJAG/1M_B sample_list 42\n' - print - print 'example invocation, lustre:' - print ' $ build_trainer_lists.py /p/lscratchh/brainusr/datasets/10MJAG/1M_B/index.txt /p/lscratchh/brainusr/datasets/10MJAG/1M_B/id_mapping.txt 10000 4 /p/lscratchh/brainusr/datasets/10MJAG/1M_B sample_list 42\n' - exit(9) - -# defaults; because who doesn't use gnu? -build = 'Release' -compiler = 'gnu' - -# this will fail if we're not running in an lbann repo -lbann_dir = subprocess.check_output(['git', 'rev-parse', '--show-toplevel'])[:-1] - -# get cluster name -host = '' -if len(sys.argv) == 9 : - host = sys.argv[8] -else : - try : - host = os.environ['HOST'] - except : - print '\nYour environment does not appear to contain the HOST variable;' - print 'therefore, please specify HOST as the final argument on the cmd line' - exit(9) - -cluster = '' -for x in os.environ['HOST'] : - if not x.isdigit() : - cluster += x - -index_fn = sys.argv[1] -mapping_fn = sys.argv[2] -num_samples = sys.argv[3] -num_lists = int(sys.argv[4]) -output_dir = sys.argv[5] -output_base_name = sys.argv[6] -seed = sys.argv[7] - -# get path to the c++ executable -exe = lbann_dir + '/build/' + compiler + '.' + build + '.' + cluster \ - + '.llnl.gov/lbann/build/model_zoo/jag_utils/select_samples' -cur_dir = os.getcwd() - -# seed the random number generator -random.seed(seed) - -first_fn = output_dir + '/t0_' + output_base_name + '.txt' -bar_fn = output_dir + '/t_' + output_base_name + '.txt_bar' - -print 'constructing trainer file # 0 ... please wait ...' -cmd = exe + ' --index_fn=' + index_fn + ' --sample_mapping_fn=' + mapping_fn \ - + ' --num_samples=' + num_samples + ' --output_fn=' + first_fn \ - + ' --random_seed=' + seed -runme(cmd) - -cmd = 'mv ' + first_fn + '_bar ' + bar_fn -runme(cmd) - -filenames = [] -filenames.append(first_fn) - -for j in range(1, num_lists) : - fn = output_dir + '/t' + str(j) + '_' + output_base_name + '.txt' - print 'constructing trainer file #', j, '... please wait ...' 
- - cmd = exe + ' --index_fn=' + bar_fn + ' --sample_mapping_fn=' + mapping_fn \ - + ' --num_samples=' + num_samples + ' --output_fn=' + fn \ - + ' --random_seed=' + seed - runme(cmd) - filenames.append(fn) - - cmd = 'mv ' + fn + '_bar ' + bar_fn - runme(cmd) -filenames.append(bar_fn) - -os.system('chgrp brain ' + output_dir + '/*') -os.system('chmod 660 ' + output_dir + '/*') - -print -print '=================================================================\n' -print 'generated these files:' -for f in filenames : - print f - From 4ab622de237f1c5f3e13aa5b16d0ba043b66326b Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 19 Aug 2019 14:43:50 -0700 Subject: [PATCH 258/634] Improve error reporting --- bamboo/common_python/tools.py | 23 +++++++++++++-- .../test_unit_check_proto_models.py | 5 +++- bamboo/unit_tests/test_unit_checkpoint.py | 29 ++++++++++++------- bamboo/unit_tests/test_unit_layer_clamp.py | 2 +- .../unit_tests/test_unit_layer_covariance.py | 2 +- bamboo/unit_tests/test_unit_layer_elu.py | 2 +- bamboo/unit_tests/test_unit_layer_identity.py | 2 +- bamboo/unit_tests/test_unit_layer_l1_norm.py | 2 +- bamboo/unit_tests/test_unit_layer_l2_norm2.py | 2 +- .../unit_tests/test_unit_layer_leaky_relu.py | 2 +- .../unit_tests/test_unit_layer_log_sigmoid.py | 2 +- .../unit_tests/test_unit_layer_log_softmax.py | 2 +- .../test_unit_layer_mean_absolute_error.py | 2 +- bamboo/unit_tests/test_unit_layer_relu.py | 2 +- bamboo/unit_tests/test_unit_layer_selu.py | 2 +- bamboo/unit_tests/test_unit_layer_sigmoid.py | 2 +- bamboo/unit_tests/test_unit_layer_softmax.py | 2 +- bamboo/unit_tests/test_unit_layer_softplus.py | 2 +- bamboo/unit_tests/test_unit_layer_softsign.py | 2 +- .../test_unit_layer_squared_difference.py | 2 +- .../unit_tests/test_unit_layer_tessellate.py | 2 +- bamboo/unit_tests/test_unit_layer_variance.py | 2 +- bamboo/unit_tests/test_unit_lbann2_reload.py | 14 ++++++--- .../unit_tests/test_unit_lbann_invocation.py | 12 ++++++-- .../unit_tests/test_unit_mnist_conv_graph.py | 2 +- .../test_unit_mnist_ridge_regression.py | 2 +- .../test_unit_mnist_softmax_classifier.py | 2 +- .../test_unit_reconstruction_loss.py | 2 +- 28 files changed, 86 insertions(+), 43 deletions(-) diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index f2bd74712d4..1ae665eec71 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -258,8 +258,8 @@ def get_command(cluster, # If data_reader_name is set, an exception will be raised later. option_data_reader = ' --reader=%s' % data_reader_path if optimizer_path is not None: - # If optimizer_name is set, an exception will be raised later. - option_optimizer_name = ' --optimizer=%s' % optimizer_path + # If optimizer_name is also set, an exception will be raised later. 
+ option_optimizer = ' --optimizer=%s' % optimizer_path if dir_name is not None: if model_path is not None: if (model_folder is not None) or (model_name is not None): @@ -549,3 +549,22 @@ def get_default_exes(default_dirname, cluster): print('default_exes={d}'.format(d=default_exes)) return default_exes + + +def assert_success(return_code, error_file_name): + if return_code != 0: + with open(error_file_name, 'r') as error_file: + error_line = '' + previous_line = '' + for line in error_file: + if 'ERROR' in line: + error_line = line + break + elif 'Stack trace:' in line: + error_line = previous_line + break + else: + previous_line = line + raise AssertionError( + 'return_code={rc}\n{el}\nSee {efn}'.format( + rc=return_code, el=error_line, efn=error_file_name)) diff --git a/bamboo/unit_tests/test_unit_check_proto_models.py b/bamboo/unit_tests/test_unit_check_proto_models.py index 4c66c2ae8de..287a275791e 100644 --- a/bamboo/unit_tests/test_unit_check_proto_models.py +++ b/bamboo/unit_tests/test_unit_check_proto_models.py @@ -115,7 +115,10 @@ def skeleton_models(cluster, dir_name, executables, compiler_name): print('Errors for: The following models exited with errors %s' % compiler_name) for model in defective_models: print(model) - assert num_defective == 0 + if num_defective != 0: + raise AssertionError( + 'num_defective={nd}\nDefective models:\n{dms}'.format( + nd=num_defective, dms=defective_models)) def test_unit_models_clang6(cluster, dirname, exes): diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index b3756453fc3..d46173a0a8d 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -28,8 +28,8 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') sys.exit(1) os.system('mkdir ckpt_lenet_shared') - no_ckpt = 'ckpt_lenet_shared/no_ckpt_{c}'.format(c=compiler_name) - os.system('mv ckpt {c}'.format(c=no_ckpt)) + no_ckpt_dir = 'ckpt_lenet_shared/no_ckpt_{c}'.format(c=compiler_name) + os.system('mv ckpt {c}'.format(c=no_ckpt_dir)) # Run to checkpoint, printing weights to files. 
output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_checkpoint_%s_output.txt' % (dir_name, compiler_name) @@ -61,9 +61,13 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') sys.exit(1) - diff_test = os.system('diff -rq ckpt {c}'.format(c=no_ckpt)) - os.system('mv ckpt ckpt_lenet_shared/ckpt_{c}'.format(c=compiler_name)) - assert diff_test == 0 + diff_test = os.system('diff -rq ckpt {c}'.format(c=no_ckpt_dir)) + ckpt_dir = 'ckpt_lenet_shared/ckpt_{c}'.format(c=compiler_name) + os.system('mv ckpt {c}'.format(c=ckpt_dir)) + path_prefix = '{d}/bamboo/unit_tests/'.format(d=dir_name) + if diff_test !=0: + raise AssertionError('diff_test={dt}\nCompare {ncd} and {cd} in {p}'.format( + dt=diff_test, ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix)) def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, @@ -89,8 +93,8 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') sys.exit(1) os.system('mkdir ckpt_lenet_distributed') - no_ckpt = 'ckpt_lenet_distributed/no_ckpt_{c}'.format(c=compiler_name) - os.system('mv ckpt {c}'.format(c=no_ckpt)) + no_ckpt_dir = 'ckpt_lenet_distributed/no_ckpt_{c}'.format(c=compiler_name) + os.system('mv ckpt {c}'.format(c=no_ckpt_dir)) # Run to checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_checkpoint_%s_output.txt' % (dir_name, compiler_name) @@ -122,9 +126,14 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') sys.exit(1) - diff_test = os.system('diff -rq ckpt {c}'.format(c=no_ckpt)) - os.system('mv ckpt ckpt_lenet_distributed/ckpt_{c}'.format(c=compiler_name)) - assert diff_test == 0 + diff_test = os.system('diff -rq ckpt {c}'.format(c=no_ckpt_dir)) + ckpt_dir = 'ckpt_lenet_distributed/ckpt_{c}'.format(c=compiler_name) + os.system('mv ckpt {c}'.format(c=ckpt_dir)) + path_prefix = '{d}/bamboo/unit_tests'.format(d=dir_name) + if diff_test != 0: + raise AssertionError( + 'diff_test={dt}\nCompare {ncd} and {cd} in {p}'.format( + dt=diff_test, ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix)) def test_unit_checkpoint_lenet_shared_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py b/bamboo/unit_tests/test_unit_layer_clamp.py index 73a4a48a87d..ddfddd5be8a 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -21,7 +21,7 @@ def skeleton_layer_clamp(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_clamp_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py index 8e6450495cc..ff10756dc5a 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -21,7 +21,7 @@ def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + 
tools.assert_success(return_code, error_file_name) def test_unit_layer_covariance_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index 06e50790d0a..8282974d850 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -21,7 +21,7 @@ def skeleton_layer_elu(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_elu_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py index f1695ff6dda..6212b317acd 100644 --- a/bamboo/unit_tests/test_unit_layer_identity.py +++ b/bamboo/unit_tests/test_unit_layer_identity.py @@ -21,7 +21,7 @@ def skeleton_layer_identity(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_identity_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index 1635895bfe1..6a7ae7e7d54 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -21,7 +21,7 @@ def skeleton_layer_l1_norm(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_l1_norm_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py b/bamboo/unit_tests/test_unit_layer_l2_norm2.py index b4d5eda45a5..5a13f0da5f3 100644 --- a/bamboo/unit_tests/test_unit_layer_l2_norm2.py +++ b/bamboo/unit_tests/test_unit_layer_l2_norm2.py @@ -21,7 +21,7 @@ def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_l2_norm2_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index 76551e0168d..ca8d02e245d 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -21,7 +21,7 @@ def skeleton_layer_leaky_relu(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_leaky_relu_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index 933a080a84e..dbbb0663d5d 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -21,7 +21,7 @@ def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, 
error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_log_sigmoid_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index cc89c82cb17..0be482f7701 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -22,7 +22,7 @@ def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_log_softmax_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index a04207d1a86..e32a08c77b4 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -21,7 +21,7 @@ def skeleton_layer_mean_absolute_error(cluster, executables, dir_name, compiler_ optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_mean_absolute_error_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index 4bd05c82f52..851c3137c2c 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -22,7 +22,7 @@ def skeleton_layer_relu(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_relu_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index 77be41d2283..8f1f7b69fb6 100644 --- a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -22,7 +22,7 @@ def skeleton_layer_selu(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_selu_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index d8143a558d4..c03895f425e 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -22,7 +22,7 @@ def skeleton_layer_sigmoid(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_sigmoid_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index 4ba2b8561bc..3e06ae2b890 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -22,7 +22,7 @@ def 
skeleton_layer_softmax(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_softmax_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py index 362b261f27f..fed22827d05 100644 --- a/bamboo/unit_tests/test_unit_layer_softplus.py +++ b/bamboo/unit_tests/test_unit_layer_softplus.py @@ -21,7 +21,7 @@ def skeleton_layer_softplus(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_softplus_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py index 1e262807486..878e4e4474b 100644 --- a/bamboo/unit_tests/test_unit_layer_softsign.py +++ b/bamboo/unit_tests/test_unit_layer_softsign.py @@ -21,7 +21,7 @@ def skeleton_layer_softsign(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_softsign_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py b/bamboo/unit_tests/test_unit_layer_squared_difference.py index f6deacdea6f..768cc93ff85 100644 --- a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -21,7 +21,7 @@ def skeleton_layer_squared_difference(cluster, executables, dir_name, compiler_n optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_squared_difference_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index 024ebab761a..4d788d005ca 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -21,7 +21,7 @@ def skeleton_layer_tessellate(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_tessellate_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index 20af21d60e1..bccafe90be9 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -22,7 +22,7 @@ def skeleton_layer_variance(cluster, executables, dir_name, compiler_name): optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_layer_variance_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index 
215f8cac960..5f39047d6ad 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -34,8 +34,8 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): sys.exit(1) os.system('mkdir ckpt_lbann2_reload') - no_ckpt = 'ckpt_lbann2_reload/lbann2_no_ckpt_{c}'.format(c=compiler_name) - os.system('mv lbann2_ckpt {c}'.format(c=no_ckpt)) + no_ckpt_dir = 'ckpt_lbann2_reload/lbann2_no_ckpt_{c}'.format(c=compiler_name) + os.system('mv lbann2_ckpt {c}'.format(c=no_ckpt_dir)) # Run to checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/lbann2_checkpoint_%s_output.txt' % (dir_name, compiler_name) @@ -75,7 +75,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): os.system('rm lbann2_ckpt/model0-epoch*') os.system('rm lbann2_nockpt/model0-epoch*') - diff_result = os.system('diff -rq lbann2_ckpt/ {c}'.format(c=no_ckpt)) + diff_result = os.system('diff -rq lbann2_ckpt/ {c}'.format(c=no_ckpt_dir)) allow_epsilon_diff = False if allow_epsilon_diff and (diff_result != 0): equal_within_epsilon = True @@ -109,7 +109,13 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): print(error_string) if equal_within_epsilon: diff_result = 0 - os.system('mv lbann2_ckpt ckpt_lbann2_reload/lbann2_ckpt_{c}'.format(c=compiler_name)) + ckpt_dir = 'ckpt_lbann2_reload/lbann2_ckpt_{c}'.format(c=compiler_name) + os.system('mv lbann2_ckpt {c}'.format(c=ckpt_dir)) + path_prefix = '{d}/bamboo/unit_tests'.format(d=dir_name) + if diff_result != 0: + raise AssertionError( + 'diff_test={dt}\nCompare {ncd} and {cd} in {p}'.format( + dt=diff_result, ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix)) assert diff_result == 0 diff --git a/bamboo/unit_tests/test_unit_lbann_invocation.py b/bamboo/unit_tests/test_unit_lbann_invocation.py index 5bde732c262..91ed3ca4fb9 100644 --- a/bamboo/unit_tests/test_unit_lbann_invocation.py +++ b/bamboo/unit_tests/test_unit_lbann_invocation.py @@ -109,7 +109,7 @@ def test_unit_bad_params(cluster, exes): # Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_should_work' --exes= -def test_unit_should_work(cluster, exes): +def test_unit_should_work(cluster, dirname, exes): if isinstance(exes, dict): exe = exes['gcc7'] else: @@ -118,10 +118,16 @@ def test_unit_should_work(cluster, exes): model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' data_reader_path = 'prototext/data_reader_mnist.prototext' optimizer_path = 'prototext/opt_sgd.prototext' + output_file_name = '{d}/bamboo/unit_tests/output/lbann_invocation_should_work_output.txt'.format( + d=dirname) + error_file_name = '{d}/bamboo/unit_tests/error/lbann_invocation_should_work_error.txt'.format( + d=dirname) command = tools.get_command( cluster=cluster, executable=exe, data_reader_path=data_reader_path, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', exit_after_setup=True, model_path=model_path, - optimizer_path=optimizer_path) + optimizer_path=optimizer_path, + output_file_name=output_file_name, + error_file_name=error_file_name) return_code = os.system(command) - assert return_code != 0 + tools.assert_success(return_code, error_file_name) diff --git a/bamboo/unit_tests/test_unit_mnist_conv_graph.py b/bamboo/unit_tests/test_unit_mnist_conv_graph.py index 6c6f45d6ca0..0c6a6610368 100644 --- a/bamboo/unit_tests/test_unit_mnist_conv_graph.py +++ b/bamboo/unit_tests/test_unit_mnist_conv_graph.py @@ -27,7 +27,7 @@ def 
skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name): output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_mnist_conv_graph_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py index 0c2b3e8df30..d89158deb52 100644 --- a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py +++ b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py @@ -21,7 +21,7 @@ def skeleton_mnist_ridge_regression(cluster, executables, dir_name, compiler_nam optimizer_name='adam', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_mnist_ridge_regression_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py index 8a018403867..ad289e839c3 100644 --- a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py +++ b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py @@ -21,7 +21,7 @@ def skeleton_mnist_softmax_classifier(cluster, executables, dir_name, compiler_n optimizer_name='adam', output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_mnist_softmax_classifier_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_reconstruction_loss.py b/bamboo/unit_tests/test_unit_reconstruction_loss.py index 951f9b46656..e7283893404 100644 --- a/bamboo/unit_tests/test_unit_reconstruction_loss.py +++ b/bamboo/unit_tests/test_unit_reconstruction_loss.py @@ -27,7 +27,7 @@ def skeleton_jag_reconstruction_loss(cluster, executables, dir_name, compiler_na output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_unit_jag_reconstruction_loss_clang6(cluster, exes, dirname): From ebd004add580061e23aa12c4eb31c6219434b6d8 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Wed, 21 Aug 2019 08:39:20 -0700 Subject: [PATCH 259/634] Modified sanity.py input params; added additional test --- model_zoo/jag_utils/python/README.txt | 10 ++--- model_zoo/jag_utils/python/sanity.py | 62 ++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/model_zoo/jag_utils/python/README.txt b/model_zoo/jag_utils/python/README.txt index d088b8f50c4..2de2507e544 100644 --- a/model_zoo/jag_utils/python/README.txt +++ b/model_zoo/jag_utils/python/README.txt @@ -1,7 +1,5 @@ -build_trainer_lists.py +sanity.py + script to test that sample lists generated by build_trainer_lists.py + contain unique indices. - This is a wrapper that calls the c++ code: - lbann/model_zoo/jag_utils/select_samples - - Function: generates a set of sample_list files - s.t, any sample_ID appears in at most one sample_list file. + usage: sanity.py id_mapping_fn bar_fn t0_fn [t1_fn, ...] 
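For reference, the exclusion ("bar") file that sanity.py consumes is the one
produced by write_exclusion_file in select_samples.cpp (patch 256). Its layout,
inferred from that writer, is:

    CONDUIT_HDF5_EXCLUSION
    <total_good> <total_bad> <num_include_files>
    <base_dir>
    <filename> <num_included> <num_excluded> [excluded sample IDs ...]
    (one such line per file that still has included samples)

A minimal parsing sketch under that assumption (Python 3; the function name and
path handling are hypothetical, not part of the patch):

    def parse_bar_file(path):
        with open(path) as f:
            # header written verbatim by write_exclusion_file
            assert f.readline().strip() == 'CONDUIT_HDF5_EXCLUSION'
            total_good, total_bad, num_include_files = \
                (int(x) for x in f.readline().split())
            base_dir = f.readline().strip()
            per_file = {}
            for line in f:
                t = line.split()
                if not t:
                    continue
                # <filename> <num_included> <num_excluded> <excluded IDs...>
                per_file[t[0]] = {'included': int(t[1]),
                                  'excluded': int(t[2]),
                                  'excluded_ids': t[3:]}
        return base_dir, (total_good, total_bad, num_include_files), per_file
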
diff --git a/model_zoo/jag_utils/python/sanity.py b/model_zoo/jag_utils/python/sanity.py index cc3a5a4e6b4..a8287471302 100644 --- a/model_zoo/jag_utils/python/sanity.py +++ b/model_zoo/jag_utils/python/sanity.py @@ -12,12 +12,25 @@ if len(sys.argv) == 1 : print ''' - usage: sanity.py id_mapping_fn bar_fn t0_fn [t1_fn, ...] + usage: sanity.py id_mapping_fn sample_list_dir sample_list_base_name num_sample_lists where: bar_fn, t0_fn, etc, are outputs from build_trainer_lists.py function: test that the intersection of the sample IDs in the - sample lists are empty.\n''' + sample lists are empty, and that every sample_ID + is in either one sample list or in the exclusion (bar) file\n + example usage: + python sanity.py \\ + /p/lustre2/brainusr/datasets/10MJAG/1M_A/id_mapping.txt \\ + /p/lustre2/brainusr/datasets/10MJAG/1M_A/select_samples_test/another_dir \\ + my_samples.txt \\ + 10 + + CAUTION: this script is fragile: it may break if/when model_zoo/jag_utils/select_samples.cpp is modified + ''' + + exit(9) +#====================================================================== def buildInc(mp, fn) : r = set() print 'buildInc; opening:', fn @@ -31,6 +44,7 @@ def buildInc(mp, fn) : r.add(j) return r +#====================================================================== def buildExc(mp, fn) : s = set() print 'buildExc; opening:', fn @@ -48,29 +62,55 @@ def buildExc(mp, fn) : r.add(sample_id) return r - +#====================================================================== +#build set that contains all sample names mp = set() a = open(sys.argv[1]) for line in a : t = line.split() for j in t[1:] : mp.add(j) -print '\nlen(map):', len(mp), '/n' +print '\nlen(map):', len(mp) + +sample_list_dir = sys.argv[2] +sample_list_base_name = sys.argv[3] + +#build exclusion set; this set contains all valid (successful) sample IDs +s2 = buildExc(mp, sample_list_dir + '/t__' + sample_list_base_name + '_bar') data = [] -s2 = buildExc(mp, sys.argv[2]) data.append(s2) -for j in range(3, len(sys.argv)) : - s2 = buildInc(mp, sys.argv[j]) +for j in range(int(sys.argv[4])) : + s2 = buildInc(mp, sample_list_dir + '/t' + str(j) + '_' + sample_list_base_name) data.append(s2) print len(s2) - print print '====================================================================' +print 'running intersection test ...' +success = True for j in range(0, len(data)-1) : for k in range(1, len(data)) : - a = data[j] - b = data[k] - print 'testing', sys.argv[j], 'against', sys.argv[k], '; len(intersection):', len(a.intersection(b)) + if j != k : + a = data[j] + b = data[k] + #print 'testing', j, 'against', k + r = len(a.intersection(b)) + if r != 0 : + print 'FAILED: ', j, 'intersection with',k, '=' , r + success = False +if success : + print ' SUCCESS!' + +print +print 'testing that all samples appear in one sample list, or the bar file' +s2 = set() +for j in range(0, len(data)) : + for sample_id in data[j] : + assert(sample_id in mp) + mp.remove(sample_id) +if len(mp) == 0 : + print ' SUCCESS!' +else : + print ' FAILED; len(mp)= ', len(mp), 'should be zero' From 366944463c29e80ee698e94e7b5ae88997145dd5 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Wed, 21 Aug 2019 10:04:58 -0700 Subject: [PATCH 260/634] update missed prototext --- bamboo/unit_tests/prototext/model_mnist_simple_1.prototext | 4 +++- bamboo/unit_tests/prototext/model_mnist_simple_2.prototext | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext b/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext index 77a1c7ed256..14511959031 100644 --- a/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext +++ b/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext @@ -33,11 +33,13 @@ model { # Callbacks ################################################### + summarizer { + dir: "." + } callback { print {} } callback { timer {} } callback { summary { - dir: "." mat_interval: 25 } } diff --git a/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext b/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext index c89c171566f..be9f91d62b2 100644 --- a/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext +++ b/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext @@ -33,11 +33,13 @@ model { # Callbacks ################################################### + summarizer { + dir: "." + } callback { print {} } callback { timer {} } callback { summary { - dir: "." mat_interval: 25 } } From 8b482b6ec337105dc82f04093543f50c5e96a14e Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Wed, 21 Aug 2019 10:38:12 -0700 Subject: [PATCH 261/634] Expand error reporting --- bamboo/common_python/tools.py | 4 +++- bamboo/compiler_tests/test_compiler.py | 14 +++++++------- bamboo/integration_tests/common_code.py | 14 +++++++------- bamboo/integration_tests/test_integration_debug.py | 6 ++---- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 1ae665eec71..d298ff506c4 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -557,7 +557,9 @@ def assert_success(return_code, error_file_name): error_line = '' previous_line = '' for line in error_file: - if 'ERROR' in line: + if ('ERROR' in line) or ('LBANN error' in line) or \ + ('Error:' in line) or \ + ('Expired or invalid job' in line): error_line = line break elif 'Stack trace:' in line: diff --git a/bamboo/compiler_tests/test_compiler.py b/bamboo/compiler_tests/test_compiler.py index bf3c58e3109..c66bcdb7462 100644 --- a/bamboo/compiler_tests/test_compiler.py +++ b/bamboo/compiler_tests/test_compiler.py @@ -1,6 +1,6 @@ -# import sys -# sys.path.insert(0, '../common_python') -# import tools +import sys +sys.path.insert(0, '../common_python') +import tools import pytest import os, re, subprocess @@ -22,7 +22,7 @@ def test_compiler_build_script(cluster, dirname): error_file = open(error_file_name, 'r') for line in error_file: print('%s: %s' % (error_file_name, line)) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def test_compiler_clang6_release(cluster, dirname): @@ -137,7 +137,7 @@ def spack_skeleton(dir_name, compiler, mpi_lib, debug, should_log): error_file = open(error_file_name, 'r') for line in error_file: print('%s: %s' % (error_file_name, line)) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) def build_skeleton(dir_name, compiler, debug, should_log): @@ -172,7 +172,7 @@ def build_skeleton(dir_name, compiler, debug, should_log): error_file = open(error_file_name, 'r') for line in error_file: print('%s: %s' % (error_file_name, line)) - assert return_code == 0 + 
tools.assert_success(return_code, error_file_name) def build_script(cluster, dirname, compiler, debug): @@ -196,4 +196,4 @@ def build_script(cluster, dirname, compiler, debug): error_file = open(error_file_name, 'r') for line in error_file: print('%s: %s' % (error_file_name, line)) - assert return_code == 0 + tools.assert_success(return_code, error_file_name) diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index d0d2aed6d7e..62e75133709 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -72,12 +72,12 @@ def run_lbann(command, model_name, output_file_name, error_file_name, print('About to run: %s' % command) print('%s began waiting in the queue at ' % model_name + time.strftime('%H:%M:%S', time.localtime())) - output_value = os.system(command) + return_code = os.system(command) print('%s finished at ' % model_name + time.strftime('%H:%M:%S', time.localtime())) lbann_exceptions = [] timed_out = False - if should_log or (output_value != 0): + if should_log or (return_code != 0): output_file = open(output_file_name, 'r') for line in output_file: print('%s: %s' % (output_file_name, line)) @@ -94,13 +94,13 @@ def run_lbann(command, model_name, output_file_name, error_file_name, is_match = re.search('LBANN error on (.*)', line) if is_match: lbann_exceptions.append(is_match.group(1)) - if output_value != 0: - error_string = ('Model %s crashed with output_value=%d, timed_out=%s,' + if return_code != 0: + error_string = ('Model %s crashed with return_code=%d, timed_out=%s,' ' and lbann exceptions=%s. Command was: %s') % ( - model_name, output_value, str(timed_out), + model_name, return_code, str(timed_out), str(collections.Counter(lbann_exceptions)), command) - raise Exception(error_string) - return output_value + print(error_string) + tools.assert_success(return_code, error_file_name) # Extract data from output #################################################### diff --git a/bamboo/integration_tests/test_integration_debug.py b/bamboo/integration_tests/test_integration_debug.py index 0d4d57b9701..9f64e821e2e 100644 --- a/bamboo/integration_tests/test_integration_debug.py +++ b/bamboo/integration_tests/test_integration_debug.py @@ -26,8 +26,7 @@ def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, data_reader_name='mnist', model_folder='models/' + model_name, model_name=model_name, num_epochs=5, optimizer_name='adagrad', output_file_name=output_file_name, error_file_name=error_file_name) - output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) - assert output_value == 0 + common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, @@ -56,8 +55,7 @@ def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, data_reader_name='cifar10', data_reader_percent=0.01, model_folder='models/' + model_name, model_name='conv_' + model_name, num_epochs=5, optimizer_name='adagrad', output_file_name=output_file_name, error_file_name=error_file_name) - output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) - assert output_value == 0 + common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) def test_integration_mnist_clang6_debug(cluster, dirname, exes, weekly, debug_build): From d53efc440a5118686e6a818675fc821426151bf8 Mon Sep 17 
00:00:00 2001 From: "David A. Hysom" Date: Thu, 22 Aug 2019 19:27:50 -0700 Subject: [PATCH 262/634] bug fix (for bug that Brian discovered) --- model_zoo/jag_utils/select_samples.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index e0244aeac35..fb0f93e4a7f 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -370,16 +370,24 @@ void divide_selected_samples( sets[which][filename].insert(it2); ++total; ++count; + /* if (count == samples_per_list) { count = 0; ++which; } + */ + ++which; + if (which == sets.size()) { + which = 0; + } } } +/* if (which != sets.size()) { LBANN_ERROR("which != sets.size()"); } + */ if (total != samples_per_list * sets.size()) { LBANN_ERROR("samples_per_list * sets.size()"); } From 59ee7b986ab2554dcd54b701b8c2ce93eeba907e Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 22 Aug 2019 11:12:55 -0700 Subject: [PATCH 263/634] Test updates --- bamboo/allocate_and_run.sh | 2 +- bamboo/common_python/tools.py | 57 ++++-- .../prototext/data_reader_mnist.prototext | 64 ------- .../prototext/model_mnist_simple_1.prototext | 124 ------------- .../prototext/model_mnist_simple_2.prototext | 140 --------------- bamboo/unit_tests/prototext/opt_sgd.prototext | 7 - .../unit_tests/test_unit_lbann_invocation.py | 168 ++++++++++++------ 7 files changed, 159 insertions(+), 403 deletions(-) delete mode 100644 bamboo/unit_tests/prototext/data_reader_mnist.prototext delete mode 100644 bamboo/unit_tests/prototext/model_mnist_simple_1.prototext delete mode 100644 bamboo/unit_tests/prototext/model_mnist_simple_2.prototext delete mode 100644 bamboo/unit_tests/prototext/opt_sgd.prototext diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index a5944d5eb8e..2da19c8cf78 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -52,7 +52,7 @@ elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTE fi else if [ "${CLUSTER}" = 'catalyst' ]; then - ALLOCATION_TIME_LIMIT=240 + ALLOCATION_TIME_LIMIT=300 elif [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then ALLOCATION_TIME_LIMIT=660 fi diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index d298ff506c4..cbb08c0efe2 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -551,22 +551,45 @@ def get_default_exes(default_dirname, cluster): return default_exes +def get_error_line(error_file_name): + with open(error_file_name, 'r') as error_file: + error_line = '' + previous_line = '' + for line in error_file: + if ('ERROR' in line) or ('LBANN error' in line) or \ + ('Error:' in line) or \ + ('Expired or invalid job' in line): + error_line = line + break + elif 'Stack trace:' in line: + error_line = previous_line + break + else: + previous_line = line + return error_line + + def assert_success(return_code, error_file_name): if return_code != 0: - with open(error_file_name, 'r') as error_file: - error_line = '' - previous_line = '' - for line in error_file: - if ('ERROR' in line) or ('LBANN error' in line) or \ - ('Error:' in line) or \ - ('Expired or invalid job' in line): - error_line = line - break - elif 'Stack trace:' in line: - error_line = previous_line - break - else: - previous_line = line - raise AssertionError( - 'return_code={rc}\n{el}\nSee {efn}'.format( - rc=return_code, el=error_line, efn=error_file_name)) + error_line = get_error_line(error_file_name) + raise AssertionError( 
+ 'return_code={rc}\n{el}\nSee {efn}'.format( + rc=return_code, el=error_line, efn=error_file_name)) + + +def assert_failure(return_code, expected_error, error_file_name): + if return_code == 0: + raise AssertionError( + 'return_code={rc}\nSuccess when expecting failure.\nSee {efn}'.format( + rc=return_code, efn=error_file_name)) + with open(error_file_name, 'r') as error_file: + for line in error_file: + if expected_error in line: + return True + # If we're at this point, then we know the test did not succeed, + # but we didn't get the expected error. + actual_error = get_error_line(error_file_name) + raise AssertionError( + 'return_code={rc}\nFailed with error different than expected.\nactual_error={ae}\nexpected_error={ee}\nSee {efn}'.format( + rc=return_code, ae=actual_error, ee=expected_error, + efn=error_file_name)) diff --git a/bamboo/unit_tests/prototext/data_reader_mnist.prototext b/bamboo/unit_tests/prototext/data_reader_mnist.prototext deleted file mode 100644 index 9d2e2663202..00000000000 --- a/bamboo/unit_tests/prototext/data_reader_mnist.prototext +++ /dev/null @@ -1,64 +0,0 @@ -data_reader { - reader { - name: "mnist" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "train-images-idx3-ubyte" - label_filename: "train-labels-idx1-ubyte" - validation_percent: 0.1 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - image_preprocessor { - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 - } - } - } - reader { - name: "mnist" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "t10k-images-idx3-ubyte" - label_filename: "t10k-labels-idx1-ubyte" - validation_percent: 1.0 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - image_preprocessor { - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: true - factor: 0.0 - } - } - } -} diff --git a/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext b/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext deleted file mode 100644 index 14511959031..00000000000 --- a/bamboo/unit_tests/prototext/model_mnist_simple_1.prototext +++ /dev/null @@ -1,124 +0,0 @@ -model { - data_layout: "data_parallel" - mini_batch_size: 64 - block_size: 256 - num_epochs: 3 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "cross_entropy" } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { - layer_metric { - name: "categorical accuracy" - layer: "accuracy" - unit: "%" - } - } - - ################################################### - # Callbacks - ################################################### - - summarizer { - dir: "." 
- } - callback { print {} } - callback { timer {} } - callback { - summary { - mat_interval: 25 - } - } - callback { - adaptive_learning_rate { - patience: 4 - amt: 0.1 - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - children: "image label" - data_layout: "data_parallel" - input {} - } - layer { - parents: "data" - name: "image" - data_layout: "data_parallel" - split {} - } - layer { - parents: "data" - name: "label" - data_layout: "data_parallel" - split {} - } - - layer { - parents: "image" - name: "ip1" - data_layout: "model_parallel" - fully_connected { - num_neurons: 500 - has_bias: true - } - } - - layer { - parents: "ip1" - name: "relu1" - data_layout: "model_parallel" - relu {} - } - - layer { - parents: "relu1" - name: "ip2" - data_layout: "model_parallel" - fully_connected { - num_neurons: 10 - has_bias: true - } - } - - layer { - parents: "ip2" - name: "prob" - data_layout: "data_parallel" - softmax {} - } - - layer { - parents: "prob label" - name: "cross_entropy" - data_layout: "data_parallel" - cross_entropy {} - } - - layer { - parents: "prob label" - name: "accuracy" - data_layout: "data_parallel" - categorical_accuracy {} - } - -} diff --git a/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext b/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext deleted file mode 100644 index be9f91d62b2..00000000000 --- a/bamboo/unit_tests/prototext/model_mnist_simple_2.prototext +++ /dev/null @@ -1,140 +0,0 @@ -model { - data_layout: "data_parallel" - mini_batch_size: 64 - block_size: 256 - num_epochs: 3 - num_parallel_readers: 0 - procs_per_trainer: 0 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "cross_entropy" } - l2_weight_regularization { - scale_factor: 1e-4 - } - } - - ################################################### - # Metrics - ################################################### - - metric { - layer_metric { - name: "categorical accuracy" - layer: "accuracy" - unit: "%" - } - } - - ################################################### - # Callbacks - ################################################### - - summarizer { - dir: "." 
- } - callback { print {} } - callback { timer {} } - callback { - summary { - mat_interval: 25 - } - } - callback { - adaptive_learning_rate { - patience: 4 - amt: 0.1 - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - children: "image label" - data_layout: "data_parallel" - input {} - } - layer { - parents: "data" - name: "image" - data_layout: "data_parallel" - split {} - } - layer { - parents: "data" - name: "label" - data_layout: "data_parallel" - split {} - } - - layer { - parents: "image" - name: "ip1" - data_layout: "model_parallel" - fully_connected { - num_neurons: 500 - has_bias: true - } - } - - layer { - parents: "ip1" - name: "relu1" - data_layout: "model_parallel" - relu {} - } - - layer { - parents: "relu1" - name: "ip3" - data_layout: "model_parallel" - fully_connected { - num_neurons: 500 - has_bias: true - } - } - - layer { - parents: "ip3" - name: "relu3" - data_layout: "model_parallel" - relu {} - } - layer { - parents: "relu3" - name: "ip2" - data_layout: "model_parallel" - fully_connected { - num_neurons: 10 - has_bias: true - } - } - - layer { - parents: "ip2" - name: "prob" - data_layout: "data_parallel" - softmax {} - } - - layer { - parents: "prob label" - name: "cross_entropy" - data_layout: "data_parallel" - cross_entropy {} - } - - layer { - parents: "prob label" - name: "accuracy" - data_layout: "data_parallel" - categorical_accuracy {} - } - -} diff --git a/bamboo/unit_tests/prototext/opt_sgd.prototext b/bamboo/unit_tests/prototext/opt_sgd.prototext deleted file mode 100644 index 8d066780476..00000000000 --- a/bamboo/unit_tests/prototext/opt_sgd.prototext +++ /dev/null @@ -1,7 +0,0 @@ -optimizer { - sgd { - learn_rate: 0.01 - momentum: 0.9 - nesterov: false - } -} diff --git a/bamboo/unit_tests/test_unit_lbann_invocation.py b/bamboo/unit_tests/test_unit_lbann_invocation.py index 91ed3ca4fb9..51985037dec 100644 --- a/bamboo/unit_tests/test_unit_lbann_invocation.py +++ b/bamboo/unit_tests/test_unit_lbann_invocation.py @@ -1,111 +1,183 @@ import sys sys.path.insert(0, '../common_python') import tools -import os, sys +import os + + +def get_default_parameters(dir_name, two_models=True): + data_reader_path = '{d}/model_zoo/data_readers/data_reader_mnist.prototext'.format( + d=dir_name) + model_path = '{d}/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext'.format( + d=dir_name) + if two_models: + model_path = '{{{mp},{mp}}}'.format(mp=model_path) + optimizer_path = '{d}/model_zoo/optimizers/opt_sgd.prototext'.format( + d=dir_name) + return data_reader_path, model_path, optimizer_path + + +def get_file_names(dir_name, test_name): + output_file_name = '{d}/bamboo/unit_tests/output/lbann_invocation_{t}_output.txt'.format( + d=dir_name, t=test_name) + error_file_name = '{d}/bamboo/unit_tests/error/lbann_invocation_{t}_error.txt'.format( + d=dir_name, t=test_name) + return output_file_name, error_file_name # Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_no_params_bad' --exes= -def test_unit_no_params_bad(cluster, exes): +def test_unit_no_params_bad(cluster, dirname, exes): if isinstance(exes, dict): exe = exes['gcc7'] else: exe = exes - sys.stderr.write('TESTING: run lbann with no params; lbann should throw exception\n') + print('TESTING: run lbann with no params; lbann should throw exception\n') + (output_file_name, error_file_name) = get_file_names(dirname, 'no_params_bad') command = tools.get_command( - cluster=cluster, 
executable=exe, exit_after_setup=True) + cluster=cluster, executable=exe, + exit_after_setup=True, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'Failed to load any prototext files', + error_file_name) # Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_one_model_bad' --exes= -def test_unit_one_model_bad(cluster, exes): +def test_unit_one_model_bad(cluster, dirname, exes): if isinstance(exes, dict): exe = exes['gcc7'] else: exe = exes - sys.stderr.write('TESTING: run lbann with no optimizer or reader; lbann should throw exception\n') - model_path = 'prototext/model_mnist_simple_1.prototext' + print('TESTING: run lbann with no optimizer or reader; lbann should throw exception\n') + (_, model_path, _) = get_default_parameters(dirname, two_models=False) + (output_file_name, error_file_name) = get_file_names(dirname, 'one_model_bad') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True, - model_path=model_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 1 model filenames, and 0 optimizer filenames; you must specify either one or 1 optimizer filenames', + error_file_name) # Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_two_models_bad' --exes= -def test_unit_two_models_bad(cluster, exes): +def test_unit_two_models_bad(cluster, dirname, exes): if isinstance(exes, dict): exe = exes['gcc7'] else: exe = exes - sys.stderr.write('TESTING: run lbann with two models but no optimizer or reader; lbann should throw exception\n') - model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' + print('TESTING: run lbann with two models but no optimizer or reader; lbann should throw exception\n') + (_, model_path, _) = get_default_parameters(dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'two_models_bad') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True, - model_path=model_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 2 model filenames, and 0 optimizer filenames; you must specify either one or 2 optimizer filenames', + error_file_name) # Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_two_models_bad2' --exes= -def test_unit_two_models_bad2(cluster, exes): +def test_unit_two_models_bad2(cluster, dirname, exes): if isinstance(exes, dict): exe = exes['gcc7'] else: exe = exes - sys.stderr.write('TESTING: run lbann with two models with missing {; lbann should throw exception\n') - model_path='prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' + print('TESTING: run lbann with two models with missing {; lbann should throw exception\n') + (_, model_path, _) = get_default_parameters(dirname, two_models=False) + model_path = '{mp},{mp}}}'.format(mp=model_path) + (output_file_name, error_file_name) = get_file_names(dirname, 'two_models_bad2') command = tools.get_command( - cluster=cluster, 
executable=exe, exit_after_setup=True, - model_path=model_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + "possibly you left out '{' or '}' or both ??", + error_file_name) # Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_missing_optimizer' --exes= -def test_unit_missing_optimizer(cluster, exes): +def test_unit_missing_optimizer(cluster, dirname, exes): if isinstance(exes, dict): exe = exes['gcc7'] else: exe = exes - sys.stderr.write('TESTING: run lbann with two models, reader, but no optimizer; lbann should throw exception\n') - model_path='{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' - data_reader_path='prototext/data_reader_mnist.prototext' + print('TESTING: run lbann with two models, reader, but no optimizer; lbann should throw exception\n') + (data_reader_path, model_path, _) = get_default_parameters(dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'missing_optimizer') command = tools.get_command( - cluster=cluster, executable=exe, data_reader_path=data_reader_path, + cluster=cluster, executable=exe, + data_reader_path=data_reader_path, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - exit_after_setup=True, model_path=model_path) + exit_after_setup=True, model_path=model_path, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 2 model filenames, and 0 optimizer filenames; you must specify either one or 2 optimizer filenames', + error_file_name) # Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_missing_reader' --exes= -def test_unit_missing_reader(cluster, exes): +def test_unit_missing_reader(cluster, dirname, exes): if isinstance(exes, dict): exe = exes['gcc7'] else: exe = exes - sys.stderr.write('TESTING: run lbann with two models, reader, but no reader; lbann should throw exception\n') - model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' - optimizer_path = 'prototext/opt_sgd.prototext' + print('TESTING: run lbann with two models, reader, but no reader; lbann should throw exception\n') + (_, model_path, optimizer_path) = get_default_parameters(dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'missing_reader') command = tools.get_command( - cluster=cluster, executable=exe, exit_after_setup=True, - model_path=model_path, optimizer_path=optimizer_path) + cluster=cluster, executable=exe, + exit_after_setup=True, + model_path=model_path, optimizer_path=optimizer_path, + output_file_name=output_file_name, + error_file_name=error_file_name + ) return_code = os.system(command) - assert return_code != 0 + tools.assert_failure(return_code, + 'you specified 2 model filenames, and 0 reader filenames; you must specify either one or 2 reader filenames', + error_file_name) # Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_bad_params' --exes= -def test_unit_bad_params(cluster, exes): +def test_unit_bad_params(cluster, dirname, exes): if isinstance(exes, dict): exe = exes['gcc7'] else: exe = exes - sys.stderr.write('TESTING: run lbann with ill-formed param (missing -) lbann should throw exception\n') - (command_allocate, 
command_run, _, _) = tools.get_command(cluster=cluster, executable=exe, return_tuple=True) - command_string = '%s%s %s -exit_after_setup --reader=prototext/data_reader_mnist.prototext --model={prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext} --optimizer=prototext/opt_sgd.prototext' % (command_allocate, command_run, exe) + print('TESTING: run lbann with ill-formed param (exit_after_setup should have `--` not `-`) lbann should throw exception\n') + (data_reader_path, model_path, optimizer_path) = get_default_parameters( + dirname) + (command_allocate, command_run, _, _) = tools.get_command( + cluster=cluster, executable=exe, + return_tuple=True) + (output_file_name, error_file_name) = get_file_names(dirname, 'bad_params') + command_string = '{ca}{cr} {e} -exit_after_setup --reader={d} --model={m} --optimizer={o} > {ofn} 2> {efn}'.format( + ca=command_allocate, cr=command_run, e=exe, + d=data_reader_path, m=model_path, o=optimizer_path, + ofn=output_file_name, efn=error_file_name + ) return_code = os.system(command_string) - assert return_code != 0 + tools.assert_failure(return_code, + "badly formed cmd line param; must begin with '--': -exit_after_setup", + error_file_name) # Run with python3 -m pytest -s test_unit_lbann_invocation.py -k 'test_unit_should_work' --exes= @@ -114,14 +186,10 @@ def test_unit_should_work(cluster, dirname, exes): exe = exes['gcc7'] else: exe = exes - sys.stderr.write('TESTING: run lbann with two models, reader, and optimizer; lbann should NOT throw exception\n') - model_path = '{prototext/model_mnist_simple_1.prototext,prototext/model_mnist_simple_1.prototext}' - data_reader_path = 'prototext/data_reader_mnist.prototext' - optimizer_path = 'prototext/opt_sgd.prototext' - output_file_name = '{d}/bamboo/unit_tests/output/lbann_invocation_should_work_output.txt'.format( - d=dirname) - error_file_name = '{d}/bamboo/unit_tests/error/lbann_invocation_should_work_error.txt'.format( - d=dirname) + print('TESTING: run lbann with two models, reader, and optimizer; lbann should NOT throw exception\n') + (data_reader_path, model_path, optimizer_path) = get_default_parameters( + dirname) + (output_file_name, error_file_name) = get_file_names(dirname, 'should_work') command = tools.get_command( cluster=cluster, executable=exe, data_reader_path=data_reader_path, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', From 023b7f84523912a442af47e98f7bba1552a28e4b Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 22 Aug 2019 16:35:58 -0700 Subject: [PATCH 264/634] Remove error printing Now that artifacts on Bamboo are set up, we can remove code to print error files to the log. 
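
Taken together, patches 258, 261, 263, and this one converge on a single
failure-reporting idiom for the Bamboo suites: tests stop asserting on raw
return codes and printing whole logs, and instead route everything through
tools.assert_success / tools.assert_failure, which surface only the first
relevant error line from the error artifact. A minimal sketch of a test written
in that style (the test itself is hypothetical; tools.get_command and
tools.assert_success are the helpers shown in the diffs above):

    import os
    import sys
    sys.path.insert(0, '../common_python')
    import tools

    def test_unit_example(cluster, dirname, exes):
        exe = exes['gcc7'] if isinstance(exes, dict) else exes
        output_file_name = '{d}/bamboo/unit_tests/output/example_output.txt'.format(d=dirname)
        error_file_name = '{d}/bamboo/unit_tests/error/example_error.txt'.format(d=dirname)
        command = tools.get_command(
            cluster=cluster, executable=exe, exit_after_setup=True,
            output_file_name=output_file_name,
            error_file_name=error_file_name)
        return_code = os.system(command)
        # On a nonzero exit this raises AssertionError carrying the first
        # 'ERROR' / 'LBANN error' line found in the error artifact, rather
        # than dumping the entire log into the Bamboo output.
        tools.assert_success(return_code, error_file_name)

Negative tests use tools.assert_failure(return_code, expected_error,
error_file_name) instead, which additionally verifies that the expected message
actually appeared.
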
--- bamboo/compiler_tests/test_compiler.py | 52 ++++--------------- bamboo/integration_tests/common_code.py | 4 +- .../test_integration_autoencoders.py | 3 -- .../test_integration_debug.py | 6 ++- .../test_integration_performance.py | 3 -- 5 files changed, 16 insertions(+), 52 deletions(-) diff --git a/bamboo/compiler_tests/test_compiler.py b/bamboo/compiler_tests/test_compiler.py index c66bcdb7462..eddcd801ba4 100644 --- a/bamboo/compiler_tests/test_compiler.py +++ b/bamboo/compiler_tests/test_compiler.py @@ -15,13 +15,6 @@ def test_compiler_build_script(cluster, dirname): command = '%s/bamboo/compiler_tests/build_script.sh > %s 2> %s' % ( dirname, output_file_name, error_file_name) return_code = os.system(command) - if return_code != 0: - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) tools.assert_success(return_code, error_file_name) @@ -73,48 +66,46 @@ def test_compiler_intel19_debug(cluster, dirname): assert os.path.exists(path) -def skeleton_clang6(cluster, dir_name, debug, should_log=False): +def skeleton_clang6(cluster, dir_name, debug): if cluster not in ['catalyst']: e = 'skeleton_clang6: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) try: - spack_skeleton(dir_name, 'clang@6.0.0', 'mvapich2@2.2', debug, - should_log) - build_skeleton(dir_name, 'clang@6.0.0', debug, should_log) + spack_skeleton(dir_name, 'clang@6.0.0', 'mvapich2@2.2', debug) + build_skeleton(dir_name, 'clang@6.0.0', debug) except AssertionError as e: print(e) build_script(cluster, dir_name, 'clang6', debug) -def skeleton_gcc7(cluster, dir_name, debug, should_log=False): +def skeleton_gcc7(cluster, dir_name, debug): if cluster not in ['catalyst', 'pascal']: e = 'skeleton_gcc7: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) try: - spack_skeleton(dir_name, 'gcc@7.1.0', 'mvapich2@2.2', debug, should_log) - build_skeleton(dir_name, 'gcc@7.1.0', debug, should_log) + spack_skeleton(dir_name, 'gcc@7.1.0', 'mvapich2@2.2', debug) + build_skeleton(dir_name, 'gcc@7.1.0', debug) except AssertionError as e: print(e) build_script(cluster, dir_name, 'gcc7', debug) -def skeleton_intel19(cluster, dir_name, debug, should_log=False): +def skeleton_intel19(cluster, dir_name, debug): if cluster not in []: # Taking out 'catalyst' e = 'skeleton_intel19: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) try: - spack_skeleton(dir_name, 'intel@19.0.0', 'mvapich2@2.2', debug, - should_log) - build_skeleton(dir_name, 'intel@19.0.0', debug, should_log) + spack_skeleton(dir_name, 'intel@19.0.0', 'mvapich2@2.2', debug) + build_skeleton(dir_name, 'intel@19.0.0', debug) except AssertionError as e: print(e) build_script(cluster, dir_name, 'intel19', debug) -def spack_skeleton(dir_name, compiler, mpi_lib, debug, should_log): +def spack_skeleton(dir_name, compiler, mpi_lib, debug): compiler_underscored = re.sub('[@\.]', '_', compiler) if debug: build_type = 'debug' @@ -130,17 +121,10 @@ def spack_skeleton(dir_name, compiler, mpi_lib, debug, should_log): dir_name, compiler, mpi_lib, debug_flag, output_file_name, error_file_name) return_code = os.system(command) os.chdir('..') - if should_log or (return_code != 0): - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' 
% (error_file_name, line)) tools.assert_success(return_code, error_file_name) -def build_skeleton(dir_name, compiler, debug, should_log): +def build_skeleton(dir_name, compiler, debug): compiler_underscored = re.sub('[@\.]', '_', compiler) if debug: build_type = 'debug' @@ -165,13 +149,6 @@ def build_skeleton(dir_name, compiler, debug, should_log): command = 'make -j all > %s 2> %s' % (output_file_name, error_file_name) return_code = os.system(command) os.chdir('../..') - if should_log or (return_code != 0): - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) tools.assert_success(return_code, error_file_name) @@ -189,11 +166,4 @@ def build_script(cluster, dirname, compiler, debug): error_file_name = '%s/bamboo/compiler_tests/error/%s_%s_%s_build_script_error.txt' % (dirname, cluster, compiler, build) command = '%s/bamboo/compiler_tests/build_script_specific.sh --compiler %s %s> %s 2> %s' % (dirname, compiler, debug_flag, output_file_name, error_file_name) return_code = os.system(command) - if return_code != 0: - output_file = open(output_file_name, 'r') - for line in output_file: - print('%s: %s' % (output_file_name, line)) - error_file = open(error_file_name, 'r') - for line in error_file: - print('%s: %s' % (error_file_name, line)) tools.assert_success(return_code, error_file_name) diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index 62e75133709..ee53649d757 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -80,7 +80,6 @@ def run_lbann(command, model_name, output_file_name, error_file_name, if should_log or (return_code != 0): output_file = open(output_file_name, 'r') for line in output_file: - print('%s: %s' % (output_file_name, line)) is_match = re.search( 'This lbann_exception is about to be thrown:(.*)', line) if is_match: @@ -90,7 +89,6 @@ def run_lbann(command, model_name, output_file_name, error_file_name, timed_out = True error_file = open(error_file_name, 'r') for line in error_file: - print('%s: %s' % (error_file_name, line)) is_match = re.search('LBANN error on (.*)', line) if is_match: lbann_exceptions.append(is_match.group(1)) @@ -227,7 +225,7 @@ def skeleton(cluster, dir_name, executable, model_folder, model_name, cluster, dir_name, model_folder, model_name, executable, output_file_name, error_file_name, compiler_name, weekly=weekly) run_lbann(command, model_name, output_file_name, - error_file_name, should_log) # Don't need return value + error_file_name, should_log) return extract_data(output_file_name, data_fields, should_log) # Misc. 
functions ############################################################ diff --git a/bamboo/integration_tests/test_integration_autoencoders.py b/bamboo/integration_tests/test_integration_autoencoders.py index 8d1b0c2216b..c4e4f7bf9e4 100644 --- a/bamboo/integration_tests/test_integration_autoencoders.py +++ b/bamboo/integration_tests/test_integration_autoencoders.py @@ -36,9 +36,6 @@ def run_tests(actual_objective_functions, model_name, dir_name, cluster, actual_objective_functions, expected_objective_functions, model_name, errors, all_values, frequency_str) - print('Errors for: %s %s (%d)' % (model_name, compiler_name, len(errors))) - for error in errors: - print(error) if should_log: print('All values for: %s %s (%d)' % (model_name, compiler_name, len(all_values))) diff --git a/bamboo/integration_tests/test_integration_debug.py b/bamboo/integration_tests/test_integration_debug.py index 9f64e821e2e..274172be72b 100644 --- a/bamboo/integration_tests/test_integration_debug.py +++ b/bamboo/integration_tests/test_integration_debug.py @@ -26,7 +26,8 @@ def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, data_reader_name='mnist', model_folder='models/' + model_name, model_name=model_name, num_epochs=5, optimizer_name='adagrad', output_file_name=output_file_name, error_file_name=error_file_name) - common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) + common_code.run_lbann(command, model_name, output_file_name, + error_file_name, should_log) def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, @@ -55,7 +56,8 @@ def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, data_reader_name='cifar10', data_reader_percent=0.01, model_folder='models/' + model_name, model_name='conv_' + model_name, num_epochs=5, optimizer_name='adagrad', output_file_name=output_file_name, error_file_name=error_file_name) - common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) + common_code.run_lbann(command, model_name, output_file_name, + error_file_name, should_log) def test_integration_mnist_clang6_debug(cluster, dirname, exes, weekly, debug_build): diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index 31044b4dafd..fdb3e01df4a 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -80,9 +80,6 @@ def run_tests(actual_performance, model_name, dir_name, should_log, else: print('os.environ["LOGNAME"]=%s' % os.environ['LOGNAME']) - print('Errors for: %s %s (%d)' % (model_name, compiler_name, len(errors))) - for error in errors: - print(error) if should_log: print('All values for: %s %s (%d)' % ( model_name, compiler_name, len(all_values))) From ce2349a6445ea63f42a6ba2bb7769888d9fc5745 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 1 Aug 2019 15:52:27 -0700 Subject: [PATCH 265/634] Update JAG file path --- bamboo/unit_tests/test_unit_reconstruction_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bamboo/unit_tests/test_unit_reconstruction_loss.py b/bamboo/unit_tests/test_unit_reconstruction_loss.py index e7283893404..c5617a0335a 100644 --- a/bamboo/unit_tests/test_unit_reconstruction_loss.py +++ b/bamboo/unit_tests/test_unit_reconstruction_loss.py @@ -18,7 +18,7 @@ def skeleton_jag_reconstruction_loss(cluster, executables, dir_name, compiler_na num_nodes=16, num_processes=32, 
dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_filedir_default='/p/lscratchh/brainusr/datasets/10MJAG/1M_A/100K4trainers', data_reader_name='jag', metadata='model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext', model_folder='tests', From 794e415326b92b8890241998d5eb78b10a43f7af Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Fri, 23 Aug 2019 19:00:43 -0700 Subject: [PATCH 266/634] initial commit. Shuffles lines 3..n of sample list files --- model_zoo/jag_utils/python/shuffle.py | 42 +++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 model_zoo/jag_utils/python/shuffle.py diff --git a/model_zoo/jag_utils/python/shuffle.py b/model_zoo/jag_utils/python/shuffle.py new file mode 100644 index 00000000000..7e19f2a525e --- /dev/null +++ b/model_zoo/jag_utils/python/shuffle.py @@ -0,0 +1,42 @@ +import sys +import os +import random + +def shuffle(fn) : + fn2 = fn + '.shuffled' + a = open(fn).readlines() + b = open(fn2, 'w') + b.write(a[0]) + b.write(a[1]) + b.write(a[2]) + c = a[3:] + n = len(c) + r = set() + r_idx = [] + for y in range(n) : + while True : + y = random.randint(0, n-1) + if y not in r : + r.add(y) + r_idx.append(y) + if len(r) == n : + break + for j in range(len(c)) : + b.write(c[r_idx[j]]) + b.close() + print 'wrote:', fn2 + +#==================================================================== +if len(sys.argv) != 4 : + print 'usage:', sys.argv[0], 'base_dir num_sample_lists sample_list_base_name' + print 'example: python', sys.argv[0], '/p/lustre2/brainusr/datasets/10MJAG/1M_A/select_samples_test/another_dir 10 my_samples.txt', + exit(9) + +dir = sys.argv[1] +n = int(sys.argv[2]) +base_fn = sys.argv[3] + +for j in range(n) : + fn = dir + '/t' + str(j) + '_' + base_fn + shuffle(fn) + From c8d79935979124e400dcbac7a9d8465fb18a2ba7 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 27 Aug 2019 10:21:51 -0700 Subject: [PATCH 267/634] Callback to print model descriptions (#1181) * Include callbacks in model description messages * Add callback to print model description Models no longer print description by default. * Documenting "print model description" callback * Adopt review suggestions from @benson31 * Remove callback hook at beginning of setup Many objects are invalid before setup, so applying a callback is semantically iffy. 
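With this change a front-end script must opt in to the description
printout. The callbacks list from the model_zoo/vision/lenet.py hunk
below illustrates the intended usage; only the new first entry is
added by this patch:

    import lbann

    # Models no longer print their description by default; request
    # it explicitly alongside the existing print/timer callbacks.
    callbacks = [lbann.CallbackPrintModelDescription(),
                 lbann.CallbackPrint(),
                 lbann.CallbackTimer()]

On the C++ side the callback fires from the new on_setup_end() hook,
which model::setup() invokes once all layers, weights, and callbacks
have been set up.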
--- include/lbann/callbacks/CMakeLists.txt | 1 + include/lbann/callbacks/callback.hpp | 6 ++ .../callbacks/print_model_description.hpp | 60 +++++++++++++++++++ include/lbann/lbann.hpp | 1 + include/lbann/models/model.hpp | 2 + model_zoo/vision/lenet.py | 4 +- src/callbacks/CMakeLists.txt | 2 + src/callbacks/callback.cpp | 35 +++++++++++ src/callbacks/print_model_description.cpp | 49 +++++++++++++++ src/models/model.cpp | 21 ++++++- src/proto/callbacks.proto | 10 ++++ src/proto/factories/callback_factory.cpp | 3 + src/utils/lbann_library.cpp | 9 --- 13 files changed, 192 insertions(+), 11 deletions(-) create mode 100644 include/lbann/callbacks/print_model_description.hpp create mode 100644 src/callbacks/callback.cpp create mode 100644 src/callbacks/print_model_description.cpp diff --git a/include/lbann/callbacks/CMakeLists.txt b/include/lbann/callbacks/CMakeLists.txt index ff9c269f350..f90de778a7d 100644 --- a/include/lbann/callbacks/CMakeLists.txt +++ b/include/lbann/callbacks/CMakeLists.txt @@ -26,6 +26,7 @@ set_full_path(THIS_DIR_HEADERS monitor_io.hpp perturb_adam.hpp perturb_dropout.hpp + print_model_description.hpp print_statistics.hpp profiler.hpp replace_weights.hpp diff --git a/include/lbann/callbacks/callback.hpp b/include/lbann/callbacks/callback.hpp index f7e7c428188..fcdc0295aad 100644 --- a/include/lbann/callbacks/callback.hpp +++ b/include/lbann/callbacks/callback.hpp @@ -31,6 +31,7 @@ #include "lbann/layers/layer.hpp" #include "lbann/models/model.hpp" +#include "lbann/utils/description.hpp" #include "lbann/utils/memory.hpp" #include "lbann/utils/summary.hpp" @@ -89,6 +90,8 @@ class callback_base { /** @name Callback hooks */ ///@{ + /** @brief Called at the end of setup. */ + virtual void on_setup_end(model *m) {} /** @brief Called at the beginning of training. */ virtual void on_train_begin(model *m) {} /** @brief Called at the end of training. */ @@ -173,6 +176,9 @@ class callback_base { /** @brief Return this callback's name. */ virtual std::string name() const = 0; + /** @brief Human-readable description. */ + virtual description get_description() const; + ///@} protected: diff --git a/include/lbann/callbacks/print_model_description.hpp b/include/lbann/callbacks/print_model_description.hpp new file mode 100644 index 00000000000..9f68cc39322 --- /dev/null +++ b/include/lbann/callbacks/print_model_description.hpp @@ -0,0 +1,60 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_CALLBACK_PRINT_MODEL_DESCRIPTION_HPP_INCLUDED +#define LBANN_CALLBACKS_CALLBACK_PRINT_MODEL_DESCRIPTION_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { +namespace callback { + +/** @brief Print human-readable description of model to standard input. + * + * Message is printed when the model has finished setup. The + * description includes information on the model's layers, weights, + * and callbacks. + */ +class print_model_description : public callback_base { +public: + print_model_description() : callback_base() {} + print_model_description(const print_model_description&) = default; + print_model_description& operator=(const print_model_description&) = default; + print_model_description* copy() const override { return new print_model_description(*this); } + void on_setup_end(model *m) override; + std::string name() const override { return "print_model_description"; } + +}; + +// Builder function +std::unique_ptr +build_print_model_description_callback_from_pbuf( + const google::protobuf::Message&, std::shared_ptr const&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_CALLBACK_PRINT_MODEL_DESCRIPTION_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index d13a3f5ea47..66c10065fb2 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -158,6 +158,7 @@ #include "lbann/callbacks/monitor_io.hpp" #include "lbann/callbacks/perturb_adam.hpp" #include "lbann/callbacks/perturb_dropout.hpp" +#include "lbann/callbacks/print_model_description.hpp" #include "lbann/callbacks/print_statistics.hpp" #include "lbann/callbacks/profiler.hpp" #include "lbann/callbacks/replace_weights.hpp" diff --git a/include/lbann/models/model.hpp b/include/lbann/models/model.hpp index a15aee6d399..8448f37786e 100644 --- a/include/lbann/models/model.hpp +++ b/include/lbann/models/model.hpp @@ -387,6 +387,8 @@ class model { // Callbacks // =========================================== + /** @brief Execute callbacks at end of setup. */ + virtual void do_setup_end_cbs(); /** @brief Execute callbacks at start of training. */ virtual void do_train_begin_cbs(); /** @brief Execute callbacks at end of training. 
*/ diff --git a/model_zoo/vision/lenet.py b/model_zoo/vision/lenet.py index fd90928819c..97cdb15ea14 100755 --- a/model_zoo/vision/lenet.py +++ b/model_zoo/vision/lenet.py @@ -80,7 +80,9 @@ layers=lbann.traverse_layer_graph(input), objective_function=loss, metrics=[lbann.Metric(acc, name='accuracy', unit='%')], - callbacks=[lbann.CallbackPrint(), lbann.CallbackTimer()]) + callbacks=[lbann.CallbackPrintModelDescription(), + lbann.CallbackPrint(), + lbann.CallbackTimer()]) # Setup optimizer opt = lbann.SGD(learn_rate=0.01, momentum=0.9) diff --git a/src/callbacks/CMakeLists.txt b/src/callbacks/CMakeLists.txt index 27e08c5d5c9..1ebb5f80f04 100644 --- a/src/callbacks/CMakeLists.txt +++ b/src/callbacks/CMakeLists.txt @@ -1,5 +1,6 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES + callback.cpp check_dataset.cpp check_gradients.cpp check_init.cpp @@ -25,6 +26,7 @@ set_full_path(THIS_DIR_SOURCES monitor_io.cpp perturb_adam.cpp perturb_dropout.cpp + print_model_description.cpp print_statistics.cpp profiler.cpp replace_weights.cpp diff --git a/src/callbacks/callback.cpp b/src/callbacks/callback.cpp new file mode 100644 index 00000000000..0a395ea8de2 --- /dev/null +++ b/src/callbacks/callback.cpp @@ -0,0 +1,35 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/callbacks/callback.hpp" + +namespace lbann { + +description callback_base::get_description() const { + return name(); +} + +} // namespace lbann diff --git a/src/callbacks/print_model_description.cpp b/src/callbacks/print_model_description.cpp new file mode 100644 index 00000000000..b44d64547aa --- /dev/null +++ b/src/callbacks/print_model_description.cpp @@ -0,0 +1,49 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/callbacks/print_model_description.hpp" +#include "lbann/models/model.hpp" +#include + +namespace lbann { +namespace callback { + +void print_model_description::on_setup_end(model *m) { + if (m->get_comm()->am_world_master()) { + std::cout << "\n" + << m->get_description() + << std::endl; + } +} + +std::unique_ptr +build_print_model_description_callback_from_pbuf( + const google::protobuf::Message&, const std::shared_ptr&) { + return make_unique(); +} + +} // namespace callback +} // namespace lbann diff --git a/src/models/model.cpp b/src/models/model.cpp index 3439b630642..4e4da3e6f04 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -268,7 +268,15 @@ description model::get_description() const { desc.add(std::string{}); desc.add(weights_desc); - /// @todo Descriptions for objective function, metrics, callbacks + // Callbacks + description callback_desc("Callbacks:"); + for (const auto& cb : m_callbacks) { + callback_desc.add(cb->get_description()); + } + desc.add(std::string{}); + desc.add(callback_desc); + + /// @todo Descriptions for objective function, metrics // Result return desc; @@ -593,6 +601,7 @@ void model::remap_pointers(const std::unordered_map& layer_map, // ============================================= void model::setup(std::shared_ptr io_thread_pool) { + // Setup I/O threads - set up before setting up the layers (input // layer depends on having a properly initialized thread pool) m_io_thread_pool = std::move(io_thread_pool); @@ -617,6 +626,10 @@ void model::setup(std::shared_ptr io_thread_pool) { for (const auto& cb : m_callbacks) { cb->setup(this); } + + // Callback hooks at end of setup + do_setup_end_cbs(); + } void model::setup_layer_topology() { @@ -1212,6 +1225,12 @@ void model::reconcile_weight_values() { // Callbacks // ============================================= +void model::do_setup_end_cbs() { + for (const auto& cb : m_callbacks) { + cb->on_setup_end(this); + } +} + void model::do_train_begin_cbs() { for (const auto& cb : m_callbacks) { cb->on_train_begin(this); diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index f9909233908..f204be30b79 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -74,6 +74,7 @@ message Callback { CallbackCheckInit init = 42; CallbackEarlyStopping early_stopping = 43; CallbackTimeline timeline = 44; + CallbackPrintModelDescription print_model_description = 45; } message CallbackLTFB { @@ -315,4 +316,13 @@ message Callback { message CallbackTimeline { string directory = 1; } + + // Print human-readable description of model to standard output. + // + // Message is printed when the model has finished setup. The + // description includes information on the model's layers, weights, + // and callbacks. 
+ message CallbackPrintModelDescription { + } + } \ No newline at end of file diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 2c463d004f2..d36ebe25408 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -51,6 +51,7 @@ #include "lbann/callbacks/monitor_io.hpp" #include "lbann/callbacks/perturb_adam.hpp" #include "lbann/callbacks/perturb_dropout.hpp" +#include "lbann/callbacks/print_model_description.hpp" #include "lbann/callbacks/print_statistics.hpp" #include "lbann/callbacks/profiler.hpp" #include "lbann/callbacks/replace_weights.hpp" @@ -155,6 +156,8 @@ void register_default_builders(factory_type& factory) build_perturb_dropout_callback_from_pbuf); factory.register_builder("CallbackPolyLearningRate", build_poly_learning_rate_callback_from_pbuf); + factory.register_builder("CallbackPrintModelDescription", + build_print_model_description_callback_from_pbuf); factory.register_builder("CallbackPrint", build_print_statistics_callback_from_pbuf); factory.register_builder("CallbackProfiler", diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp index b0bda3d8470..214a70a36c4 100644 --- a/src/utils/lbann_library.cpp +++ b/src/utils/lbann_library.cpp @@ -207,15 +207,6 @@ std::unique_ptr build_model_from_prototext( //@todo //model->restartShared(); - if (comm->am_world_master()) { - std::cout << "\n" - << ret_model->get_description() - << "Callbacks:" << std::endl; - for (callback_base *cb : ret_model->get_callbacks()) { - std::cout << cb->name() << std::endl; - } - } - #ifndef LBANN_DETERMINISTIC // Under normal conditions, reinitialize the random number generator so // that regularization techniques (e.g. dropout) generate unique patterns From 60661a98235e94bcbc00e737d04431a64858e508 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Mon, 26 Aug 2019 17:43:32 -0700 Subject: [PATCH 268/634] Fix Develop errors --- bamboo/allocate_and_run.sh | 7 +-- bamboo/common_python/tools.py | 3 +- .../catalyst/clang6/expected_performance.csv | 2 +- .../catalyst/gcc7/expected_performance.csv | 2 +- .../corona/gcc7/expected_performance.csv | 2 +- .../lassen/gcc7/expected_performance.csv | 2 +- .../pascal/gcc7/expected_performance.csv | 2 +- bamboo/integration_tests/full_alexnet.sh | 44 ++++++++++++++++--- bamboo/unit_tests/test_unit_checkpoint.py | 24 +++------- bamboo/unit_tests/test_unit_lbann2_reload.py | 16 ++----- 10 files changed, 55 insertions(+), 49 deletions(-) diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index 2da19c8cf78..6ada77c39bd 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -40,8 +40,8 @@ if [ "${CLUSTER}" = 'lassen' ]; then timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 16 -W $ALLOCATION_TIME_LIMIT ./run.sh fi elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then + ALLOCATION_TIME_LIMIT=960 if [ ${WEEKLY} -ne 0 ]; then - ALLOCATION_TIME_LIMIT=720 timeout -k 5 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh --weekly if [ "${CLUSTER}" = 'catalyst' ]; then cd integration_tests @@ -51,11 +51,6 @@ elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTE cd .. 
fi else - if [ "${CLUSTER}" = 'catalyst' ]; then - ALLOCATION_TIME_LIMIT=300 - elif [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then - ALLOCATION_TIME_LIMIT=660 - fi timeout -k 5 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh fi fi diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index cbb08c0efe2..4cc5839ae8e 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -561,7 +561,8 @@ def get_error_line(error_file_name): ('Expired or invalid job' in line): error_line = line break - elif 'Stack trace:' in line: + elif ('Stack trace:' in line) or \ + ('Error is not recoverable: exiting now' in line): error_line = previous_line break else: diff --git a/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv index c767d373880..6a9581ff8f8 100644 --- a/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy alexnet_nightly, 117.00, 2.80, 9.00, 1.20, 2.00, 0.00 -alexnet_weekly, 490.00, 1.00, 3.00, 0.60, 0.50, 100.00 +alexnet_weekly, 490.00, 1.00, 9.00, 0.60, 0.50, 2.50 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 lenet_mnist, 100.00, 0.12, 0.40, 0.10, 0.09, 98.40 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv index 258fa233523..c05e05c43e8 100644 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy alexnet_nightly, 65.00, 1.50, 8.30, 0.37, 1.70, 0.1 -alexnet_weekly, 360.00, 0.90, 4.00, 0.40, 0.70, 100.00 +alexnet_weekly, 360.00, 0.90, 4.00, 0.40, 0.70, 2.00 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 lenet_mnist, 137.00, 0.18, 0.40, 0.15, 0.04, 98.92 diff --git a/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv index e36bf374407..f48c79b35b6 100644 --- a/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy alexnet_nightly, 55.00, 1.03, 1.90, 0.80, 0.21, 0.00 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +alexnet_weekly, 491.00, 1.00, 9.00, 1.11, 0.60, 2.00 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 lenet_mnist, 385.00, 0.50, 2.00, 0.51, 0.80, 98.40 diff --git a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv index 9b46fecbd43..aa67a5073a0 100644 --- a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy 
alexnet_nightly, 23.00, 0.70, 10.30, 0.10, 1.20, 0.00 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +alexnet_weekly, 56.00, 0.15, 10.00, 0.70, 0.70, 1.50 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 lenet_mnist, 10.10, 0.06, 5.30, 0.01, 0.60, 98.30 diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv index 4285d808302..98c22e515df 100644 --- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv +++ b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv @@ -1,5 +1,5 @@ Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy alexnet_nightly, 51.00, 1.20, 4.00, 0.50, 0.40, 100.00 -alexnet_weekly, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 +alexnet_weekly, 300.00, 1.00, 7.00, 0.10, 1.30, 2.0 cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 lenet_mnist, 12.00, 0.04, 6.00, 0.01, 0.40, 98.40 diff --git a/bamboo/integration_tests/full_alexnet.sh b/bamboo/integration_tests/full_alexnet.sh index 8ce75add3a2..393a0fb9e43 100755 --- a/bamboo/integration_tests/full_alexnet.sh +++ b/bamboo/integration_tests/full_alexnet.sh @@ -2,28 +2,58 @@ module load mpifileutils +COMPILER=0 +while :; do + case ${1} in + --compiler) + # Choose compiler + if [ -n "${2}" ]; then + COMPILER=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + exit 1 + fi + ;; + -?*) + # Unknown option + echo "Unknown option (${1})" >&2 + exit 1 + ;; + *) + # Break loop if there are no more options + break + esac + shift +done + +if [ ${COMPILER} -eq 0 ]; then + exit 1 +fi + +LBANN_DIR=$(git rev-parse --show-toplevel) +CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') +FILE_PREFIX=${LBANN_DIR}/bamboo/unit_tests/output/full_alexnet_${CLUSTER}_${COMPILER} + # Clear SSDs -srun --wait=0 --clear-ssd hostname > /dev/null +srun --wait=0 --clear-ssd hostname > ${FILE_PREFIX}_1_output.txt # Cache dataset echo "Caching dataset..." 
[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/train_resized.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar > /dev/null + srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/train_resized.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar > ${FILE_PREFIX}_2_output.txt [ -d /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train ] || \ srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 [ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/val_resized.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar > /dev/null + srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/val_resized.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar > ${FILE_PREFIX}_3_output.txt [ -d /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val ] || \ srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 [ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar > /dev/null + srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar > ${FILE_PREFIX}_4_output.txt [ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/train.txt ] || \ srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 wait echo "Done caching dataset..." 
-LBANN_DIR=$(git rev-parse --show-toplevel) -CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') - # Experiment srun --nodes=128 --ntasks-per-node=2 ${LBANN_DIR}/bamboo/compiler_tests/builds/catalyst_gcc-7.1.0_x86_64_mvapich2-2.2_openblas_rel/build/model_zoo/lbann --model=${LBANN_DIR}/model_zoo/models/alexnet/model_alexnet.prototext --optimizer=${LBANN_DIR}/model_zoo/optimizers/opt_sgd.prototext --reader=${LBANN_DIR}/model_zoo/data_readers/data_reader_imagenet.prototext --data_filedir_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/train/ --data_filename_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/train.txt --data_filedir_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/val/ --data_filename_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/val.txt diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index d46173a0a8d..bae6b789ef3 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -24,9 +24,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code_nockpt = os.system(command) - if return_code_nockpt != 0: - sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_nockpt, error_file_name) os.system('mkdir ckpt_lenet_shared') no_ckpt_dir = 'ckpt_lenet_shared/no_ckpt_{c}'.format(c=compiler_name) os.system('mv ckpt {c}'.format(c=no_ckpt_dir)) @@ -42,9 +40,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, model_name='lenet_mnist_ckpt', num_epochs=1, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code_ckpt_1 = os.system(command) - if return_code_ckpt_1 != 0: - sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_ckpt_1, error_file_name) # Pick up from checkpoint, printing weights to files. 
output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_restart_%s_output.txt' % (dir_name, compiler_name) @@ -57,9 +53,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code_ckpt_2 = os.system(command) - if return_code_ckpt_2 != 0: - sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_ckpt_2, error_file_name) diff_test = os.system('diff -rq ckpt {c}'.format(c=no_ckpt_dir)) ckpt_dir = 'ckpt_lenet_shared/ckpt_{c}'.format(c=compiler_name) @@ -89,9 +83,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code_nockpt = os.system(command) - if return_code_nockpt != 0: - sys.stderr.write('LeNet (no checkpoint) execution failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_nockpt, error_file_name) os.system('mkdir ckpt_lenet_distributed') no_ckpt_dir = 'ckpt_lenet_distributed/no_ckpt_{c}'.format(c=compiler_name) os.system('mv ckpt {c}'.format(c=no_ckpt_dir)) @@ -107,9 +99,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, model_name='lenet_mnist_dist_ckpt', num_epochs=1, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code_ckpt_1 = os.system(command) - if return_code_ckpt_1 != 0: - sys.stderr.write('LeNet (checkpoint) execution failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_ckpt_1, error_file_name) # Pick up from checkpoint, printing weights to files. 
output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_restart_%s_output.txt' % (dir_name, compiler_name) @@ -122,9 +112,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code_ckpt_2 = os.system(command) - if return_code_ckpt_2 != 0: - sys.stderr.write('LeNet execution (restart from checkpoint) failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_ckpt_2, error_file_name) diff_test = os.system('diff -rq ckpt {c}'.format(c=no_ckpt_dir)) ckpt_dir = 'ckpt_lenet_distributed/ckpt_{c}'.format(c=compiler_name) diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index 5f39047d6ad..6bd3aced4e1 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -28,10 +28,8 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): error_file_name=error_file_name) os.mkdir('lbann2_ckpt') - return_code = os.system(command) - if return_code != 0: - sys.stderr.write('LBANN2 LeNet execution failed, exiting with error') - sys.exit(1) + return_code_no_ckpt = os.system(command) + tools.assert_success(return_code_no_ckpt, error_file_name) os.system('mkdir ckpt_lbann2_reload') no_ckpt_dir = 'ckpt_lbann2_reload/lbann2_no_ckpt_{c}'.format(c=compiler_name) @@ -49,10 +47,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): output_file_name=output_file_name, error_file_name=error_file_name) return_code_ckpt_1 = os.system(command) - if return_code_ckpt_1 != 0: - sys.stderr.write( - 'LeNet (checkpoint) execution failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_ckpt_1, error_file_name) # Pick up from checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/lbann2_restart_%s_output.txt' % (dir_name, compiler_name) @@ -68,10 +63,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): output_file_name=output_file_name, error_file_name=error_file_name) return_code_ckpt_2 = os.system(command) - if return_code_ckpt_2 != 0: - sys.stderr.write( - 'LBANN2 LeNet weight reload failed, exiting with error') - sys.exit(1) + tools.assert_success(return_code_ckpt_2, error_file_name) os.system('rm lbann2_ckpt/model0-epoch*') os.system('rm lbann2_nockpt/model0-epoch*') From f8afaee0bdc14bec85d718faf2a7bb3f92b83d01 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 27 Aug 2019 11:12:52 -0700 Subject: [PATCH 269/634] Remove "moving MNIST" data reader Deprecated by Python data reader. 
--- include/lbann/data_readers/CMakeLists.txt | 1 - .../data_readers/data_reader_moving_mnist.hpp | 86 ------ include/lbann/lbann.hpp | 1 - .../data_reader_moving_mnist.prototext | 23 -- src/data_readers/CMakeLists.txt | 3 +- src/data_readers/data_reader_moving_mnist.cpp | 286 ------------------ src/proto/init_image_data_readers.cpp | 5 - src/proto/proto_common.cpp | 7 +- 8 files changed, 2 insertions(+), 410 deletions(-) delete mode 100644 include/lbann/data_readers/data_reader_moving_mnist.hpp delete mode 100644 model_zoo/data_readers/data_reader_moving_mnist.prototext delete mode 100644 src/data_readers/data_reader_moving_mnist.cpp diff --git a/include/lbann/data_readers/CMakeLists.txt b/include/lbann/data_readers/CMakeLists.txt index aecf829df39..b45863673da 100644 --- a/include/lbann/data_readers/CMakeLists.txt +++ b/include/lbann/data_readers/CMakeLists.txt @@ -10,7 +10,6 @@ set_full_path(THIS_DIR_HEADERS data_reader_merge_features.hpp data_reader_merge_samples.hpp data_reader_mnist.hpp - data_reader_moving_mnist.hpp data_reader_nci.hpp data_reader_numpy.hpp data_reader_numpy_npz.hpp diff --git a/include/lbann/data_readers/data_reader_moving_mnist.hpp b/include/lbann/data_readers/data_reader_moving_mnist.hpp deleted file mode 100644 index 034bca57880..00000000000 --- a/include/lbann/data_readers/data_reader_moving_mnist.hpp +++ /dev/null @@ -1,86 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_DATA_READER_MOVING_MNIST_HPP -#define LBANN_DATA_READER_MOVING_MNIST_HPP - -#include "data_reader.hpp" - -namespace lbann { - -class moving_mnist_reader : public generic_data_reader { -public: - moving_mnist_reader(El::Int num_frames, - El::Int image_height, - El::Int image_width, - El::Int num_objects); - moving_mnist_reader(const moving_mnist_reader&) = default; - moving_mnist_reader& operator=(const moving_mnist_reader&) = default; - ~moving_mnist_reader() override = default; - moving_mnist_reader* copy() const override { return new moving_mnist_reader(*this); } - - std::string get_type() const override { - return "moving_mnist_reader"; - } - - void load() override; - - const std::vector get_data_dims() const override; - int get_num_labels() const override; - int get_linearized_data_size() const override; - int get_linearized_label_size() const override; - -protected: - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - -private: - - /** Number of frames. */ - El::Int m_num_frames; - /** Frame height. */ - El::Int m_image_height; - /** Frame width. */ - El::Int m_image_width; - /** Number of MNIST digits in each frame. */ - El::Int m_num_objects; - - /** Number of MNIST samples. */ - El::Int m_num_raw_images = 0; - /** MNIST image height. */ - El::Int m_raw_image_height = 0; - /** MNIST image width. */ - El::Int m_raw_image_width = 0; - /** Raw MNIST image data. */ - std::vector m_raw_image_data; - /** Raw MNIST label data. */ - std::vector m_raw_label_data; - -}; - -} // namespace lbann - -#endif // LBANN_DATA_READER_MOVING_MNIST_HPP diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 66c10065fb2..68f7835ba65 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -126,7 +126,6 @@ #include "lbann/data_readers/data_reader_ascii.hpp" #include "lbann/data_readers/data_reader_pilot2_molecular.hpp" #include "lbann/data_readers/data_reader_mesh.hpp" -#include "lbann/data_readers/data_reader_moving_mnist.hpp" #include "lbann/data_readers/data_reader_python.hpp" /// Data stores diff --git a/model_zoo/data_readers/data_reader_moving_mnist.prototext b/model_zoo/data_readers/data_reader_moving_mnist.prototext deleted file mode 100644 index 868fe0799f3..00000000000 --- a/model_zoo/data_readers/data_reader_moving_mnist.prototext +++ /dev/null @@ -1,23 +0,0 @@ -data_reader { - reader { - name: "moving_mnist" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "train-images-idx3-ubyte" - label_filename: "train-labels-idx1-ubyte" - validation_percent: 0.1 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - } - reader { - name: "moving_mnist" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "t10k-images-idx3-ubyte" - label_filename: "t10k-labels-idx1-ubyte" - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - } -} diff --git a/src/data_readers/CMakeLists.txt b/src/data_readers/CMakeLists.txt index 64d7b339477..4d633e44aae 100644 --- a/src/data_readers/CMakeLists.txt +++ b/src/data_readers/CMakeLists.txt @@ -12,7 +12,6 @@ set_full_path(THIS_DIR_SOURCES data_reader_merge_samples.cpp data_reader_mesh.cpp data_reader_mnist.cpp - data_reader_moving_mnist.cpp data_reader_nci.cpp data_reader_numpy.cpp data_reader_numpy_npz.cpp @@ -22,7 +21,7 @@ set_full_path(THIS_DIR_SOURCES 
data_reader_multihead_siamese.cpp data_reader_python.cpp offline_patches_npz.cpp - numpy_conduit_converter.cpp + numpy_conduit_converter.cpp data_reader_numpy_npz_conduit.cpp ) diff --git a/src/data_readers/data_reader_moving_mnist.cpp b/src/data_readers/data_reader_moving_mnist.cpp deleted file mode 100644 index c67e92f6b73..00000000000 --- a/src/data_readers/data_reader_moving_mnist.cpp +++ /dev/null @@ -1,286 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/data_reader_moving_mnist.hpp" -#include "lbann/utils/file_utils.hpp" -#include "lbann/models/model.hpp" -#include -#include - -namespace lbann { - -namespace { - -/** Called repeatedly to incrementally create a hash value from - * several variables. - * - * Copied from Boost. See - * https://www.boost.org/doc/libs/1_55_0/doc/html/hash/reference.html#boost.hash_combine. 
- */ -template -inline void hash_combine(size_t& seed, T v) { - seed ^= std::hash()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); -} - -} // namespace - -moving_mnist_reader::moving_mnist_reader(El::Int num_frames, - El::Int image_height, - El::Int image_width, - El::Int num_objects) - : generic_data_reader(true), - m_num_frames(num_frames), - m_image_height(image_height), - m_image_width(image_width), - m_num_objects(num_objects) {} - -// Data dimension access functions -const std::vector moving_mnist_reader::get_data_dims() const { - std::vector dims(4); - dims[0] = m_num_frames; - dims[1] = 3; - dims[2] = m_image_height; - dims[3] = m_image_width; - return dims; -} -int moving_mnist_reader::get_num_labels() const { - return 1 + 9 * m_num_objects; -} -int moving_mnist_reader::get_linearized_data_size() const { - const auto& dims = get_data_dims(); - return std::accumulate(dims.begin(), dims.end(), 1, - std::multiplies()); -} -int moving_mnist_reader::get_linearized_label_size() const { - return get_num_labels(); -} - -bool moving_mnist_reader::fetch_datum(CPUMat& X, int data_id, int col) { - - // Useful constants - constexpr DataType zero = 0; - constexpr DataType one = 1; - - // Choose raw images - /// @todo Implementation with uniform distribution - std::vector raw_image_indices(m_num_objects); - for (El::Int obj = 0; obj < m_num_objects; ++obj) { - size_t hash = 1234; - hash_combine(hash, data_id); - hash_combine(hash, m_model->get_epoch()); - hash_combine(hash, obj); - raw_image_indices[obj] = hash % m_num_raw_images; - } - - // Determine object boundaries - std::vector> bounds(m_num_objects); - for (El::Int obj = 0; obj < m_num_objects; ++obj) { - auto& xmin = bounds[obj][0] = m_raw_image_width; - auto& xmax = bounds[obj][1] = 0; - auto& ymin = bounds[obj][2] = m_raw_image_height; - auto& ymax = bounds[obj][3] = 0; - const auto& raw_image_offset = (raw_image_indices[obj] - * m_raw_image_height - * m_raw_image_width); - const auto* raw_image = &m_raw_image_data[raw_image_offset]; - for (El::Int j = 0; j < m_raw_image_height; ++j) { - for (El::Int i = 0; i < m_raw_image_width; ++i) { - if (raw_image[i + j * m_raw_image_width] != 0) { - xmin = std::min(xmin, i); - xmax = std::max(xmax, i+1); - ymin = std::min(ymin, j); - ymax = std::max(ymax, j+1); - } - } - } - xmin = std::min(xmin, xmax); - ymin = std::min(ymin, ymax); - } - - // Initial positions and velocities - /// @todo Ensure objects don't overlap - std::vector>> pos(m_num_objects); - std::vector> v(m_num_objects); - std::uniform_real_distribution dist(zero, one); - const DataType vmax = std::hypot(m_image_width, m_image_height) / 5; - for (El::Int obj = 0; obj < m_num_objects; ++obj) { - const auto& object_width = bounds[obj][1] - bounds[obj][0]; - const auto& object_height = bounds[obj][3] - bounds[obj][2]; - pos[obj].resize(m_num_frames); - pos[obj][0][0] = (m_image_width - object_width + 1) * dist(get_io_generator()); - pos[obj][0][1] = (m_image_height - object_height + 1) * dist(get_io_generator()); - const DataType vnorm = vmax * dist(get_io_generator()); - const DataType theta = 2 * M_PI * dist(get_io_generator()); - v[obj][0] = vnorm * std::sin(theta); - v[obj][1] = vnorm * std::cos(theta); - } - - // Determine object positions - /// @todo Ensure objects don't overlap - for (El::Int frame = 1; frame < m_num_frames; ++frame) { - for (El::Int obj = 0; obj < m_num_objects; ++obj) { - - // Linear motion - auto& x = pos[obj][frame][0]; - auto& y = pos[obj][frame][1]; - auto& vx = v[obj][0]; - auto& vy = v[obj][1]; - x = 
pos[obj][frame-1][0] + vx; - y = pos[obj][frame-1][1] + vy; - - // Reflections at boundaries - const auto& object_width = bounds[obj][1] - bounds[obj][0]; - const auto& object_height = bounds[obj][3] - bounds[obj][2]; - const DataType xmax = m_image_width - object_width + 1; - const DataType ymax = m_image_height - object_height + 1; - if (x <= zero || x >= xmax) { - x = std::min(std::max(x, zero), xmax); - vx = -vx; - } - if (y <= zero || y >= ymax) { - y = std::min(std::max(y, zero), ymax); - vy = -vy; - } - } - } - - // Populate frames - std::memset(X.Buffer(0, col), 0, X.Height() * sizeof(DataType)); - for (El::Int obj = 0; obj < m_num_objects; ++obj) { - - // Get raw image - const auto& object_width = bounds[obj][1] - bounds[obj][0]; - const auto& object_height = bounds[obj][3] - bounds[obj][2]; - const auto& object_width_offset = bounds[obj][0]; - const auto& object_height_offset = bounds[obj][2]; - const auto& raw_image_offset = ((raw_image_indices[obj] - * m_raw_image_height - * m_raw_image_width) - + object_width_offset - + (object_height_offset - * m_raw_image_width)); - const auto* raw_image = &m_raw_image_data[raw_image_offset]; - - // Copy raw image into each frame - const auto& xmax = m_image_width - object_width + 1; - const auto& ymax = m_image_height - object_height + 1; - for (El::Int frame = 0; frame < m_num_frames; ++frame) { - - // Get image position in current frame - El::Int xoff = pos[obj][frame][0]; - El::Int yoff = pos[obj][frame][1]; - xoff = std::min(std::max(xoff, El::Int(0)), xmax-1); - yoff = std::min(std::max(yoff, El::Int(0)), ymax-1); - - // Copy raw image into position - for (El::Int channel = 0; channel < 3; ++channel) { - for (El::Int j = 0; j < object_height; ++j) { - for (El::Int i = 0; i < object_width; ++i) { - const auto& row = (frame * 3 * m_image_height * m_image_width - + channel * m_image_height * m_image_width - + (yoff+j) * m_image_width - + (xoff+i)); - auto& pixel = X(row, col); - pixel += raw_image[i + j * m_raw_image_width] / 255.0; - pixel = std::min(pixel, one); - } - } - } - - } - - } - - return true; -} - -bool moving_mnist_reader::fetch_label(CPUMat& Y, int data_id, int col) { - - // Choose raw images - /// @todo Implementation with uniform distribution - std::vector raw_image_indices(m_num_objects); - for (El::Int obj = 0; obj < m_num_objects; ++obj) { - size_t hash = 1234; - hash_combine(hash, data_id); - hash_combine(hash, m_model->get_epoch()); - hash_combine(hash, obj); - raw_image_indices[obj] = hash % m_num_raw_images; - } - - // Label is sum of raw image labels - El::Int sum = 0; - for (const auto& i : raw_image_indices) { - sum += m_raw_label_data[i]; - } - auto&& Y_col = El::View(Y, El::ALL, El::IR(col)); - El::Zero(Y_col); - Y_col(sum, 0) = DataType(1); - - return true; -} - -void moving_mnist_reader::load() { - - // Read image data - const auto& image_file = get_file_dir() + "/" + get_data_filename(); - std::ifstream fs_image(image_file.c_str(), - std::fstream::in | std::fstream::binary); - unsigned int num_images = 0; - unsigned int image_height = 0; - unsigned int image_width = 0; - fs_image.ignore(4); - fs_image.read(reinterpret_cast(&num_images), 4); - fs_image.read(reinterpret_cast(&image_height), 4); - fs_image.read(reinterpret_cast(&image_width), 4); - __swapEndianInt(num_images); - __swapEndianInt(image_height); - __swapEndianInt(image_width); - m_num_raw_images = num_images; - m_raw_image_height = image_height; - m_raw_image_width = image_width; - m_raw_image_data.resize(num_images * image_height * image_width); 
- fs_image.read(reinterpret_cast(m_raw_image_data.data()), - num_images * image_height * image_width); - fs_image.close(); - - // Read labels - const auto& label_file = get_file_dir() + "/" + get_label_filename(); - std::ifstream fs_label(label_file.c_str(), - std::fstream::in | std::fstream::binary); - fs_label.ignore(8); - m_raw_label_data.resize(num_images); - fs_label.read(reinterpret_cast(m_raw_label_data.data()), num_images); - fs_label.close(); - - // Reset indices - m_shuffled_indices.resize(num_images); - std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); - resize_shuffled_indices(); - select_subset_of_data(); - -} - -} // namespace lbann diff --git a/src/proto/init_image_data_readers.cpp b/src/proto/init_image_data_readers.cpp index 014314994b0..71112ccd271 100644 --- a/src/proto/init_image_data_readers.cpp +++ b/src/proto/init_image_data_readers.cpp @@ -33,7 +33,6 @@ #include "lbann/data_readers/data_reader_jag_conduit.hpp" #include "lbann/data_readers/data_reader_imagenet.hpp" #include "lbann/data_readers/data_reader_mnist.hpp" -#include "lbann/data_readers/data_reader_moving_mnist.hpp" #include "lbann/data_readers/data_reader_multihead_siamese.hpp" #include @@ -90,8 +89,6 @@ void init_image_data_reader(const lbann_data::Reader& pb_readme, const lbann_dat reader = new imagenet_reader(shuffle); } else if (name == "multihead_siamese") { reader = new data_reader_multihead_siamese(pb_readme.num_image_srcs(), shuffle); - } else if (name == "moving_mnist") { - reader = new moving_mnist_reader(7, 40, 40, 2); } else if (name =="jag_conduit") { data_reader_jag_conduit* reader_jag = new data_reader_jag_conduit(shuffle); const lbann_data::DataSetMetaData::Schema& pb_schema = pb_metadata.schema(); @@ -293,8 +290,6 @@ void init_org_image_data_reader(const lbann_data::Reader& pb_readme, const bool } else if (name == "cifar10") { reader = new cifar10_reader(shuffle); if (master) std::cout << "cifar10_reader is set" << std::endl; - } else if (name == "moving_mnist") { - reader = new moving_mnist_reader(7, 40, 40, 2); } else { if (master) { std::stringstream err; diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 3742d201edc..b50495ee9f4 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -96,7 +96,7 @@ void init_data_readers( // This is a hack that should be fixed when we clean up data reader setup. 
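// Note: readers matched by the conditional below are built through the
// legacy init_org_image_data_reader() path and opt out of the newer
// transform pipeline (set_transform_pipeline = false); the hunk that
// follows merely drops "moving_mnist" from that list, since the reader
// has been removed.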
bool set_transform_pipeline = true; - if ((name == "mnist") || (name == "cifar10") || (name == "moving_mnist")) { + if ((name == "mnist") || (name == "cifar10")) { init_org_image_data_reader(readme, master, reader); set_transform_pipeline = false; } else if ((name == "imagenet") || @@ -324,8 +324,6 @@ void init_data_readers( } } else if (name == "mesh") { reader = new mesh_reader(shuffle); - } else if (name == "moving_mnist") { - reader = new moving_mnist_reader(7, 40, 40, 2); } else if (name == "python") { #ifdef LBANN_HAS_PYTHON const auto& params = readme.python(); @@ -481,9 +479,6 @@ void init_data_readers( } else if (name == "mesh") { reader_validation = new mesh_reader(shuffle); (*(mesh_reader *)reader_validation) = (*(mesh_reader *)reader); - } else if (name == "moving_mnist") { - reader_validation = new moving_mnist_reader(7, 40, 40, 2); - (*(moving_mnist_reader *)reader_validation) = (*(moving_mnist_reader *)reader); } else if (name == "python") { #ifdef LBANN_HAS_PYTHON const auto& params = readme.python(); From efbba9a24d7175f75424e6c0872fb713bd008ff8 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 27 Aug 2019 11:33:04 -0700 Subject: [PATCH 270/634] Remove ASCII data reader --- include/lbann/data_readers/CMakeLists.txt | 1 - .../lbann/data_readers/data_reader_ascii.hpp | 73 ----------- include/lbann/lbann.hpp | 1 - .../data_readers/data_reader_ascii.prototext | 19 --- src/data_readers/CMakeLists.txt | 3 +- src/data_readers/data_reader_ascii.cpp | 117 ------------------ 6 files changed, 1 insertion(+), 213 deletions(-) delete mode 100644 include/lbann/data_readers/data_reader_ascii.hpp delete mode 100644 model_zoo/data_readers/data_reader_ascii.prototext delete mode 100644 src/data_readers/data_reader_ascii.cpp diff --git a/include/lbann/data_readers/CMakeLists.txt b/include/lbann/data_readers/CMakeLists.txt index aecf829df39..a51df360114 100644 --- a/include/lbann/data_readers/CMakeLists.txt +++ b/include/lbann/data_readers/CMakeLists.txt @@ -2,7 +2,6 @@ set_full_path(THIS_DIR_HEADERS compound_data_reader.hpp data_reader.hpp - data_reader_ascii.hpp data_reader_cifar10.hpp data_reader_csv.hpp data_reader_image.hpp diff --git a/include/lbann/data_readers/data_reader_ascii.hpp b/include/lbann/data_readers/data_reader_ascii.hpp deleted file mode 100644 index 09504b49397..00000000000 --- a/include/lbann/data_readers/data_reader_ascii.hpp +++ /dev/null @@ -1,73 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -// data_reader_ascii .hpp .cpp - generic_data_reader class for ASCII text files -//////////////////////////////////////////////////////////////////////////////// - -#ifndef LBANN_DATA_READER_ASCII_HPP -#define LBANN_DATA_READER_ASCII_HPP - -#include "data_reader.hpp" - -namespace lbann { - -class ascii_reader : public generic_data_reader { - public: - ascii_reader(int sequence_length = 1, bool shuffle = true); - ascii_reader(const ascii_reader&) = default; - ascii_reader& operator=(const ascii_reader&) = default; - ~ascii_reader() override = default; - ascii_reader* copy() const override { return new ascii_reader(*this); } - - std::string get_type() const override { - return "ascii_reader"; - } - - void load() override; - - int get_linearized_data_size() const override { - return 128 * m_sequence_length; - } - int get_linearized_label_size() const override { - return 128 * m_sequence_length; - } - const std::vector get_data_dims() const override { - return {128 * m_sequence_length}; - } - - protected: - bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; - bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; - - /** Length of text sequence. */ - int m_sequence_length; - /** Size of data file in bytes. */ - int m_file_size; - -}; - -} // namespace lbann - -#endif // LBANN_DATA_READER_ASCII_HPP diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 66c10065fb2..096bf958275 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -123,7 +123,6 @@ #include "lbann/data_readers/data_reader_csv.hpp" #include "lbann/data_readers/data_reader_merge_samples.hpp" #include "lbann/data_readers/data_reader_merge_features.hpp" -#include "lbann/data_readers/data_reader_ascii.hpp" #include "lbann/data_readers/data_reader_pilot2_molecular.hpp" #include "lbann/data_readers/data_reader_mesh.hpp" #include "lbann/data_readers/data_reader_moving_mnist.hpp" diff --git a/model_zoo/data_readers/data_reader_ascii.prototext b/model_zoo/data_readers/data_reader_ascii.prototext deleted file mode 100644 index f7a956b99d0..00000000000 --- a/model_zoo/data_readers/data_reader_ascii.prototext +++ /dev/null @@ -1,19 +0,0 @@ -data_reader { - reader { - name: "ascii" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/tinyshakespeare/" - data_filename: "input.txt" - validation_percent: 0.1 - percent_of_data_to_use: 1.0 - } - reader { - name: "ascii" - role: "test" - shuffle: false - data_filedir: "/p/lscratchh/brainusr/datasets/tinyshakespeare/" - data_filename: "input.txt" - percent_of_data_to_use: 1.0 - } -} diff --git a/src/data_readers/CMakeLists.txt b/src/data_readers/CMakeLists.txt index 64d7b339477..5c7522882f7 100644 --- a/src/data_readers/CMakeLists.txt +++ b/src/data_readers/CMakeLists.txt @@ -1,7 +1,6 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES data_reader.cpp - data_reader_ascii.cpp data_reader_cifar10.cpp data_reader_csv.cpp data_reader_image.cpp @@ -22,7 +21,7 @@ set_full_path(THIS_DIR_SOURCES data_reader_multihead_siamese.cpp data_reader_python.cpp offline_patches_npz.cpp - numpy_conduit_converter.cpp + numpy_conduit_converter.cpp data_reader_numpy_npz_conduit.cpp ) diff --git a/src/data_readers/data_reader_ascii.cpp b/src/data_readers/data_reader_ascii.cpp deleted file mode 100644 index 8be6521b512..00000000000 --- a/src/data_readers/data_reader_ascii.cpp +++ /dev/null @@ -1,117 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, 
Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. -// -// lbann_data_reader_ascii .hpp .cpp - generic_data_reader class for ASCII text files -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/data_reader_ascii.hpp" -#include -namespace lbann { - -ascii_reader::ascii_reader(int sequence_length, bool shuffle) - : generic_data_reader(shuffle), m_sequence_length(sequence_length) {} - -bool ascii_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) { - - // Get text sequence from file - const int pos = data_id - m_sequence_length; - const int num_chars = (std::min(pos + m_sequence_length, m_file_size) - - std::max(pos, 0)); - std::vector sequence(m_sequence_length, 0); - if (num_chars > 0) { - std::ifstream fs(get_file_dir() + get_data_filename(), - std::fstream::in); - fs.seekg(std::max(pos, 0)); - fs.read(&sequence[std::max(-pos, 0)], num_chars); - fs.close(); - } - - // Convert text sequence to binary vector - for (int i = 0; i < m_sequence_length; ++i) { - auto current_char = (int) sequence[i]; - if (current_char < 0 || current_char >= 128) { - current_char = 0; - } - X(128 * i + current_char, mb_idx) = DataType(1); - } - - return true; -} - -bool ascii_reader::fetch_label(CPUMat& Y, int data_id, int mb_idx) { - - // Get text sequence from file - const int pos = data_id - m_sequence_length + 1; - const int num_chars = (std::min(pos + m_sequence_length, m_file_size) - - std::max(pos, 0)); - std::vector sequence(m_sequence_length, 0); - if (num_chars > 0) { - std::ifstream fs(get_file_dir() + get_data_filename(), - std::fstream::in); - fs.seekg(std::max(pos, 0)); - fs.read(&sequence[std::max(-pos, 0)], num_chars); - fs.close(); - } - - // Convert text sequence to binary vector - for (int i = 0; i < m_sequence_length; ++i) { - auto current_char = (int) sequence[i]; - if (current_char < 0 || current_char >= 128) { - current_char = 0; - } - Y(128 * i + current_char, mb_idx) = DataType(1); - } - - return true; -} - -//=================================================== - -void ascii_reader::load() { - - // Make sure directory path ends with a slash - if (m_file_dir.back() != '/') { - m_file_dir.push_back('/'); - } - - // Get length of data file - std::ifstream fs(get_file_dir() + get_data_filename(), - std::fstream::in | std::fstream::ate); - m_file_size = fs.tellg(); - fs.close(); - - // Reset indices - m_shuffled_indices.resize(m_file_size + m_sequence_length); - std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); - if (is_master()) { - std::cerr << "calling select_subset_of_data; 
m_shuffled_indices.size: " << - m_shuffled_indices.size() << std::endl; - } - resize_shuffled_indices(); - select_subset_of_data(); - -} - -} // namespace lbann From 00d0779531f48649ffae3ff3d9c6cfc540bb09d6 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Wed, 28 Aug 2019 12:31:09 -0700 Subject: [PATCH 271/634] Fix XL shadowing warning in beta.hpp XL does not like that we had a variable and a function with the same name. This placates it. --- include/lbann/utils/beta.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/lbann/utils/beta.hpp b/include/lbann/utils/beta.hpp index f8f73636ba7..e944437f042 100644 --- a/include/lbann/utils/beta.hpp +++ b/include/lbann/utils/beta.hpp @@ -56,9 +56,9 @@ class beta_distribution { public: using distribution_type = beta_distribution; - explicit param_type(RealType a, RealType b) : - m_a(a), m_b(b) { - if (a <= RealType(0) || b <= RealType(0)) { + explicit param_type(RealType param_a, RealType param_b) : + m_a(param_a), m_b(param_b) { + if (param_a <= RealType(0) || param_b <= RealType(0)) { LBANN_ERROR("Beta distribution parameters must be positive"); } } From e147774c3a9d16a74b6f6774c2c1ce8d10cc2efe Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Thu, 29 Aug 2019 16:22:23 -0700 Subject: [PATCH 272/634] Move model visualization out of Python frontend Visualization functionality has been consolidated in scripts/viz.py. graphviz is no longer a dependency for the Python frontend. --- cmake/configure_files/setup.py.in | 3 +- python/lbann/viz.py | 87 ------------------------------- scripts/viz.py | 80 ++++++++++++++++++++++++---- 3 files changed, 70 insertions(+), 100 deletions(-) delete mode 100644 python/lbann/viz.py diff --git a/cmake/configure_files/setup.py.in b/cmake/configure_files/setup.py.in index 3f4bc1a1597..c56a7df3a4b 100644 --- a/cmake/configure_files/setup.py.in +++ b/cmake/configure_files/setup.py.in @@ -24,8 +24,7 @@ setuptools.setup( packages=setuptools.find_packages(src_dir), package_dir={'': src_dir}, data_files=[('lbann', [config_file])], - install_requires=['graphviz>=0.10.1', - 'matplotlib>=2.0.2', + install_requires=['matplotlib>=2.0.2', 'numpy>=1.16.0', 'onnx>=1.3.0', 'pandas>=0.24.1', diff --git a/python/lbann/viz.py b/python/lbann/viz.py deleted file mode 100644 index e47cab2bbe2..00000000000 --- a/python/lbann/viz.py +++ /dev/null @@ -1,87 +0,0 @@ -"""Visualize LBANN models.""" -from re import sub -from enum import Enum -from graphviz import Digraph -from lbann import lbann_pb2, Model - -def visualize_layer_graph(model, filename, - file_format='pdf', - label_format='name only', - graphviz_engine='dot'): - """Visualize a model's layer graph and save to file. - - Args: - model (`lbann_pb2.Model` or `lbann.model.Model`): Neural - network model. - filename (`str`): Output file. - file_format (`str`): Output file format. - label_format (`str`): Displayed layer information (options: - type-only, name-only, type-and-name, full). - graphviz_engine (`str`): Graphviz visualization scheme. - - """ - - # Get protobuf message - if isinstance(model, lbann_pb2.Model): - proto = model - elif isinstance(model, Model): - proto = model.export_proto() - else: - raise TypeError('expected `model` to be an ' - '`lbann_pb2.Model` or a `lbann.model.Model`') - - # Strip extension from filename - if filename.endswith('.' 
+ file_format):
-        filename = filename[:-len(file_format)-1]
-
-    # Convert label format to lowercase with no spaces
-    label_format = sub(r' |-|_', '', label_format.lower())
-
-    # Construct graphviz graph
-    graph = Digraph(filename=filename, format=file_format, engine=graphviz_engine)
-    graph.attr('node', shape='rect')
-
-    # Construct nodes in layer graph
-    layer_types = (set(lbann_pb2.Layer.DESCRIPTOR.fields_by_name.keys())
-                   - set(['name', 'parents', 'children',
-                          'data_layout', 'device_allocation', 'weights',
-                          'num_neurons_from_data_reader', 'freeze',
-                          'hint_layer', 'weights_data',
-                          'top', 'bottom', 'type', 'motif_layer']))
-    for l in proto.layer:
-
-        # Determine layer type
-        type = ''
-        for _type in layer_types:
-            if l.HasField(_type):
-                type = getattr(l,_type).DESCRIPTOR.name
-                break
-
-        # Construct node label
-        label = ''
-        if label_format == 'nameonly':
-            label = l.name
-        elif label_format == 'typeonly':
-            label = type
-        elif label_format == 'typeandname':
-            label = '<{0}<br/>{1}>'.format(type, l.name)
-        elif label_format == 'full':
-            label = '<'
-            for (index, line) in enumerate(str(l).strip().split('\n')):
-                if index > 0:
-                    label += '<br/>'
-                label += line
-            label += '>'
-
-        # Add layer as layer graph node
-        graph.node(l.name, label=label)
-
-    # Add parent/child relationships as layer graph edges
-    edges = set()
-    for l in proto.layer:
-        edges.update([(p, l.name) for p in l.parents.split()])
-        edges.update([(l.name, c) for c in l.children.split()])
-    graph.edges(edges)
-
-    # Save to file
-    graph.render(filename=filename, cleanup=True, format=file_format)
diff --git a/scripts/viz.py b/scripts/viz.py
index 95bc0899f9d..ed59a5ec54f 100755
--- a/scripts/viz.py
+++ b/scripts/viz.py
@@ -1,12 +1,15 @@
 #!/usr/bin/env python3
+"""Visualize an LBANN model's layer graph and save to file."""
+
 import argparse
-import google.protobuf.text_format as txtf
-from lbann.proto import lbann_pb2
-import lbann.viz
+import re
+import graphviz
+import google.protobuf.text_format
+from lbann import lbann_pb2, layers_pb2

 # Parse command-line arguments
 parser = argparse.ArgumentParser(
-    description='Visualize layer graph for LBANN model.')
+    description='Visualize an LBANN model\'s layer graph and save to file.')
 parser.add_argument(
     'input', action='store', type=str,
     help='model prototext file')
@@ -27,13 +30,68 @@
     help='Graphviz visualization scheme (default: dot)', metavar='ENGINE')
 args = parser.parse_args()

-# Parse prototext file
+# Strip extension from filename
+filename = args.output
+file_format = args.file_format
+if filename.endswith('.' + file_format):
+    filename = filename[:-len(file_format)-1]
+
+# Convert label format to lowercase with no spaces
+label_format = re.sub(r' |-|_', '', args.label_format.lower())
+
+# Read prototext file
 proto = lbann_pb2.LbannPB()
 with open(args.input, 'r') as f:
-    txtf.Merge(f.read(), proto)
+    google.protobuf.text_format.Merge(f.read(), proto)
+model = proto.model
+
+# Construct graphviz graph
+graph = graphviz.Digraph(filename=filename,
+                         format=file_format,
+                         engine=args.graphviz_engine)
+graph.attr('node', shape='rect')
+
+# Construct nodes in layer graph
+layer_types = (set(layers_pb2.Layer.DESCRIPTOR.fields_by_name.keys())
+               - set(['name', 'parents', 'children',
+                      'data_layout', 'device_allocation', 'weights',
+                      'num_neurons_from_data_reader', 'freeze',
+                      'hint_layer', 'weights_data',
+                      'top', 'bottom', 'type', 'motif_layer']))
+for l in model.layer:
+
+    # Determine layer type
+    type = ''
+    for _type in layer_types:
+        if l.HasField(_type):
+            type = getattr(l,_type).DESCRIPTOR.name
+            break
+
+    # Construct node label
+    label = ''
+    if label_format == 'nameonly':
+        label = l.name
+    elif label_format == 'typeonly':
+        label = type
+    elif label_format == 'typeandname':
+        label = '<{0}<br/>{1}>'.format(type, l.name)
+    elif label_format == 'full':
+        label = '<'
+        for (index, line) in enumerate(str(l).strip().split('\n')):
+            if index > 0:
+                label += '<br/>'
+            label += line
+        label += '>'
+
+    # Add layer as layer graph node
+    graph.node(l.name, label=label)
+
+# Add parent/child relationships as layer graph edges
+edges = set()
+for l in model.layer:
+    edges.update([(p, l.name) for p in l.parents.split()])
+    edges.update([(l.name, c) for c in l.children.split()])
+graph.edges(edges)
-# Visualize
-lbann.viz.visualize_layer_graph(proto.model, args.output,
-                                file_format=args.file_format,
-                                label_format=args.label_format,
-                                graphviz_engine=args.graphviz_engine)
+# Save to file
+graph.render(filename=filename, cleanup=True, format=file_format)

From f9ffc24f6f0856b1331837d26d599f0187769c20 Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Fri, 30 Aug 2019 11:00:39 -0700
Subject: [PATCH 273/634] Added randomization for lines in the output sample
 list files (generated by select_samples). select_samples now outputs both
 inclusion and exclusion bar files. Modified sanity.py to check that the
 inclusion bar file is correct.

---
 model_zoo/jag_utils/python/sanity.py   | 42 ++++++++++--
 model_zoo/jag_utils/select_samples.cpp | 92 +++++++++++++++++++++-----
 2 files changed, 114 insertions(+), 20 deletions(-)

diff --git a/model_zoo/jag_utils/python/sanity.py b/model_zoo/jag_utils/python/sanity.py
index a8287471302..01f5c7e9636 100644
--- a/model_zoo/jag_utils/python/sanity.py
+++ b/model_zoo/jag_utils/python/sanity.py
@@ -42,9 +42,11 @@ def buildInc(mp, fn) :
     t = line.split()
     for j in t[3:] :
       r.add(j)
+  print ' num sample IDs:', len(r)
   return r

 #======================================================================
+#returns (excluded, included) sample IDs from an input EXCLUSION sample list
 def buildExc(mp, fn) :
   s = set()
   print 'buildExc; opening:', fn
@@ -56,11 +58,16 @@ def buildExc(mp, fn) :
     t = line.split()
     for j in t[3:] :
       s.add(j)
+  #at this point, 's' contains all excluded sample IDs (these are the IDs
+  #that are explicitly listed in the exclusion bar file);
+  #mp is the set of all sample IDs, whether included or excluded
+  #(i.e., unsuccessful)
   r = set()
   for sample_id in mp :
     if sample_id not in s :
       r.add(sample_id)
-  return r
+  print ' num sample IDs:', len(r)
+  return (s, r)

 #======================================================================
 #build set that contains all sample names
@@ -76,10 +83,27 @@ def buildExc(mp, fn) :
 sample_list_base_name = sys.argv[3]

 #build exclusion set; this set contains all valid (successful) sample IDs
-s2 = buildExc(mp, sample_list_dir + '/t__' + sample_list_base_name + '_bar')
+(excluded, included) = buildExc(mp, sample_list_dir + '/t_exclusion_' + sample_list_base_name + '_bar')
+print '\nlen(included):', len(included), 'len(excluded):', len(excluded), 'intersection:', len(included.intersection(excluded))

 data = []
-data.append(s2)
+data.append(included)
+
+#build bar inclusion set
+(included2, excluded2) = buildExc(mp, sample_list_dir + '/t_inclusion_' + sample_list_base_name + '_bar')
+#(excluded2, included2) = buildExc(mp, sample_list_dir + '/t_inclusion_' + sample_list_base_name + '_bar')
+print '\nlen(included):', len(included2), 'len(excluded):', len(excluded2), 'intersection:', len(included2.intersection(excluded2))
+
+print
+print 'checking that the bar files do not intersect'
+r = len(excluded.intersection(included2))
+if r != 0 :
+  print 'FAILED!'
+ print 'len(intersection):', r + exit(0) +#print 'bar inclusion file contains', len(bar), 'sample IDs' +#data.append(bar) + for j in range(int(sys.argv[4])) : s2 = buildInc(mp, sample_list_dir + '/t' + str(j) + '_' + sample_list_base_name) @@ -94,16 +118,24 @@ def buildExc(mp, fn) : if j != k : a = data[j] b = data[k] - #print 'testing', j, 'against', k + print 'testing', j, 'against', k, 'len:', len(a), len(b) r = len(a.intersection(b)) if r != 0 : print 'FAILED: ', j, 'intersection with',k, '=' , r + tt = 0 + for x in a : + if x in b : + print x, + tt += 1 + print + print 'total:', tt + exit(9) success = False if success : print ' SUCCESS!' print -print 'testing that all samples appear in one sample list, or the bar file' +print 'testing that all samples appear in one sample list, or the exclusion bar file' s2 = set() for j in range(0, len(data)) : diff --git a/model_zoo/jag_utils/select_samples.cpp b/model_zoo/jag_utils/select_samples.cpp index fb0f93e4a7f..cd855539fd4 100644 --- a/model_zoo/jag_utils/select_samples.cpp +++ b/model_zoo/jag_utils/select_samples.cpp @@ -61,8 +61,9 @@ void write_sample_list( const unordered_map> &sample_mapping_v, const std::unordered_map &filename_data); -void write_exclusion_file( +void write_bar_files( const unordered_map> index_map_exclude, + const unordered_map> &sample_mapping, const unordered_map> &sample_mapping_v, const unordered_map &filename_data); //============================================================================ @@ -117,7 +118,7 @@ int main(int argc, char **argv) { vector>> subsets(num_lists); divide_selected_samples(index_map_keep, subsets); - write_exclusion_file(index_map_exclude, sample_mapping_v, filename_data); + write_bar_files(index_map_exclude, sample_mapping, sample_mapping_v, filename_data); // write the sample lists for (int n=0; n s6; // get total samples for the current file std::unordered_map::const_iterator t4 = filename_data.find(filename); @@ -436,7 +436,8 @@ void write_sample_list( ++num_include_files; total_good += included; total_bad += excluded; - sout << filename << " " << included << " " << excluded; + s6.resize(s6.size()+1); + s6.back() << filename << " " << included << " " << excluded; for (auto &t3 : include_me) { if (sample_mapping_v.find(fn2) == sample_mapping_v.end()) { LBANN_ERROR("failed to find the key: ", fn2, " in sample_mapping_v map"); @@ -448,9 +449,25 @@ void write_sample_list( if (static_cast(t3) >= t5->second.size()) { LBANN_ERROR("t3 >= t5->second.size()"); } - sout << " " << t5->second[t3]; + s6.back() << " " << t5->second[t3]; + } + + //compute values for randomizing + //(this was previously done with a python script) + size_t n2 = s6.size(); + unordered_set used_indices; + vector indices; + while (used_indices.size() < n2) { + int v = random() % n2; + if (used_indices.find(v) == used_indices.end()) { + used_indices.insert(v); + indices.push_back(v); + } + } + + for (size_t y=0; y> index_map_exclude, + const unordered_map> &sample_mapping, const unordered_map> &sample_mapping_v, const unordered_map &filename_data ) { + + unordered_set all_excluded; + const string dir = options::get()->get_string("output_dir"); const string base_fn = options::get()->get_string("output_base_fn"); stringstream s; - s << dir << '/' << "t_" << '_' << base_fn << "_bar"; - std::cerr << "\nWRITING output bar file: " << s.str() << "\n"; + s << dir << '/' << "t_exclusion_" << base_fn << "_bar"; + std::cerr << "\nWRITING exclusion bar file: " << s.str() << "\n"; std::ofstream out(s.str().c_str()); if (!out) { 
LBANN_ERROR("failed to open ", s.str(), " for writing\n"); @@ -578,13 +599,54 @@ void write_exclusion_file( LBANN_ERROR("t5 == sample_mapping_v.end())"); } sout << " " << t5->second[t3]; + all_excluded.insert(t5->second[t3]); } sout << "\n"; + } + } + + const string base_dir = options::get()->get_string("base_dir"); + out << total_good << " " << total_bad << " " << num_include_files << "\n" + << base_dir << endl << sout.str(); + out.close(); + + s.clear(); + s.str(""); + s << dir << '/' << "t_inclusion_" << base_fn << "_bar"; + std::cerr << "\nWRITING inclusion bar file: " << s.str() << "\n"; + std::ofstream out2(s.str().c_str()); + if (!out2) { + LBANN_ERROR("failed to open ", s.str(), " for writing\n"); + } + out2 << "CONDUIT_HDF5_INCLUSION\n"; + + num_include_files = 0; + unordered_map> data_for_inclusion; + for (auto &&t : sample_mapping) { + for (auto &t2 : t.second) { + if (all_excluded.find(t2) == all_excluded.end()) { + data_for_inclusion[t.first].insert(t2); } } + } - const string base_dir = options::get()->get_string("base_dir"); - out << total_good << " " << total_bad << " " << num_include_files << "\n" - << base_dir << "\n" << sout.str(); - out.close(); + cout << "all_excluded.size: " << all_excluded.size() << endl; + + out2 << total_good << " " << total_bad << " " << data_for_inclusion.size() << "\n" << base_dir << endl; + + for (auto &&t : data_for_inclusion) { + int included = t.second.size(); + unordered_map>::const_iterator it = sample_mapping.find(t.first); + if (it == sample_mapping.end()) { + LBANN_ERROR("it == sample_mapping.end()"); + } + int total = it->second.size(); + int excluded = total - included; + out2 << t.first << " " << included << " " << excluded << " "; + for (auto &t2 : t.second) { + out2 << t2 << " "; + } + out2 << endl; + } + out2 << endl; } From 32b6524ea71c410e7a2d198e6a68402fd19b81f8 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 30 Aug 2019 14:51:06 -0700 Subject: [PATCH 274/634] Add non-CUB implementation of channel-wise scale/bias layer CUB is an optional dependency again. 
--- src/layers/learning/channelwise_scale_bias.cu | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/layers/learning/channelwise_scale_bias.cu b/src/layers/learning/channelwise_scale_bias.cu index 99df5f3e245..0843fe990ac 100644 --- a/src/layers/learning/channelwise_scale_bias.cu +++ b/src/layers/learning/channelwise_scale_bias.cu @@ -25,7 +25,9 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/layers/learning/channelwise_scale_bias.hpp" +#ifdef HYDROGEN_HAVE_CUB #include "cub/block/block_reduce.cuh" +#endif // HYDROGEN_HAVE_CUB namespace lbann { @@ -122,7 +124,7 @@ __global__ void bp_kernel(size_t num_channels, } // Accumulate gradient contributions for block and add to result - // Note: Perform block reduction with CUB +#ifdef HYDROGEN_HAVE_CUB constexpr auto reduce_algo = cub::BLOCK_REDUCE_WARP_REDUCTIONS; using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage workspace; @@ -136,6 +138,29 @@ __global__ void bp_kernel(size_t num_channels, if (tid == 0) { cuda::atomic_add(&gradient_wrt_bias[channel], db); } +#else + __shared__ DataType workspace[bsizex*bsizey]; + workspace[tid] = private_da; + for (size_t stride = bsizex*bsizey/2; stride > 0; stride /= 2) { + __syncthreads(); + if (tid < stride) { + workspace[tid] += workspace[tid + stride]; + } + } + if (tid == 0) { + cuda::atomic_add(&gradient_wrt_scale[channel], workspace[0]); + } + workspace[tid] = private_db; + for (size_t stride = bsizex*bsizey/2; stride > 0; stride /= 2) { + __syncthreads(); + if (tid < stride) { + workspace[tid] += workspace[tid + stride]; + } + } + if (tid == 0) { + cuda::atomic_add(&gradient_wrt_bias[channel], workspace[0]); + } +#endif // HYDROGEN_HAVE_CUB } From b1208cdccd9a8e9c131fbc0223ee2fbef6e8caa7 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Tue, 11 Jun 2019 14:10:05 -0700 Subject: [PATCH 275/634] Improve DenseNet accuracy --- .../densenet/generated_densenet.prototext | 663 +++++------------- model_zoo/vision/densenet.py | 80 ++- 2 files changed, 234 insertions(+), 509 deletions(-) diff --git a/model_zoo/models/densenet/generated_densenet.prototext b/model_zoo/models/densenet/generated_densenet.prototext index 07e4423f5c9..2342c1bf3e9 100644 --- a/model_zoo/models/densenet/generated_densenet.prototext +++ b/model_zoo/models/densenet/generated_densenet.prototext @@ -5,66 +5,63 @@ data_reader { shuffle: true data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/" data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels/train.txt" - validation_percent: 0.01 percent_of_data_to_use: 1.0 - image_preprocessor { - cropper { - crop_randomly: true - crop_width: 224 - crop_height: 224 - resized_width: 256 - resized_height: 256 - } - augmenter { - horizontal_flip: true + num_labels: 1000 + transforms { + random_resized_crop { + height: 224 + width: 224 } - colorizer { + } + transforms { + horizontal_flip { + p: 0.5 } - subtractor { - disable: true - image_to_sub: "mean-256x256x3-6.bin" + } + transforms { + colorize { } - normalizer { - z_score: true + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" } } - num_labels: 1000 } reader { name: "imagenet" - role: "test" + role: "validate" shuffle: true data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/" data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels/val.txt" percent_of_data_to_use: 1.0 - image_preprocessor { - 
cropper { - crop_width: 224 + num_labels: 1000 + transforms { + resized_center_crop { + height: 256 + width: 256 crop_height: 224 - resized_width: 256 - resized_height: 256 - } - augmenter { - disable: true - } - colorizer { + crop_width: 224 } - subtractor { - disable: true - image_to_sub: "mean-256x256x3-6.bin" + } + transforms { + colorize { } - normalizer { - z_score: true + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" } } - num_labels: 1000 } } model { objective_function { layer_term { scale_factor: 1.0 - layer: "layer434" + layer: "layer435" } l2_weight_regularization { scale_factor: 0.0001 @@ -72,14 +69,14 @@ model { } metric { layer_metric { - layer: "layer435" + layer: "layer436" name: "top-1 accuracy" unit: "%" } } metric { layer_metric { - layer: "layer436" + layer: "layer437" name: "top-5 accuracy" unit: "%" } @@ -90,15 +87,13 @@ model { layer { name: "layer1" children: "layer2 layer3" - data_layout: "data_parallel" input { } } layer { name: "layer3" parents: "layer1" - children: "layer434 layer435 layer436" - data_layout: "data_parallel" + children: "layer435 layer436 layer437" identity { } } @@ -106,7 +101,6 @@ model { name: "layer2" parents: "layer1" children: "layer4" - data_layout: "data_parallel" identity { } } @@ -114,7 +108,6 @@ model { name: "layer4" parents: "layer2" children: "layer5" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 64 @@ -127,18 +120,17 @@ model { name: "layer5" parents: "layer4" children: "layer6" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer6" parents: "layer5" children: "layer7" - data_layout: "data_parallel" relu { } } @@ -146,7 +138,6 @@ model { name: "layer7" parents: "layer6" children: "layer8 layer15 layer22 layer29 layer36 layer43 layer50" - data_layout: "data_parallel" pooling { num_dims: 2 pool_dims_i: 3 @@ -159,7 +150,6 @@ model { name: "layer8" parents: "layer7" children: "layer9" - data_layout: "data_parallel" concatenation { } } @@ -167,18 +157,17 @@ model { name: "layer9" parents: "layer8" children: "layer10" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer10" parents: "layer9" children: "layer11" - data_layout: "data_parallel" relu { } } @@ -186,7 +175,6 @@ model { name: "layer11" parents: "layer10" children: "layer12" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -198,18 +186,17 @@ model { name: "layer12" parents: "layer11" children: "layer13" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer13" parents: "layer12" children: "layer14" - data_layout: "data_parallel" relu { } } @@ -217,7 +204,6 @@ model { name: "layer14" parents: "layer13" children: "layer15 layer22 layer29 layer36 layer43 layer50" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -230,7 +216,6 @@ model { name: "layer15" parents: "layer7 layer14" children: "layer16" - data_layout: "data_parallel" concatenation { } } @@ -238,18 +223,17 @@ model { name: "layer16" parents: "layer15" children: "layer17" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer17" parents: "layer16" children: "layer18" - data_layout: "data_parallel" 
relu { } } @@ -257,7 +241,6 @@ model { name: "layer18" parents: "layer17" children: "layer19" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -269,18 +252,17 @@ model { name: "layer19" parents: "layer18" children: "layer20" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer20" parents: "layer19" children: "layer21" - data_layout: "data_parallel" relu { } } @@ -288,7 +270,6 @@ model { name: "layer21" parents: "layer20" children: "layer22 layer29 layer36 layer43 layer50" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -301,7 +282,6 @@ model { name: "layer22" parents: "layer7 layer14 layer21" children: "layer23" - data_layout: "data_parallel" concatenation { } } @@ -309,18 +289,17 @@ model { name: "layer23" parents: "layer22" children: "layer24" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer24" parents: "layer23" children: "layer25" - data_layout: "data_parallel" relu { } } @@ -328,7 +307,6 @@ model { name: "layer25" parents: "layer24" children: "layer26" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -340,18 +318,17 @@ model { name: "layer26" parents: "layer25" children: "layer27" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer27" parents: "layer26" children: "layer28" - data_layout: "data_parallel" relu { } } @@ -359,7 +336,6 @@ model { name: "layer28" parents: "layer27" children: "layer29 layer36 layer43 layer50" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -372,7 +348,6 @@ model { name: "layer29" parents: "layer7 layer14 layer21 layer28" children: "layer30" - data_layout: "data_parallel" concatenation { } } @@ -380,18 +355,17 @@ model { name: "layer30" parents: "layer29" children: "layer31" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer31" parents: "layer30" children: "layer32" - data_layout: "data_parallel" relu { } } @@ -399,7 +373,6 @@ model { name: "layer32" parents: "layer31" children: "layer33" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -411,18 +384,17 @@ model { name: "layer33" parents: "layer32" children: "layer34" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer34" parents: "layer33" children: "layer35" - data_layout: "data_parallel" relu { } } @@ -430,7 +402,6 @@ model { name: "layer35" parents: "layer34" children: "layer36 layer43 layer50" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -443,7 +414,6 @@ model { name: "layer36" parents: "layer7 layer14 layer21 layer28 layer35" children: "layer37" - data_layout: "data_parallel" concatenation { } } @@ -451,18 +421,17 @@ model { name: "layer37" parents: "layer36" children: "layer38" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer38" parents: "layer37" children: "layer39" - data_layout: "data_parallel" relu { } } @@ -470,7 +439,6 @@ model { name: "layer39" parents: "layer38" children: "layer40" - data_layout: "data_parallel" 
convolution { num_dims: 2 num_output_channels: 128 @@ -482,18 +450,17 @@ model { name: "layer40" parents: "layer39" children: "layer41" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer41" parents: "layer40" children: "layer42" - data_layout: "data_parallel" relu { } } @@ -501,7 +468,6 @@ model { name: "layer42" parents: "layer41" children: "layer43 layer50" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -514,7 +480,6 @@ model { name: "layer43" parents: "layer7 layer14 layer21 layer28 layer35 layer42" children: "layer44" - data_layout: "data_parallel" concatenation { } } @@ -522,18 +487,17 @@ model { name: "layer44" parents: "layer43" children: "layer45" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer45" parents: "layer44" children: "layer46" - data_layout: "data_parallel" relu { } } @@ -541,7 +505,6 @@ model { name: "layer46" parents: "layer45" children: "layer47" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -553,18 +516,17 @@ model { name: "layer47" parents: "layer46" children: "layer48" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer48" parents: "layer47" children: "layer49" - data_layout: "data_parallel" relu { } } @@ -572,7 +534,6 @@ model { name: "layer49" parents: "layer48" children: "layer50" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -585,7 +546,6 @@ model { name: "layer50" parents: "layer7 layer14 layer21 layer28 layer35 layer42 layer49" children: "layer51" - data_layout: "data_parallel" concatenation { } } @@ -593,18 +553,17 @@ model { name: "layer51" parents: "layer50" children: "layer52" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer52" parents: "layer51" children: "layer53" - data_layout: "data_parallel" relu { } } @@ -612,7 +571,6 @@ model { name: "layer53" parents: "layer52" children: "layer54" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -624,7 +582,6 @@ model { name: "layer54" parents: "layer53" children: "layer55 layer62 layer69 layer76 layer83 layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" - data_layout: "data_parallel" pooling { num_dims: 2 pool_dims_i: 2 @@ -636,7 +593,6 @@ model { name: "layer55" parents: "layer54" children: "layer56" - data_layout: "data_parallel" concatenation { } } @@ -644,18 +600,17 @@ model { name: "layer56" parents: "layer55" children: "layer57" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer57" parents: "layer56" children: "layer58" - data_layout: "data_parallel" relu { } } @@ -663,7 +618,6 @@ model { name: "layer58" parents: "layer57" children: "layer59" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -675,18 +629,17 @@ model { name: "layer59" parents: "layer58" children: "layer60" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer60" parents: "layer59" children: "layer61" - data_layout: "data_parallel" relu { } } @@ -694,7 +647,6 @@ model { name: 
"layer61" parents: "layer60" children: "layer62 layer69 layer76 layer83 layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -707,7 +659,6 @@ model { name: "layer62" parents: "layer54 layer61" children: "layer63" - data_layout: "data_parallel" concatenation { } } @@ -715,18 +666,17 @@ model { name: "layer63" parents: "layer62" children: "layer64" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer64" parents: "layer63" children: "layer65" - data_layout: "data_parallel" relu { } } @@ -734,7 +684,6 @@ model { name: "layer65" parents: "layer64" children: "layer66" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -746,18 +695,17 @@ model { name: "layer66" parents: "layer65" children: "layer67" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer67" parents: "layer66" children: "layer68" - data_layout: "data_parallel" relu { } } @@ -765,7 +713,6 @@ model { name: "layer68" parents: "layer67" children: "layer69 layer76 layer83 layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -778,7 +725,6 @@ model { name: "layer69" parents: "layer54 layer61 layer68" children: "layer70" - data_layout: "data_parallel" concatenation { } } @@ -786,18 +732,17 @@ model { name: "layer70" parents: "layer69" children: "layer71" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer71" parents: "layer70" children: "layer72" - data_layout: "data_parallel" relu { } } @@ -805,7 +750,6 @@ model { name: "layer72" parents: "layer71" children: "layer73" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -817,18 +761,17 @@ model { name: "layer73" parents: "layer72" children: "layer74" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer74" parents: "layer73" children: "layer75" - data_layout: "data_parallel" relu { } } @@ -836,7 +779,6 @@ model { name: "layer75" parents: "layer74" children: "layer76 layer83 layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -849,7 +791,6 @@ model { name: "layer76" parents: "layer54 layer61 layer68 layer75" children: "layer77" - data_layout: "data_parallel" concatenation { } } @@ -857,18 +798,17 @@ model { name: "layer77" parents: "layer76" children: "layer78" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer78" parents: "layer77" children: "layer79" - data_layout: "data_parallel" relu { } } @@ -876,7 +816,6 @@ model { name: "layer79" parents: "layer78" children: "layer80" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -888,18 +827,17 @@ model { name: "layer80" parents: "layer79" children: "layer81" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer81" parents: "layer80" children: "layer82" - data_layout: "data_parallel" relu { } 
} @@ -907,7 +845,6 @@ model { name: "layer82" parents: "layer81" children: "layer83 layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -920,7 +857,6 @@ model { name: "layer83" parents: "layer54 layer61 layer68 layer75 layer82" children: "layer84" - data_layout: "data_parallel" concatenation { } } @@ -928,18 +864,17 @@ model { name: "layer84" parents: "layer83" children: "layer85" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer85" parents: "layer84" children: "layer86" - data_layout: "data_parallel" relu { } } @@ -947,7 +882,6 @@ model { name: "layer86" parents: "layer85" children: "layer87" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -959,18 +893,17 @@ model { name: "layer87" parents: "layer86" children: "layer88" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer88" parents: "layer87" children: "layer89" - data_layout: "data_parallel" relu { } } @@ -978,7 +911,6 @@ model { name: "layer89" parents: "layer88" children: "layer90 layer97 layer104 layer111 layer118 layer125 layer132 layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -991,7 +923,6 @@ model { name: "layer90" parents: "layer54 layer61 layer68 layer75 layer82 layer89" children: "layer91" - data_layout: "data_parallel" concatenation { } } @@ -999,18 +930,17 @@ model { name: "layer91" parents: "layer90" children: "layer92" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer92" parents: "layer91" children: "layer93" - data_layout: "data_parallel" relu { } } @@ -1018,7 +948,6 @@ model { name: "layer93" parents: "layer92" children: "layer94" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1030,18 +959,17 @@ model { name: "layer94" parents: "layer93" children: "layer95" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer95" parents: "layer94" children: "layer96" - data_layout: "data_parallel" relu { } } @@ -1049,7 +977,6 @@ model { name: "layer96" parents: "layer95" children: "layer97 layer104 layer111 layer118 layer125 layer132 layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1062,7 +989,6 @@ model { name: "layer97" parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96" children: "layer98" - data_layout: "data_parallel" concatenation { } } @@ -1070,18 +996,17 @@ model { name: "layer98" parents: "layer97" children: "layer99" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer99" parents: "layer98" children: "layer100" - data_layout: "data_parallel" relu { } } @@ -1089,7 +1014,6 @@ model { name: "layer100" parents: "layer99" children: "layer101" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1101,18 +1025,17 @@ model { name: "layer101" parents: "layer100" children: "layer102" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer102" parents: "layer101" 
children: "layer103" - data_layout: "data_parallel" relu { } } @@ -1120,7 +1043,6 @@ model { name: "layer103" parents: "layer102" children: "layer104 layer111 layer118 layer125 layer132 layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1133,7 +1055,6 @@ model { name: "layer104" parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96 layer103" children: "layer105" - data_layout: "data_parallel" concatenation { } } @@ -1141,18 +1062,17 @@ model { name: "layer105" parents: "layer104" children: "layer106" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer106" parents: "layer105" children: "layer107" - data_layout: "data_parallel" relu { } } @@ -1160,7 +1080,6 @@ model { name: "layer107" parents: "layer106" children: "layer108" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1172,18 +1091,17 @@ model { name: "layer108" parents: "layer107" children: "layer109" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer109" parents: "layer108" children: "layer110" - data_layout: "data_parallel" relu { } } @@ -1191,7 +1109,6 @@ model { name: "layer110" parents: "layer109" children: "layer111 layer118 layer125 layer132 layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1204,7 +1121,6 @@ model { name: "layer111" parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96 layer103 layer110" children: "layer112" - data_layout: "data_parallel" concatenation { } } @@ -1212,18 +1128,17 @@ model { name: "layer112" parents: "layer111" children: "layer113" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer113" parents: "layer112" children: "layer114" - data_layout: "data_parallel" relu { } } @@ -1231,7 +1146,6 @@ model { name: "layer114" parents: "layer113" children: "layer115" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1243,18 +1157,17 @@ model { name: "layer115" parents: "layer114" children: "layer116" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer116" parents: "layer115" children: "layer117" - data_layout: "data_parallel" relu { } } @@ -1262,7 +1175,6 @@ model { name: "layer117" parents: "layer116" children: "layer118 layer125 layer132 layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1275,7 +1187,6 @@ model { name: "layer118" parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96 layer103 layer110 layer117" children: "layer119" - data_layout: "data_parallel" concatenation { } } @@ -1283,18 +1194,17 @@ model { name: "layer119" parents: "layer118" children: "layer120" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer120" parents: "layer119" children: "layer121" - data_layout: "data_parallel" relu { } } @@ -1302,7 +1212,6 @@ model { name: "layer121" parents: "layer120" children: "layer122" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1314,18 +1223,17 @@ model { name: "layer122" parents: "layer121" children: "layer123" - data_layout: "data_parallel" 
batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer123" parents: "layer122" children: "layer124" - data_layout: "data_parallel" relu { } } @@ -1333,7 +1241,6 @@ model { name: "layer124" parents: "layer123" children: "layer125 layer132 layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1346,7 +1253,6 @@ model { name: "layer125" parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96 layer103 layer110 layer117 layer124" children: "layer126" - data_layout: "data_parallel" concatenation { } } @@ -1354,18 +1260,17 @@ model { name: "layer126" parents: "layer125" children: "layer127" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer127" parents: "layer126" children: "layer128" - data_layout: "data_parallel" relu { } } @@ -1373,7 +1278,6 @@ model { name: "layer128" parents: "layer127" children: "layer129" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1385,18 +1289,17 @@ model { name: "layer129" parents: "layer128" children: "layer130" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer130" parents: "layer129" children: "layer131" - data_layout: "data_parallel" relu { } } @@ -1404,7 +1307,6 @@ model { name: "layer131" parents: "layer130" children: "layer132 layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1417,7 +1319,6 @@ model { name: "layer132" parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96 layer103 layer110 layer117 layer124 layer131" children: "layer133" - data_layout: "data_parallel" concatenation { } } @@ -1425,18 +1326,17 @@ model { name: "layer133" parents: "layer132" children: "layer134" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer134" parents: "layer133" children: "layer135" - data_layout: "data_parallel" relu { } } @@ -1444,7 +1344,6 @@ model { name: "layer135" parents: "layer134" children: "layer136" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1456,18 +1355,17 @@ model { name: "layer136" parents: "layer135" children: "layer137" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer137" parents: "layer136" children: "layer138" - data_layout: "data_parallel" relu { } } @@ -1475,7 +1373,6 @@ model { name: "layer138" parents: "layer137" children: "layer139" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1488,7 +1385,6 @@ model { name: "layer139" parents: "layer54 layer61 layer68 layer75 layer82 layer89 layer96 layer103 layer110 layer117 layer124 layer131 layer138" children: "layer140" - data_layout: "data_parallel" concatenation { } } @@ -1496,18 +1392,17 @@ model { name: "layer140" parents: "layer139" children: "layer141" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer141" parents: "layer140" children: "layer142" - data_layout: "data_parallel" relu { } } @@ -1515,7 +1410,6 @@ model { name: "layer142" parents: "layer141" children: "layer143" - data_layout: "data_parallel" convolution { num_dims: 2 
num_output_channels: 256 @@ -1527,7 +1421,6 @@ model { name: "layer143" parents: "layer142" children: "layer144 layer151 layer158 layer165 layer172 layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" pooling { num_dims: 2 pool_dims_i: 2 @@ -1539,7 +1432,6 @@ model { name: "layer144" parents: "layer143" children: "layer145" - data_layout: "data_parallel" concatenation { } } @@ -1547,18 +1439,17 @@ model { name: "layer145" parents: "layer144" children: "layer146" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer146" parents: "layer145" children: "layer147" - data_layout: "data_parallel" relu { } } @@ -1566,7 +1457,6 @@ model { name: "layer147" parents: "layer146" children: "layer148" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1578,18 +1468,17 @@ model { name: "layer148" parents: "layer147" children: "layer149" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer149" parents: "layer148" children: "layer150" - data_layout: "data_parallel" relu { } } @@ -1597,7 +1486,6 @@ model { name: "layer150" parents: "layer149" children: "layer151 layer158 layer165 layer172 layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1610,7 +1498,6 @@ model { name: "layer151" parents: "layer143 layer150" children: "layer152" - data_layout: "data_parallel" concatenation { } } @@ -1618,18 +1505,17 @@ model { name: "layer152" parents: "layer151" children: "layer153" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer153" parents: "layer152" children: "layer154" - data_layout: "data_parallel" relu { } } @@ -1637,7 +1523,6 @@ model { name: "layer154" parents: "layer153" children: "layer155" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1649,18 +1534,17 @@ model { name: "layer155" parents: "layer154" children: "layer156" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer156" parents: "layer155" children: "layer157" - data_layout: "data_parallel" relu { } } @@ -1668,7 +1552,6 @@ model { name: "layer157" parents: "layer156" children: "layer158 layer165 layer172 layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1681,7 +1564,6 @@ model { name: "layer158" parents: "layer143 layer150 layer157" children: "layer159" - data_layout: "data_parallel" concatenation { } } @@ -1689,18 +1571,17 @@ model { name: "layer159" parents: "layer158" children: "layer160" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer160" parents: "layer159" children: "layer161" - data_layout: "data_parallel" relu { } } @@ -1708,7 
+1589,6 @@ model { name: "layer161" parents: "layer160" children: "layer162" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1720,18 +1600,17 @@ model { name: "layer162" parents: "layer161" children: "layer163" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer163" parents: "layer162" children: "layer164" - data_layout: "data_parallel" relu { } } @@ -1739,7 +1618,6 @@ model { name: "layer164" parents: "layer163" children: "layer165 layer172 layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1752,7 +1630,6 @@ model { name: "layer165" parents: "layer143 layer150 layer157 layer164" children: "layer166" - data_layout: "data_parallel" concatenation { } } @@ -1760,18 +1637,17 @@ model { name: "layer166" parents: "layer165" children: "layer167" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer167" parents: "layer166" children: "layer168" - data_layout: "data_parallel" relu { } } @@ -1779,7 +1655,6 @@ model { name: "layer168" parents: "layer167" children: "layer169" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1791,18 +1666,17 @@ model { name: "layer169" parents: "layer168" children: "layer170" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer170" parents: "layer169" children: "layer171" - data_layout: "data_parallel" relu { } } @@ -1810,7 +1684,6 @@ model { name: "layer171" parents: "layer170" children: "layer172 layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1823,7 +1696,6 @@ model { name: "layer172" parents: "layer143 layer150 layer157 layer164 layer171" children: "layer173" - data_layout: "data_parallel" concatenation { } } @@ -1831,18 +1703,17 @@ model { name: "layer173" parents: "layer172" children: "layer174" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer174" parents: "layer173" children: "layer175" - data_layout: "data_parallel" relu { } } @@ -1850,7 +1721,6 @@ model { name: "layer175" parents: "layer174" children: "layer176" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1862,18 +1732,17 @@ model { name: "layer176" parents: "layer175" children: "layer177" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer177" parents: "layer176" children: "layer178" - data_layout: "data_parallel" relu { } } @@ -1881,7 +1750,6 @@ model { name: "layer178" parents: "layer177" children: "layer179 layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1894,7 +1762,6 @@ model { name: "layer179" 
parents: "layer143 layer150 layer157 layer164 layer171 layer178" children: "layer180" - data_layout: "data_parallel" concatenation { } } @@ -1902,18 +1769,17 @@ model { name: "layer180" parents: "layer179" children: "layer181" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer181" parents: "layer180" children: "layer182" - data_layout: "data_parallel" relu { } } @@ -1921,7 +1787,6 @@ model { name: "layer182" parents: "layer181" children: "layer183" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -1933,18 +1798,17 @@ model { name: "layer183" parents: "layer182" children: "layer184" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer184" parents: "layer183" children: "layer185" - data_layout: "data_parallel" relu { } } @@ -1952,7 +1816,6 @@ model { name: "layer185" parents: "layer184" children: "layer186 layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -1965,7 +1828,6 @@ model { name: "layer186" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185" children: "layer187" - data_layout: "data_parallel" concatenation { } } @@ -1973,18 +1835,17 @@ model { name: "layer187" parents: "layer186" children: "layer188" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer188" parents: "layer187" children: "layer189" - data_layout: "data_parallel" relu { } } @@ -1992,7 +1853,6 @@ model { name: "layer189" parents: "layer188" children: "layer190" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2004,18 +1864,17 @@ model { name: "layer190" parents: "layer189" children: "layer191" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer191" parents: "layer190" children: "layer192" - data_layout: "data_parallel" relu { } } @@ -2023,7 +1882,6 @@ model { name: "layer192" parents: "layer191" children: "layer193 layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2036,7 +1894,6 @@ model { name: "layer193" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192" children: "layer194" - data_layout: "data_parallel" concatenation { } } @@ -2044,18 +1901,17 @@ model { name: "layer194" parents: "layer193" children: "layer195" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer195" parents: "layer194" children: "layer196" - data_layout: "data_parallel" relu { } } @@ -2063,7 +1919,6 @@ model { name: "layer196" parents: "layer195" children: "layer197" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2075,18 +1930,17 @@ model { name: "layer197" parents: "layer196" children: "layer198" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: 
"layer198" parents: "layer197" children: "layer199" - data_layout: "data_parallel" relu { } } @@ -2094,7 +1948,6 @@ model { name: "layer199" parents: "layer198" children: "layer200 layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2107,7 +1960,6 @@ model { name: "layer200" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199" children: "layer201" - data_layout: "data_parallel" concatenation { } } @@ -2115,18 +1967,17 @@ model { name: "layer201" parents: "layer200" children: "layer202" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer202" parents: "layer201" children: "layer203" - data_layout: "data_parallel" relu { } } @@ -2134,7 +1985,6 @@ model { name: "layer203" parents: "layer202" children: "layer204" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2146,18 +1996,17 @@ model { name: "layer204" parents: "layer203" children: "layer205" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer205" parents: "layer204" children: "layer206" - data_layout: "data_parallel" relu { } } @@ -2165,7 +2014,6 @@ model { name: "layer206" parents: "layer205" children: "layer207 layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2178,7 +2026,6 @@ model { name: "layer207" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206" children: "layer208" - data_layout: "data_parallel" concatenation { } } @@ -2186,18 +2033,17 @@ model { name: "layer208" parents: "layer207" children: "layer209" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer209" parents: "layer208" children: "layer210" - data_layout: "data_parallel" relu { } } @@ -2205,7 +2051,6 @@ model { name: "layer210" parents: "layer209" children: "layer211" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2217,18 +2062,17 @@ model { name: "layer211" parents: "layer210" children: "layer212" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer212" parents: "layer211" children: "layer213" - data_layout: "data_parallel" relu { } } @@ -2236,7 +2080,6 @@ model { name: "layer213" parents: "layer212" children: "layer214 layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2249,7 +2092,6 @@ model { name: "layer214" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213" children: "layer215" - data_layout: "data_parallel" concatenation { } } @@ -2257,18 +2099,17 @@ model { name: "layer215" parents: "layer214" children: "layer216" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer216" parents: 
"layer215" children: "layer217" - data_layout: "data_parallel" relu { } } @@ -2276,7 +2117,6 @@ model { name: "layer217" parents: "layer216" children: "layer218" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2288,18 +2128,17 @@ model { name: "layer218" parents: "layer217" children: "layer219" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer219" parents: "layer218" children: "layer220" - data_layout: "data_parallel" relu { } } @@ -2307,7 +2146,6 @@ model { name: "layer220" parents: "layer219" children: "layer221 layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2320,7 +2158,6 @@ model { name: "layer221" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220" children: "layer222" - data_layout: "data_parallel" concatenation { } } @@ -2328,18 +2165,17 @@ model { name: "layer222" parents: "layer221" children: "layer223" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer223" parents: "layer222" children: "layer224" - data_layout: "data_parallel" relu { } } @@ -2347,7 +2183,6 @@ model { name: "layer224" parents: "layer223" children: "layer225" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2359,18 +2194,17 @@ model { name: "layer225" parents: "layer224" children: "layer226" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer226" parents: "layer225" children: "layer227" - data_layout: "data_parallel" relu { } } @@ -2378,7 +2212,6 @@ model { name: "layer227" parents: "layer226" children: "layer228 layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2391,7 +2224,6 @@ model { name: "layer228" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227" children: "layer229" - data_layout: "data_parallel" concatenation { } } @@ -2399,18 +2231,17 @@ model { name: "layer229" parents: "layer228" children: "layer230" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer230" parents: "layer229" children: "layer231" - data_layout: "data_parallel" relu { } } @@ -2418,7 +2249,6 @@ model { name: "layer231" parents: "layer230" children: "layer232" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2430,18 +2260,17 @@ model { name: "layer232" parents: "layer231" children: "layer233" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer233" parents: "layer232" children: "layer234" - data_layout: "data_parallel" relu { } } @@ -2449,7 +2278,6 @@ model { name: "layer234" parents: "layer233" children: "layer235 layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2462,7 +2290,6 @@ model { 
name: "layer235" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234" children: "layer236" - data_layout: "data_parallel" concatenation { } } @@ -2470,18 +2297,17 @@ model { name: "layer236" parents: "layer235" children: "layer237" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer237" parents: "layer236" children: "layer238" - data_layout: "data_parallel" relu { } } @@ -2489,7 +2315,6 @@ model { name: "layer238" parents: "layer237" children: "layer239" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2501,18 +2326,17 @@ model { name: "layer239" parents: "layer238" children: "layer240" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer240" parents: "layer239" children: "layer241" - data_layout: "data_parallel" relu { } } @@ -2520,7 +2344,6 @@ model { name: "layer241" parents: "layer240" children: "layer242 layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2533,7 +2356,6 @@ model { name: "layer242" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241" children: "layer243" - data_layout: "data_parallel" concatenation { } } @@ -2541,18 +2363,17 @@ model { name: "layer243" parents: "layer242" children: "layer244" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer244" parents: "layer243" children: "layer245" - data_layout: "data_parallel" relu { } } @@ -2560,7 +2381,6 @@ model { name: "layer245" parents: "layer244" children: "layer246" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2572,18 +2392,17 @@ model { name: "layer246" parents: "layer245" children: "layer247" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer247" parents: "layer246" children: "layer248" - data_layout: "data_parallel" relu { } } @@ -2591,7 +2410,6 @@ model { name: "layer248" parents: "layer247" children: "layer249 layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2604,7 +2422,6 @@ model { name: "layer249" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248" children: "layer250" - data_layout: "data_parallel" concatenation { } } @@ -2612,18 +2429,17 @@ model { name: "layer250" parents: "layer249" children: "layer251" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer251" parents: "layer250" children: "layer252" - data_layout: "data_parallel" relu { } } @@ -2631,7 +2447,6 @@ model { name: "layer252" parents: "layer251" children: "layer253" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2643,18 +2458,17 @@ model { name: "layer253" parents: "layer252" children: "layer254" - data_layout: "data_parallel" batch_normalization { decay: 
0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer254" parents: "layer253" children: "layer255" - data_layout: "data_parallel" relu { } } @@ -2662,7 +2476,6 @@ model { name: "layer255" parents: "layer254" children: "layer256 layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2675,7 +2488,6 @@ model { name: "layer256" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255" children: "layer257" - data_layout: "data_parallel" concatenation { } } @@ -2683,18 +2495,17 @@ model { name: "layer257" parents: "layer256" children: "layer258" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer258" parents: "layer257" children: "layer259" - data_layout: "data_parallel" relu { } } @@ -2702,7 +2513,6 @@ model { name: "layer259" parents: "layer258" children: "layer260" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2714,18 +2524,17 @@ model { name: "layer260" parents: "layer259" children: "layer261" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer261" parents: "layer260" children: "layer262" - data_layout: "data_parallel" relu { } } @@ -2733,7 +2542,6 @@ model { name: "layer262" parents: "layer261" children: "layer263 layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2746,7 +2554,6 @@ model { name: "layer263" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262" children: "layer264" - data_layout: "data_parallel" concatenation { } } @@ -2754,18 +2561,17 @@ model { name: "layer264" parents: "layer263" children: "layer265" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer265" parents: "layer264" children: "layer266" - data_layout: "data_parallel" relu { } } @@ -2773,7 +2579,6 @@ model { name: "layer266" parents: "layer265" children: "layer267" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2785,18 +2590,17 @@ model { name: "layer267" parents: "layer266" children: "layer268" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer268" parents: "layer267" children: "layer269" - data_layout: "data_parallel" relu { } } @@ -2804,7 +2608,6 @@ model { name: "layer269" parents: "layer268" children: "layer270 layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2817,7 +2620,6 @@ model { name: "layer270" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269" children: "layer271" - data_layout: "data_parallel" concatenation { } } @@ -2825,18 +2627,17 @@ model { name: "layer271" parents: "layer270" children: "layer272" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 
epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer272" parents: "layer271" children: "layer273" - data_layout: "data_parallel" relu { } } @@ -2844,7 +2645,6 @@ model { name: "layer273" parents: "layer272" children: "layer274" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2856,18 +2656,17 @@ model { name: "layer274" parents: "layer273" children: "layer275" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer275" parents: "layer274" children: "layer276" - data_layout: "data_parallel" relu { } } @@ -2875,7 +2674,6 @@ model { name: "layer276" parents: "layer275" children: "layer277 layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2888,7 +2686,6 @@ model { name: "layer277" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276" children: "layer278" - data_layout: "data_parallel" concatenation { } } @@ -2896,18 +2693,17 @@ model { name: "layer278" parents: "layer277" children: "layer279" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer279" parents: "layer278" children: "layer280" - data_layout: "data_parallel" relu { } } @@ -2915,7 +2711,6 @@ model { name: "layer280" parents: "layer279" children: "layer281" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2927,18 +2722,17 @@ model { name: "layer281" parents: "layer280" children: "layer282" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer282" parents: "layer281" children: "layer283" - data_layout: "data_parallel" relu { } } @@ -2946,7 +2740,6 @@ model { name: "layer283" parents: "layer282" children: "layer284 layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -2959,7 +2752,6 @@ model { name: "layer284" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276 layer283" children: "layer285" - data_layout: "data_parallel" concatenation { } } @@ -2967,18 +2759,17 @@ model { name: "layer285" parents: "layer284" children: "layer286" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer286" parents: "layer285" children: "layer287" - data_layout: "data_parallel" relu { } } @@ -2986,7 +2777,6 @@ model { name: "layer287" parents: "layer286" children: "layer288" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -2998,18 +2788,17 @@ model { name: "layer288" parents: "layer287" children: "layer289" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer289" parents: "layer288" children: "layer290" - data_layout: "data_parallel" relu { } } @@ -3017,7 +2806,6 @@ model { name: "layer290" parents: "layer289" children: "layer291 layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3030,7 +2818,6 @@ 
model { name: "layer291" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276 layer283 layer290" children: "layer292" - data_layout: "data_parallel" concatenation { } } @@ -3038,18 +2825,17 @@ model { name: "layer292" parents: "layer291" children: "layer293" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer293" parents: "layer292" children: "layer294" - data_layout: "data_parallel" relu { } } @@ -3057,7 +2843,6 @@ model { name: "layer294" parents: "layer293" children: "layer295" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3069,18 +2854,17 @@ model { name: "layer295" parents: "layer294" children: "layer296" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer296" parents: "layer295" children: "layer297" - data_layout: "data_parallel" relu { } } @@ -3088,7 +2872,6 @@ model { name: "layer297" parents: "layer296" children: "layer298 layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3101,7 +2884,6 @@ model { name: "layer298" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276 layer283 layer290 layer297" children: "layer299" - data_layout: "data_parallel" concatenation { } } @@ -3109,18 +2891,17 @@ model { name: "layer299" parents: "layer298" children: "layer300" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer300" parents: "layer299" children: "layer301" - data_layout: "data_parallel" relu { } } @@ -3128,7 +2909,6 @@ model { name: "layer301" parents: "layer300" children: "layer302" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3140,18 +2920,17 @@ model { name: "layer302" parents: "layer301" children: "layer303" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer303" parents: "layer302" children: "layer304" - data_layout: "data_parallel" relu { } } @@ -3159,7 +2938,6 @@ model { name: "layer304" parents: "layer303" children: "layer305 layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3172,7 +2950,6 @@ model { name: "layer305" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276 layer283 layer290 layer297 layer304" children: "layer306" - data_layout: "data_parallel" concatenation { } } @@ -3180,18 +2957,17 @@ model { name: "layer306" parents: "layer305" children: "layer307" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer307" parents: "layer306" children: "layer308" - data_layout: "data_parallel" relu { } } @@ -3199,7 +2975,6 @@ model { name: "layer308" parents: "layer307" children: "layer309" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3211,18 +2986,17 @@ model { name: "layer309" parents: "layer308" 
children: "layer310" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer310" parents: "layer309" children: "layer311" - data_layout: "data_parallel" relu { } } @@ -3230,7 +3004,6 @@ model { name: "layer311" parents: "layer310" children: "layer312" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3243,7 +3016,6 @@ model { name: "layer312" parents: "layer143 layer150 layer157 layer164 layer171 layer178 layer185 layer192 layer199 layer206 layer213 layer220 layer227 layer234 layer241 layer248 layer255 layer262 layer269 layer276 layer283 layer290 layer297 layer304 layer311" children: "layer313" - data_layout: "data_parallel" concatenation { } } @@ -3251,18 +3023,17 @@ model { name: "layer313" parents: "layer312" children: "layer314" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer314" parents: "layer313" children: "layer315" - data_layout: "data_parallel" relu { } } @@ -3270,7 +3041,6 @@ model { name: "layer315" parents: "layer314" children: "layer316" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 512 @@ -3282,7 +3052,6 @@ model { name: "layer316" parents: "layer315" children: "layer317 layer324 layer331 layer338 layer345 layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" pooling { num_dims: 2 pool_dims_i: 2 @@ -3294,7 +3063,6 @@ model { name: "layer317" parents: "layer316" children: "layer318" - data_layout: "data_parallel" concatenation { } } @@ -3302,18 +3070,17 @@ model { name: "layer318" parents: "layer317" children: "layer319" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer319" parents: "layer318" children: "layer320" - data_layout: "data_parallel" relu { } } @@ -3321,7 +3088,6 @@ model { name: "layer320" parents: "layer319" children: "layer321" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3333,18 +3099,17 @@ model { name: "layer321" parents: "layer320" children: "layer322" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer322" parents: "layer321" children: "layer323" - data_layout: "data_parallel" relu { } } @@ -3352,7 +3117,6 @@ model { name: "layer323" parents: "layer322" children: "layer324 layer331 layer338 layer345 layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3365,7 +3129,6 @@ model { name: "layer324" parents: "layer316 layer323" children: "layer325" - data_layout: "data_parallel" concatenation { } } @@ -3373,18 +3136,17 @@ model { name: "layer325" parents: "layer324" children: "layer326" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer326" parents: "layer325" children: "layer327" - data_layout: "data_parallel" relu { } } @@ -3392,7 +3154,6 @@ model { name: "layer327" parents: "layer326" children: "layer328" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3404,18 +3165,17 @@ model { name: "layer328" parents: "layer327" children: 
"layer329" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer329" parents: "layer328" children: "layer330" - data_layout: "data_parallel" relu { } } @@ -3423,7 +3183,6 @@ model { name: "layer330" parents: "layer329" children: "layer331 layer338 layer345 layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3436,7 +3195,6 @@ model { name: "layer331" parents: "layer316 layer323 layer330" children: "layer332" - data_layout: "data_parallel" concatenation { } } @@ -3444,18 +3202,17 @@ model { name: "layer332" parents: "layer331" children: "layer333" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer333" parents: "layer332" children: "layer334" - data_layout: "data_parallel" relu { } } @@ -3463,7 +3220,6 @@ model { name: "layer334" parents: "layer333" children: "layer335" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3475,18 +3231,17 @@ model { name: "layer335" parents: "layer334" children: "layer336" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer336" parents: "layer335" children: "layer337" - data_layout: "data_parallel" relu { } } @@ -3494,7 +3249,6 @@ model { name: "layer337" parents: "layer336" children: "layer338 layer345 layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3507,7 +3261,6 @@ model { name: "layer338" parents: "layer316 layer323 layer330 layer337" children: "layer339" - data_layout: "data_parallel" concatenation { } } @@ -3515,18 +3268,17 @@ model { name: "layer339" parents: "layer338" children: "layer340" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer340" parents: "layer339" children: "layer341" - data_layout: "data_parallel" relu { } } @@ -3534,7 +3286,6 @@ model { name: "layer341" parents: "layer340" children: "layer342" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3546,18 +3297,17 @@ model { name: "layer342" parents: "layer341" children: "layer343" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer343" parents: "layer342" children: "layer344" - data_layout: "data_parallel" relu { } } @@ -3565,7 +3315,6 @@ model { name: "layer344" parents: "layer343" children: "layer345 layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3578,7 +3327,6 @@ model { name: "layer345" parents: "layer316 layer323 layer330 layer337 layer344" children: "layer346" - data_layout: "data_parallel" concatenation { } } @@ -3586,18 +3334,17 @@ model { name: "layer346" parents: "layer345" children: "layer347" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer347" parents: "layer346" children: "layer348" - data_layout: "data_parallel" relu 
{ } } @@ -3605,7 +3352,6 @@ model { name: "layer348" parents: "layer347" children: "layer349" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3617,18 +3363,17 @@ model { name: "layer349" parents: "layer348" children: "layer350" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer350" parents: "layer349" children: "layer351" - data_layout: "data_parallel" relu { } } @@ -3636,7 +3381,6 @@ model { name: "layer351" parents: "layer350" children: "layer352 layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3649,7 +3393,6 @@ model { name: "layer352" parents: "layer316 layer323 layer330 layer337 layer344 layer351" children: "layer353" - data_layout: "data_parallel" concatenation { } } @@ -3657,18 +3400,17 @@ model { name: "layer353" parents: "layer352" children: "layer354" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer354" parents: "layer353" children: "layer355" - data_layout: "data_parallel" relu { } } @@ -3676,7 +3418,6 @@ model { name: "layer355" parents: "layer354" children: "layer356" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3688,18 +3429,17 @@ model { name: "layer356" parents: "layer355" children: "layer357" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer357" parents: "layer356" children: "layer358" - data_layout: "data_parallel" relu { } } @@ -3707,7 +3447,6 @@ model { name: "layer358" parents: "layer357" children: "layer359 layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3720,7 +3459,6 @@ model { name: "layer359" parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358" children: "layer360" - data_layout: "data_parallel" concatenation { } } @@ -3728,18 +3466,17 @@ model { name: "layer360" parents: "layer359" children: "layer361" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer361" parents: "layer360" children: "layer362" - data_layout: "data_parallel" relu { } } @@ -3747,7 +3484,6 @@ model { name: "layer362" parents: "layer361" children: "layer363" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3759,18 +3495,17 @@ model { name: "layer363" parents: "layer362" children: "layer364" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer364" parents: "layer363" children: "layer365" - data_layout: "data_parallel" relu { } } @@ -3778,7 +3513,6 @@ model { name: "layer365" parents: "layer364" children: "layer366 layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3791,7 +3525,6 @@ model { name: "layer366" parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365" children: "layer367" - data_layout: "data_parallel" concatenation { } } @@ -3799,18 +3532,17 @@ model { name: "layer367" parents: 
"layer366" children: "layer368" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer368" parents: "layer367" children: "layer369" - data_layout: "data_parallel" relu { } } @@ -3818,7 +3550,6 @@ model { name: "layer369" parents: "layer368" children: "layer370" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3830,18 +3561,17 @@ model { name: "layer370" parents: "layer369" children: "layer371" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer371" parents: "layer370" children: "layer372" - data_layout: "data_parallel" relu { } } @@ -3849,7 +3579,6 @@ model { name: "layer372" parents: "layer371" children: "layer373 layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3862,7 +3591,6 @@ model { name: "layer373" parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372" children: "layer374" - data_layout: "data_parallel" concatenation { } } @@ -3870,18 +3598,17 @@ model { name: "layer374" parents: "layer373" children: "layer375" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer375" parents: "layer374" children: "layer376" - data_layout: "data_parallel" relu { } } @@ -3889,7 +3616,6 @@ model { name: "layer376" parents: "layer375" children: "layer377" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3901,18 +3627,17 @@ model { name: "layer377" parents: "layer376" children: "layer378" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer378" parents: "layer377" children: "layer379" - data_layout: "data_parallel" relu { } } @@ -3920,7 +3645,6 @@ model { name: "layer379" parents: "layer378" children: "layer380 layer387 layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -3933,7 +3657,6 @@ model { name: "layer380" parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379" children: "layer381" - data_layout: "data_parallel" concatenation { } } @@ -3941,18 +3664,17 @@ model { name: "layer381" parents: "layer380" children: "layer382" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer382" parents: "layer381" children: "layer383" - data_layout: "data_parallel" relu { } } @@ -3960,7 +3682,6 @@ model { name: "layer383" parents: "layer382" children: "layer384" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -3972,18 +3693,17 @@ model { name: "layer384" parents: "layer383" children: "layer385" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer385" parents: "layer384" children: "layer386" - data_layout: "data_parallel" relu { } } @@ -3991,7 +3711,6 @@ model { name: "layer386" parents: "layer385" children: "layer387 layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -4004,7 +3723,6 @@ model { 
name: "layer387" parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386" children: "layer388" - data_layout: "data_parallel" concatenation { } } @@ -4012,18 +3730,17 @@ model { name: "layer388" parents: "layer387" children: "layer389" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer389" parents: "layer388" children: "layer390" - data_layout: "data_parallel" relu { } } @@ -4031,7 +3748,6 @@ model { name: "layer390" parents: "layer389" children: "layer391" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -4043,18 +3759,17 @@ model { name: "layer391" parents: "layer390" children: "layer392" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer392" parents: "layer391" children: "layer393" - data_layout: "data_parallel" relu { } } @@ -4062,7 +3777,6 @@ model { name: "layer393" parents: "layer392" children: "layer394 layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -4075,7 +3789,6 @@ model { name: "layer394" parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393" children: "layer395" - data_layout: "data_parallel" concatenation { } } @@ -4083,18 +3796,17 @@ model { name: "layer395" parents: "layer394" children: "layer396" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer396" parents: "layer395" children: "layer397" - data_layout: "data_parallel" relu { } } @@ -4102,7 +3814,6 @@ model { name: "layer397" parents: "layer396" children: "layer398" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -4114,18 +3825,17 @@ model { name: "layer398" parents: "layer397" children: "layer399" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer399" parents: "layer398" children: "layer400" - data_layout: "data_parallel" relu { } } @@ -4133,7 +3843,6 @@ model { name: "layer400" parents: "layer399" children: "layer401 layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -4146,7 +3855,6 @@ model { name: "layer401" parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393 layer400" children: "layer402" - data_layout: "data_parallel" concatenation { } } @@ -4154,18 +3862,17 @@ model { name: "layer402" parents: "layer401" children: "layer403" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer403" parents: "layer402" children: "layer404" - data_layout: "data_parallel" relu { } } @@ -4173,7 +3880,6 @@ model { name: "layer404" parents: "layer403" children: "layer405" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -4185,18 +3891,17 @@ model { name: "layer405" parents: "layer404" children: "layer406" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer406" parents: "layer405" children: "layer407" - data_layout: "data_parallel" relu { } 
} @@ -4204,7 +3909,6 @@ model { name: "layer407" parents: "layer406" children: "layer408 layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -4217,7 +3921,6 @@ model { name: "layer408" parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393 layer400 layer407" children: "layer409" - data_layout: "data_parallel" concatenation { } } @@ -4225,18 +3928,17 @@ model { name: "layer409" parents: "layer408" children: "layer410" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer410" parents: "layer409" children: "layer411" - data_layout: "data_parallel" relu { } } @@ -4244,7 +3946,6 @@ model { name: "layer411" parents: "layer410" children: "layer412" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -4256,18 +3957,17 @@ model { name: "layer412" parents: "layer411" children: "layer413" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer413" parents: "layer412" children: "layer414" - data_layout: "data_parallel" relu { } } @@ -4275,7 +3975,6 @@ model { name: "layer414" parents: "layer413" children: "layer415 layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -4288,7 +3987,6 @@ model { name: "layer415" parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393 layer400 layer407 layer414" children: "layer416" - data_layout: "data_parallel" concatenation { } } @@ -4296,18 +3994,17 @@ model { name: "layer416" parents: "layer415" children: "layer417" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer417" parents: "layer416" children: "layer418" - data_layout: "data_parallel" relu { } } @@ -4315,7 +4012,6 @@ model { name: "layer418" parents: "layer417" children: "layer419" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -4327,18 +4023,17 @@ model { name: "layer419" parents: "layer418" children: "layer420" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer420" parents: "layer419" children: "layer421" - data_layout: "data_parallel" relu { } } @@ -4346,7 +4041,6 @@ model { name: "layer421" parents: "layer420" children: "layer422 layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -4359,7 +4053,6 @@ model { name: "layer422" parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393 layer400 layer407 layer414 layer421" children: "layer423" - data_layout: "data_parallel" concatenation { } } @@ -4367,18 +4060,17 @@ model { name: "layer423" parents: "layer422" children: "layer424" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer424" parents: "layer423" children: "layer425" - data_layout: "data_parallel" relu { } } @@ -4386,7 +4078,6 @@ model { name: "layer425" parents: "layer424" children: "layer426" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 128 @@ -4398,18 +4089,17 @@ model { name: "layer426" parents: 
"layer425" children: "layer427" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer427" parents: "layer426" children: "layer428" - data_layout: "data_parallel" relu { } } @@ -4417,7 +4107,6 @@ model { name: "layer428" parents: "layer427" children: "layer429" - data_layout: "data_parallel" convolution { num_dims: 2 num_output_channels: 32 @@ -4430,7 +4119,6 @@ model { name: "layer429" parents: "layer316 layer323 layer330 layer337 layer344 layer351 layer358 layer365 layer372 layer379 layer386 layer393 layer400 layer407 layer414 layer421 layer428" children: "layer430" - data_layout: "data_parallel" concatenation { } } @@ -4438,18 +4126,24 @@ model { name: "layer430" parents: "layer429" children: "layer431" - data_layout: "data_parallel" batch_normalization { decay: 0.9 scale_init: 1.0 epsilon: 1e-05 + statistics_group_size: 2 } } layer { name: "layer431" parents: "layer430" children: "layer432" - data_layout: "data_parallel" + relu { + } + } + layer { + name: "layer432" + parents: "layer431" + children: "layer433" pooling { num_dims: 2 pool_dims_i: 7 @@ -4459,41 +4153,36 @@ model { } } layer { - name: "layer432" - parents: "layer431" - children: "layer433" - data_layout: "data_parallel" + name: "layer433" + parents: "layer432" + children: "layer434" fully_connected { num_neurons: 1000 } } layer { - name: "layer433" - parents: "layer432" - children: "layer434 layer435 layer436" - data_layout: "data_parallel" + name: "layer434" + parents: "layer433" + children: "layer435 layer436 layer437" softmax { } } layer { - name: "layer436" - parents: "layer433 layer3" - data_layout: "data_parallel" + name: "layer437" + parents: "layer434 layer3" top_k_categorical_accuracy { k: 5 } } layer { - name: "layer435" - parents: "layer433 layer3" - data_layout: "data_parallel" + name: "layer436" + parents: "layer434 layer3" categorical_accuracy { } } layer { - name: "layer434" - parents: "layer433 layer3" - data_layout: "data_parallel" + name: "layer435" + parents: "layer434 layer3" cross_entropy { } } diff --git a/model_zoo/vision/densenet.py b/model_zoo/vision/densenet.py index 08487045d63..52064c9baa8 100755 --- a/model_zoo/vision/densenet.py +++ b/model_zoo/vision/densenet.py @@ -38,10 +38,15 @@ def log(string): # To avoid needing to stay logged into ssh, create a script # densenet_batch_job.cmd such as: # #!/bin/bash -# #SBATCH --nodes 16 +# #SBATCH --nodes 8 # #SBATCH --partition pbatch -# #SBATCH --time 240 -# ./densenet.py --nodes 16 --procs-per-node 2 --mini-batch-size 256 --num-epochs 10 > /usr/workspace/wsb//lbann/model_zoo/vision/output.txt +# #SBATCH --time 840 +# +# module load gcc/7.1.0 +# ../../scripts/build_lbann_lc.sh --compiler gnu --reconfigure +# +# module load python/3.6.4 +# ./densenet.py --nodes 8 --procs-per-node 2 --mini-batch-size 256 --num-epochs 10 &> /usr/workspace/wsb//lbann/model_zoo/vision/output.txt # and from lbann/model_zoo/vision run: # sbatch densenet_batch_job.cmd @@ -52,7 +57,7 @@ def log(string): # Copy the output file, experiment directory, and visualization # from LC to your computer by running the following commands from your computer: # scp @pascal.llnl.gov:/usr/workspace/wsb//lbann/model_zoo/vision/output.txt . -# scp -r @pascal.llnl.gov:/usr/workspace/wsb//lbann/experiments/_lbann_densenet/ . +# scp -r @pascal.llnl.gov:/usr/workspace/wsb//lbann/model_zoo/vision/_lbann_densenet/ . # scp @pascal.llnl.gov:/usr/workspace/wsb//lbann/graph.pdf . 
@@ -61,7 +66,8 @@ def log(string):
 # See PyTorch DenseNet:
 # https://github.com/pytorch/vision/blob/master/torchvision/models/densenet.py
 # See "Densely Connected Convolutional Networks" by Huang et. al p.4
-def densenet(version,
+def densenet(statistics_group_size,
+             version,
              cumulative_layer_num,
              images_node
              ):
@@ -78,12 +84,14 @@ def densenet(version,
     batch_norm_size = 4
     parent_node, cumulative_layer_num = initial_layer(
+        statistics_group_size,
         cumulative_layer_num,
         images_node,
         num_initial_features)
     num_features = num_initial_features
     # Start counting dense blocks at 1.
     for current_block_num, num_layers in enumerate(layers_per_block, 1):
         parent_nodes, cumulative_layer_num = dense_block(
+            statistics_group_size,
             cumulative_layer_num,
             parent_node,
             batch_norm_size=batch_norm_size,
@@ -101,6 +109,7 @@ def densenet(version,
             b=current_block_num, n=cumulative_layer_num))
         if current_block_num != len(layers_per_block):
             parent_node, cumulative_layer_num = transition_layer(
+                statistics_group_size,
                 current_block_num,
                 cumulative_layer_num,
                 parent_node,
@@ -109,19 +118,26 @@ def densenet(version,
             )
             num_features //= 2

-    batch_normalization_node = standard_batchnorm(parent_node)
+    batch_normalization_node = standard_batchnorm(statistics_group_size,
+                                                  parent_node)
     cumulative_layer_num += 1
     log('densenet BatchNormalization. cumulative_layer_num={n}'.format(
         b=current_block_num, n=cumulative_layer_num))

+    relu_node = lbann.Relu(batch_normalization_node)
+    cumulative_layer_num += 1
+    log('densenet Relu. cumulative_layer_num={n}'.format(
+        b=current_block_num, n=cumulative_layer_num))
+
     probs = classification_layer(
         cumulative_layer_num,
-        batch_normalization_node
+        relu_node
     )
     return probs


-def initial_layer(cumulative_layer_num,
+def initial_layer(statistics_group_size,
+                  cumulative_layer_num,
                   images_node,
                   num_initial_channels
                   ):
@@ -139,7 +155,8 @@ def initial_layer(cumulative_layer_num,
     log('initial_layer Convolution. cumulative_layer_num={n}'.format(
         n=cumulative_layer_num))

-    batch_normalization_node = standard_batchnorm(convolution_node)
+    batch_normalization_node = standard_batchnorm(statistics_group_size,
+                                                  convolution_node)
     cumulative_layer_num += 1
     log('initial_layer BatchNormalization. cumulative_layer_num={n}'.format(
         n=cumulative_layer_num))
@@ -165,17 +182,19 @@ def initial_layer(cumulative_layer_num,
     return pooling_node, cumulative_layer_num


-def standard_batchnorm(parent_node):
+def standard_batchnorm(statistics_group_size, parent_node):
     return lbann.BatchNormalization(
         parent_node,
         bias_init=0.0,
         decay=0.9,
         epsilon=1e-5,
-        scale_init=1.0
+        scale_init=1.0,
+        statistics_group_size=statistics_group_size
     )


-def dense_block(cumulative_layer_num,
+def dense_block(statistics_group_size,
+                cumulative_layer_num,
                 parent_node,
                 batch_norm_size,
                 current_block_num,
@@ -190,6 +209,7 @@ def dense_block(cumulative_layer_num,
         num_input_channels = num_initial_channels + (current_layer_num - 1) * growth_rate
         print('num_input_channels={c}'.format(c=num_input_channels))
         parent_node, cumulative_layer_num = dense_layer(
+            statistics_group_size,
             current_block_num,
             current_layer_num,
             cumulative_layer_num,
@@ -201,7 +221,8 @@ def dense_block(cumulative_layer_num,
     return parent_nodes, cumulative_layer_num


-def dense_layer(current_block_num,
+def dense_layer(statistics_group_size,
+                current_block_num,
                 current_layer_num,
                 cumulative_layer_num,
                 parent_nodes,
@@ -213,6 +234,7 @@ def dense_layer(current_block_num,
     log('dense_block={b} dense_layer={l} Concatenation. cumulative_layer_num={n}'.format(
         b=current_block_num, l=current_layer_num, n=cumulative_layer_num))
     conv_block_1_node, cumulative_layer_num = conv_block(
+        statistics_group_size,
         current_block_num,
         current_layer_num,
         cumulative_layer_num,
@@ -222,6 +244,7 @@ def dense_layer(current_block_num,
         num_output_channels=batch_norm_size * growth_rate
     )
     conv_block_2_node, cumulative_layer_num = conv_block(
+        statistics_group_size,
         current_block_num,
         current_layer_num,
         cumulative_layer_num,
@@ -233,7 +256,8 @@ def dense_layer(current_block_num,
     return conv_block_2_node, cumulative_layer_num


-def conv_block(current_block_num,
+def conv_block(statistics_group_size,
+               current_block_num,
               current_layer_num,
               cumulative_layer_num,
               parent_node,
@@ -241,7 +265,8 @@ def conv_block(current_block_num,
               conv_pads_i,
               num_output_channels
               ):
-    batch_normalization_node = standard_batchnorm(parent_node)
+    batch_normalization_node = standard_batchnorm(statistics_group_size,
+                                                  parent_node)
     cumulative_layer_num += 1
     log('dense_block={b} dense_layer={l} BatchNormalization. cumulative_layer_num={n}'.format(
         b=current_block_num, l=current_layer_num, n=cumulative_layer_num))
@@ -268,12 +293,14 @@ def conv_block(current_block_num,
     return convolution_node, cumulative_layer_num


-def transition_layer(current_block_num,
+def transition_layer(statistics_group_size,
+                     current_block_num,
                      cumulative_layer_num,
                      parent_node,
                      num_output_channels
                      ):
-    batch_normalization_node = standard_batchnorm(parent_node)
+    batch_normalization_node = standard_batchnorm(statistics_group_size,
+                                                  parent_node)
     cumulative_layer_num += 1
     log('dense_block={b} > transition_layer BatchNormalization. cumulative_layer_num={n}'.format(
         b=current_block_num, n=cumulative_layer_num))
@@ -394,6 +421,7 @@ def get_args():


 def construct_layer_graph(
+        statistics_group_size,
         version,
         cumulative_layer_num,
         input_node):
@@ -408,7 +436,8 @@ def construct_layer_graph(
     log('Identity. cumulative_layer_num={n}'.format(n=cumulative_layer_num))

     # Use images_node, not image_labels_node.
-    probabilities = densenet(version, cumulative_layer_num, images_node)
+    probabilities = densenet(statistics_group_size, version,
+                             cumulative_layer_num, images_node)

     return probabilities, image_labels_node

@@ -420,11 +449,12 @@ def set_up_experiment(args,
     # Set up objective function
     cross_entropy = lbann.CrossEntropy([probs, labels])
     layers = list(lbann.traverse_layer_graph(input_))
-    weights = set()
+    l2_reg_weights = set()
     for l in layers:
-        weights.update(l.weights)
+        if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected:
+            l2_reg_weights.update(l.weights)
     # scale = weight decay
-    l2_reg = lbann.L2WeightRegularization(weights=weights, scale=1e-4)
+    l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4)
     objective_function = lbann.ObjectiveFunction([cross_entropy, l2_reg])

     # Set up model
@@ -439,7 +469,6 @@ def set_up_experiment(args,
     model = lbann.Model(args.mini_batch_size,
                         args.num_epochs,
                         layers=layers,
-                        weights=weights,
                         objective_function=objective_function,
                         metrics=metrics,
                         callbacks=callbacks)
@@ -515,6 +544,12 @@ def main():
     # ----------------------------------
     args = get_args()

+    # Match this with number of GPUs per node
+    # On Lassen, this will be 4.
+    # On Pascal, this will be 2.
+    # If there are no GPUs, then match the number of processes per node.
+    statistics_group_size = 2
+
     # ----------------------------------
     # Construct layer graph
     # ----------------------------------
@@ -523,6 +558,7 @@ def main():
     cumulative_layer_num = 1
     log('Input. cumulative_layer_num={n}'.format(n=cumulative_layer_num))
     (probs, labels) = construct_layer_graph(
+        statistics_group_size,
         121, cumulative_layer_num, input_node)

     # ----------------------------------

From cb04061101ebfa680f25925d4d54b1d9c959dde5 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Thu, 5 Sep 2019 10:00:12 -0700
Subject: [PATCH 276/634] Fix bug in Python frontend Layer class

---
 python/lbann/layer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/lbann/layer.py b/python/lbann/layer.py
index a289e7ca3e5..3248873ca1a 100644
--- a/python/lbann/layer.py
+++ b/python/lbann/layer.py
@@ -46,7 +46,7 @@ def __init__(self,
         for l in make_iterable(parents):
             self.add_parent(l)
         for l in make_iterable(children):
-            self.add_child(child)
+            self.add_child(l)
         for w in make_iterable(weights):
             self.add_weights(w)

From 3c51a1ad9025cc63818b7f1cd6259ebbebe05ad4 Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson"
Date: Fri, 6 Sep 2019 11:51:40 -0700
Subject: [PATCH 277/634] add catch2 support to the superbuild

---
 superbuild/CMakeLists.txt        |  7 ++++
 superbuild/catch2/CMakeLists.txt | 72 ++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 superbuild/catch2/CMakeLists.txt

diff --git a/superbuild/CMakeLists.txt b/superbuild/CMakeLists.txt
index 2481f6175e1..91fc56f3eb5 100644
--- a/superbuild/CMakeLists.txt
+++ b/superbuild/CMakeLists.txt
@@ -35,6 +35,8 @@ option(LBANN_SB_CLONE_VIA_SSH
 option(LBANN_SB_BUILD_ALUMINUM "Pull and build Aluminum from Github" OFF)

+option(LBANN_SB_BUILD_CATCH2 "Pull and install CATCH2 library from Github" OFF)
+
 option(LBANN_SB_BUILD_CEREAL "Pull and install CEREAL library from Github" OFF)

 option(LBANN_SB_BUILD_CNPY "Pull and build CNPY from Github" OFF)
@@ -67,6 +69,11 @@ if (LBANN_SB_BUILD_ALUMINUM)
   list(APPEND _BUILD_PKGS ALUMINUM)
 endif ()

+if (LBANN_SB_BUILD_CATCH2)
+  add_subdirectory(catch2)
+  list(APPEND _BUILD_PKGS CATCH2)
+endif ()
+
 if (LBANN_SB_BUILD_CEREAL)
   add_subdirectory(cereal)
   list(APPEND _BUILD_PKGS CEREAL)
diff --git a/superbuild/catch2/CMakeLists.txt b/superbuild/catch2/CMakeLists.txt
new file mode 100644
index 00000000000..d006684ba80
--- /dev/null
+++ b/superbuild/catch2/CMakeLists.txt
@@ -0,0 +1,72 @@
+# Use CATCH2_URL to specify the location of the git repo. Use
+# CATCH2_TAG to specify the commit.
+
+enable_language(CXX)
+
+# Handle the clone mechanism. First URL
+option(CATCH2_CLONE_VIA_SSH
+  "Clone CATCH2 using SSH instead of HTTPS" ${LBANN_SB_CLONE_VIA_SSH})
+
+if (CATCH2_CLONE_VIA_SSH)
+  set(CATCH2_URL git@github.com:catchorg/catch2.git
+    CACHE STRING "The URL from which to clone CATCH2")
+else ()
+  set(CATCH2_URL "https://github.com/catchorg/catch2.git"
+    CACHE STRING "The URL from which to clone CATCH2")
+endif ()
+
+# ... then the tag.
+set(CATCH2_TAG "master" + CACHE STRING "The git tag or hash to checkout for CATCH2") + +# Where to install CATCH2 +set(CATCH2_CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" + CACHE PATH "The installation location of CATCH2.") + +# The build type for CATCH2 +set(CATCH2_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" + CACHE STRING "The build type for CATCH2.") + +if (CATCH2_CUSTOM_SOURCE_DIR) + set(CATCH2_SOURCE_DIR "${CATCH2_CUSTOM_SOURCE_DIR}") + set(CATCH2_URL "") + set(CATCH2_TAG "") + set(_GIT_REPOSITORY_TAG) + set(_GIT_TAG_TAG) + message(STATUS "Using CATCH2 source in: ${CATCH2_SOURCE_DIR}") +else () + set(CATCH2_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") + set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY") + set(_GIT_TAG_TAG "GIT_TAG") +endif () + +# Now add the external project +include(ExternalProject) +ExternalProject_Add(CATCH2 + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp + STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp + ${_GIT_REPOSITORY_TAG} ${CATCH2_URL} + ${_GIT_TAG_TAG} ${CATCH2_TAG} + SOURCE_DIR ${CATCH2_SOURCE_DIR} + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/build + INSTALL_DIR ${CATCH2_CMAKE_INSTALL_PREFIX} + USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 + CMAKE_ARGS + -G${CMAKE_GENERATOR} + -DCMAKE_INSTALL_PREFIX=${CATCH2_CMAKE_INSTALL_PREFIX} + -DCMAKE_BUILD_TYPE=${CATCH2_CMAKE_BUILD_TYPE} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCATCH_BUILD_TESTING=OFF + -DCATCH_BUILDE_EXAMPLES=OFF + -DCATCH_ENABLE_WERROR=OFF + ) + +set(CATCH2_DIR ${CATCH2_CMAKE_INSTALL_PREFIX} + CACHE INTERNAL "The install prefix of CATCH2.") From 55c09fb23d820e8312b95424532799f30f033a52 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 6 Sep 2019 11:57:07 -0700 Subject: [PATCH 278/634] trigger unit testing in hydrogen and lbann if catch2 built in superbuild --- superbuild/hydrogen/CMakeLists.txt | 9 +++++++++ superbuild/lbann/CMakeLists.txt | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/superbuild/hydrogen/CMakeLists.txt b/superbuild/hydrogen/CMakeLists.txt index d68e68965a7..3c6b59670c7 100644 --- a/superbuild/hydrogen/CMakeLists.txt +++ b/superbuild/hydrogen/CMakeLists.txt @@ -55,6 +55,15 @@ if (TARGET ALUMINUM) endif (Hydrogen_ENABLE_ALUMINUM) endif (TARGET ALUMINUM) +if (TARGET CATCH2) + option(Hydrogen_ENABLE_UNIT_TESTS "Build catch2 unit tests in hydrogen" ON) + if (Hydrogen_ENABLE_UNIT_TESTS) + set(LBANN_SB_FWD_HYDROGEN_CATCH2_DIR ${CATCH2_DIR}) + set(_hydrogen_depends_tag DEPENDS) + list(APPEND _HYDROGEN_DEPENDS CATCH2) + endif () +endif (TARGET CATCH2) + if (Hydrogen_ENABLE_CUDA) enable_language(CUDA) endif () diff --git a/superbuild/lbann/CMakeLists.txt b/superbuild/lbann/CMakeLists.txt index fd1310769bd..0c65f388580 100644 --- a/superbuild/lbann/CMakeLists.txt +++ b/superbuild/lbann/CMakeLists.txt @@ -63,6 +63,15 @@ if (TARGET ALUMINUM) list(APPEND _LBANN_DEPENDS ALUMINUM) endif (LBANN_WITH_ALUMINUM) endif (TARGET ALUMINUM) + +if (TARGET CATCH2) + option(LBANN_WITH_UNIT_TESTING "Build catch2 unit tests in LBANN" ON) + if (LBANN_WITH_UNIT_TESTING) + set(LBANN_SB_FWD_LBANN_CATCH2_DIR ${CATCH2_DIR}) + list(APPEND _LBANN_DEPENDS CATCH2) + endif () +endif (TARGET CATCH2) + if (TARGET CEREAL) list(APPEND _LBANN_DEPENDS CEREAL) set(LBANN_SB_FWD_LBANN_CEREAL_DIR "${CEREAL_DIR}") From a5d6745611f5613b077ea119bab53fce8ac1e759 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Fri, 6 Sep 2019 12:08:37 -0700 Subject: [PATCH 279/634] pin Catch2 to a specific version --- superbuild/catch2/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbuild/catch2/CMakeLists.txt b/superbuild/catch2/CMakeLists.txt index d006684ba80..4190f933169 100644 --- a/superbuild/catch2/CMakeLists.txt +++ b/superbuild/catch2/CMakeLists.txt @@ -16,7 +16,7 @@ else () endif () # ... then the tag. -set(CATCH2_TAG "master" +set(CATCH2_TAG "v2.9.2" CACHE STRING "The git tag or hash to checkout for CATCH2") # Where to install CATCH2 From 44334ea7fa9bd5096df6f7a2c7d257c9f612d5f0 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 6 Sep 2019 12:20:45 -0700 Subject: [PATCH 280/634] make CUB a proper dependency of hydrogen --- superbuild/hydrogen/CMakeLists.txt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/superbuild/hydrogen/CMakeLists.txt b/superbuild/hydrogen/CMakeLists.txt index d68e68965a7..b1b8b5270a2 100644 --- a/superbuild/hydrogen/CMakeLists.txt +++ b/superbuild/hydrogen/CMakeLists.txt @@ -55,6 +55,20 @@ if (TARGET ALUMINUM) endif (Hydrogen_ENABLE_ALUMINUM) endif (TARGET ALUMINUM) +if (TARGET CUB) + option(Hydrogen_ENABLE_CUB "Whether to use CUB in Hydrogen" ON) + if (Hydrogen_ENABLE_CUB) + message(STATUS "Building Hydrogen with CUB support") + set(LBANN_SB_FWD_HYDROGEN_CUB_DIR "${CUB_DIR}" + CACHE STRING "The path to CUB for Hydrogen.") + + set(_hydrogen_depends_tag DEPENDS) + list(APPEND _HYDROGEN_DEPENDS CUB) + + set(Hydrogen_ENABLE_CUDA ON) + endif () +endif (TARGET CUB) + if (Hydrogen_ENABLE_CUDA) enable_language(CUDA) endif () From 52b26d0fbb45b7d70fff5627e40ebffd655847c7 Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Sat, 7 Sep 2019 20:27:51 -0700 Subject: [PATCH 281/634] Factoring out the trainer and training state from the model (#916) Refactored LBANN into trainers, execution contexts, training algorithms, and model classes. There are additional improvements that crept into the PR that include fixes to the checkpoint and restart, as well as introducing the use of the Cereal library into the code to replace behavior that was in the persist class. The details of these new classes is listed below: Trainers (i.e. execution environment) ****************************************** A trainer is a collection of compute resources and defines a explicit communication domain. It provides the execution for both the training and inference of a trained model. Once constructed a trainer owns an LBANN comm object that defines both intra- and inter-trainer communication domains. Additionally, a trainer will contain an I/O thread pool that is used to fetch and pre-process data that will be provided to the trainer's models. A trainer owns: * comm object * I/O thread pool * One or more models * Execution context for each model * In the future, it will also contain the data readers. Execution Context ****************************************** When a model is attached to a trainer the execution context of the training algorithm is stored in an execution_context class (or sub-class) per execution mode. Thus there is one execution context per model and mode that contains all of the state with respect to the training algorithm being applied to the model. 
For example it tracks the current: * step * execution mode * epoch * and a pointer back to the trainer Termination Criteria (Pending) ****************************************** (Pending feature) When a model is going to be trained or evaluated, the termination criteria is specified in an object that is passed into the training algorithm. (Note that this feature is under development, currently the termination criteria is dictated by when the training algorithm executes a fixed number of epochs.) Training Algorithms ****************************************** The training algorithm defines the optimization that is to be applied to the model(s) being trained. Additionally, it can specify how to evaluate the model. Model ****************************************** A model is a collection of operations with dependencies encoded as a directed acyclic graph (DAG). In a typical formulation, these operations form a neural network that will be either trained or used for inference. Each operation in the model is an instance of the layer class. The model is then a collection of layers that perform transformations and mathematical operations on data that is passed between layers. The model's DAG is executed in topological order. Inside of some layer types are weight matrices that define a trained model. (Note that LBANN should be able to support non-DNN models, but this is a subject for future work.) Each layer in the graph contains a set of tensors that holds the inputs, computed outputs, gradients with respect to the outputs, and gradients with respect to the inputs. Furthermore, for each layer in the graph with learnable parameters, there is an associated weight tensor that form the learned weights of the model. The model also owns the objective function, since that is integrally tied into the model's computational graph. Additionally, the model owns both the default optimizer that is used to provide a standard optimizer for the model's weight tensors. Once each weight tensor is instantiated, it will owns an instance of an optimizer. The model also owns the max_mini_batch_size that is supported by the model. This is due to the fact that it changes the size and shape of input, output, and gradient tensors. Additionally, the model owns a field that controls if background I/O is allowed for this model and associated data reader. ****************************************** * Cleaned up and properly segmented the checkpoint and restart logic across the trainer, execution context, and model. Additionally, pushed the execution context through more of the code. Moved setup of the data readers from the input layer into the lbann_library build_model_from_prototext function. This alleviates the need for the model to have direct access to the trainer. Fixed several bugs during setup where the model does not have a valid execution context. Moved the reporting of most of the LBANN configuration and run state so that it is reported by the trainer, not the model. Cleaned up some C++ style and improved use of unique pointers. * Updated the model prototext files to include the trainer message. Added a trainer prototext message to the python interface. * Changed the model factory to return a unique_ptr. Clarified the documentation of the step field in the execution_context. * Cleaning up what gets done in the model's checkpoint and restart versus the execution context. Note that the model has to be able to be restored without an execution context since it can be loaded from a trained model or a checkpoint. 
* - Updated some of the classes to use the Cereal library to save
    their checkpoint data. For these classes, data is now stored in
    XML format. Supported classes are:
    - Data reader
    - Execution context and subclasses
    - Metrics
    - Generic input layer
  - Removed explicit packing and unpacking routines that have been
    replaced with Cereal
  - Added templated functions for writing out Cereal archives to an
    XML file or to a binary buffer that can be exchanged between ranks
  - Added support for how the new execution modes interact with the
    execution contexts
  - Added a callback phase that the checkpoint callback uses to
    identify the state at which a helper function was called
  - Improved the naming convention for how checkpoints are
    differentiated based on trainer, model, execution mode, and epoch
    plus step (illustrated in the sketch after this message)
  - Changed the checkpointing callback so that all of the state is
    saved at each checkpoint time. Added support for explicitly saving
    the model weights, the trainer state only (execution contexts), or
    a full checkpoint
  - Added support to the trainer to create and cache new execution
    contexts

* Updated mini-batch size, step, and epoch counters to use size_t
instead of int

* Changed the error reporting mechanism for the model's save and
restart, such that if a layer fails it will throw an exception. Also
renamed the RNG save and load functions to indicate that they are for
all ranks.

* Simplified how the checkpoint directory is communicated for shared
checkpoints, to address a racy bug in saving the RNG.

* Added error handling for the RNG checkpoint and restart

* Added a future clarification for how to restart from a checkpoint.

* Added a flag to allow the checkpoint directory to be set from the
command line. Updated the unit checkpoint tests to use explicit
directories to avoid moving checkpoints after the fact.

* Fixed a bug in proto_common that failed to detect when a directory
doesn't exist. Added support to override the directory used by the
dump weights callback and save model. Fixed a bug in the front-end
codes where the protobuf for the models was using a copy instead of
the pointer.
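For illustration only (not part of this commit's diff): the checkpoint
naming convention described above produces one directory per trainer,
model, execution mode, epoch, and step. The sketch below mirrors the
logic of get_shared_checkpoint_dirname from this patch; the model
name, trainer rank, epoch, and step values are made-up examples, and
the real function instead takes a model* and queries its comm object
for the trainer rank.

// Minimal, self-contained sketch of the shared-checkpoint directory
// naming scheme; illustrative stand-in, not the actual LBANN function.
#include <iostream>
#include <sstream>
#include <string>

std::string shared_checkpoint_dirname(const std::string& dir,
                                      const std::string& model_name,
                                      int trainer_rank,
                                      const std::string& mode, // e.g. "training"
                                      size_t epoch, size_t step) {
  std::ostringstream ss;
  // Matches: <dir>/<model>.trainer.<rank>.shared.<mode>.epoch.<epoch>.step.<step>/
  ss << dir << "/" << model_name
     << ".trainer." << trainer_rank
     << ".shared." << mode
     << ".epoch." << epoch
     << ".step." << step << "/";
  return ss.str();
}

int main() {
  // Prints: ckpt/model0.trainer.0.shared.training.epoch.1.step.469/
  std::cout << shared_checkpoint_dirname("ckpt", "model0", 0,
                                         "training", 1, 469)
            << std::endl;
  return 0;
}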
--- bamboo/common_python/tools.py | 2 +- bamboo/unit_tests/test_unit_checkpoint.py | 126 ++--- bamboo/unit_tests/test_unit_lbann2_reload.py | 25 +- docs/lbann.rst | 85 +++ include/lbann/CMakeLists.txt | 3 + include/lbann/base.hpp | 7 +- include/lbann/callbacks/callback.hpp | 2 + include/lbann/callbacks/checkpoint.hpp | 67 ++- include/lbann/callbacks/dump_outputs.hpp | 3 +- include/lbann/callbacks/dump_weights.hpp | 2 + include/lbann/callbacks/learning_rate.hpp | 42 +- include/lbann/callbacks/save_model.hpp | 2 + .../lbann/callbacks/variable_minibatch.hpp | 34 +- include/lbann/data_readers/data_reader.hpp | 108 +--- .../lbann/data_readers/data_reader_image.hpp | 4 +- .../data_readers/data_reader_jag_conduit.hpp | 2 +- .../lbann/data_readers/data_reader_python.hpp | 2 +- .../lbann/data_readers/sample_list_impl.hpp | 9 +- .../lbann/execution_contexts/CMakeLists.txt | 8 + .../execution_contexts/execution_context.hpp | 155 ++++++ .../sgd_execution_context.hpp | 119 +++++ include/lbann/io/persist.hpp | 186 ++++++- .../layers/io/input/generic_input_layer.hpp | 255 ++++----- .../layers/learning/base_convolution.hpp | 8 +- include/lbann/layers/regularizers/dropout.hpp | 8 +- .../layers/regularizers/selu_dropout.hpp | 4 +- include/lbann/layers/transform/bernoulli.hpp | 2 +- .../layers/transform/categorical_random.hpp | 2 +- .../layers/transform/discrete_random.hpp | 2 +- include/lbann/layers/transform/gaussian.hpp | 2 +- include/lbann/layers/transform/uniform.hpp | 2 +- include/lbann/layers/transform/weights.hpp | 4 +- include/lbann/lbann.hpp | 6 + include/lbann/metrics/metric.hpp | 27 +- .../lbann/models/directed_acyclic_graph.hpp | 2 +- include/lbann/models/model.hpp | 174 ++---- include/lbann/proto/factories.hpp | 7 + include/lbann/proto/proto_common.hpp | 1 + include/lbann/trainers/CMakeLists.txt | 7 + include/lbann/trainers/trainer.hpp | 162 ++++++ .../lbann/training_algorithms/CMakeLists.txt | 8 + .../sgd_training_algorithm.hpp | 106 ++++ .../training_algorithm.hpp | 66 +++ include/lbann/utils/CMakeLists.txt | 1 + include/lbann/utils/dataset.hpp | 6 + include/lbann/utils/enum_iterator.hpp | 57 ++ include/lbann/utils/graph.hpp | 2 +- include/lbann/utils/lbann_library.hpp | 16 +- include/lbann/utils/random.hpp | 4 +- model_zoo/lbann.cpp | 25 +- model_zoo/lbann2.cpp | 31 +- model_zoo/lbann_aecycgan.cpp | 37 +- model_zoo/lbann_cycgan.cpp | 45 +- model_zoo/lbann_gan.cpp | 23 +- model_zoo/lbann_inf.cpp | 22 +- .../models/alexnet/model_alexnet.prototext | 8 +- .../model_autoencoder_chem_ecfp.prototext | 8 +- ...er_chem_ecfp_200x150x100x100x100.prototext | 8 +- ...utoencoder_chem_ecfp_500x250x100.prototext | 8 +- .../model_autoencoder_chem_sigmoid.prototext | 8 +- .../model_dnn_chem_ecfp.prototext | 8 +- .../model_autoencoder_cifar10.prototext | 8 +- .../model_conv_autoencoder_cifar10.prototext | 8 +- .../model_conv_autoencoder_imagenet.prototext | 8 +- .../model_autoencoder_mnist.prototext | 8 +- .../model_conv_autoencoder_mnist.prototext | 8 +- .../autoencoder_mnist/vae_mnist.prototext | 4 +- .../candle/pilot1/ae_nodeselect_gdc.prototext | 8 +- .../models/candle/pilot1/combo.prototext | 8 +- .../cosmoflow/model_cosmoflow.prototext | 8 +- .../gan/jags/cycle_gan/cycgan_m1.prototext | 4 +- .../cycle_gan/cycgan_m1_template.prototext | 8 +- .../gan/jags/cycle_gan/cycgan_m2.prototext | 4 +- .../cycle_gan/cycgan_m2_template.prototext | 8 +- .../gan/jags/cycle_gan/cycgan_m3.prototext | 4 +- .../cycle_gan/cycgan_m3_template.prototext | 8 +- .../gan/mnist/adversarial_model.prototext | 8 +- 
.../gan/mnist/discriminator_model.prototext | 8 +- .../jag/gan/cyclic/cyclic_gan_model.prototext | 6 +- .../jag/gan/cyclic/model_template.prototext | 8 +- .../models/jag/gan/vanilla/gan.prototext | 6 +- .../jag/gan/vanilla/gan_template.prototext | 8 +- model_zoo/models/jag/vae_fcn.prototext | 8 +- model_zoo/models/jag/wae.prototext | 6 +- .../jag/wae_cycle_gan/cycle_gan.prototext | 6 +- .../wae_cycle_gan/cycle_gan_only.prototext | 6 +- .../data_reader_jag_conduit_lustre.prototext | 62 +-- .../models/jag/wae_cycle_gan/wae.prototext | 8 +- .../jag/wae_cycle_gan/wae_fw_inv.prototext | 24 +- .../jag/wae_cycle_gan/wae_nobn.prototext | 8 +- .../lenet_mnist/model_lenet_mnist.prototext | 8 +- ...onv_molecular_autoencoder_pilot2.prototext | 8 +- ...olecular_bead_autoencoder_pilot2.prototext | 8 +- ...del_molecular_autoencoder_pilot2.prototext | 8 +- .../models/resnet50/model_resnet50.prototext | 8 +- .../siamese/finetune-cub/model_cub.prototext | 8 +- .../model_cub_batchnorm.prototext | 8 +- ...batchnorm_transferred_and_frozen.prototext | 8 +- ..._alexnet_batchnorm_dag_frozen_bn.prototext | 8 +- .../model_mnist_simple_1.prototext | 8 +- .../model_mnist_simple_2.prototext | 8 +- .../model_channelwise_mean.prototext | 8 +- .../tests/layer_tests/model_clamp.prototext | 8 +- .../layer_tests/model_covariance.prototext | 8 +- .../tests/layer_tests/model_elu.prototext | 8 +- .../layer_tests/model_identity.prototext | 8 +- .../tests/layer_tests/model_l1_norm.prototext | 8 +- .../layer_tests/model_l2_norm2.prototext | 8 +- .../layer_tests/model_leaky_relu.prototext | 8 +- .../layer_tests/model_log_sigmoid.prototext | 8 +- .../layer_tests/model_log_softmax.prototext | 8 +- .../model_mean_absolute_error.prototext | 8 +- .../tests/layer_tests/model_relu.prototext | 8 +- .../tests/layer_tests/model_selu.prototext | 8 +- .../tests/layer_tests/model_sigmoid.prototext | 8 +- .../tests/layer_tests/model_softmax.prototext | 8 +- .../layer_tests/model_softplus.prototext | 8 +- .../layer_tests/model_softsign.prototext | 8 +- .../model_squared_difference.prototext | 4 +- .../layer_tests/model_tessellate.prototext | 4 +- .../layer_tests/model_variance.prototext | 8 +- .../tests/model_jag_single_layer_ae.prototext | 20 +- .../tests/model_lenet_mnist_ckpt.prototext | 8 +- .../model_lenet_mnist_dist_ckpt.prototext | 8 +- .../model_lenet_mnist_lbann2ckpt.prototext | 8 +- .../tests/model_mnist_conv_graph.prototext | 4 +- .../model_mnist_ridge_regression.prototext | 8 +- .../model_mnist_softmax_classifier.prototext | 8 +- model_zoo/vision/alexnet.py | 6 +- model_zoo/vision/lenet.py | 5 +- model_zoo/vision/resnet.py | 6 +- python/lbann/__init__.py | 3 +- python/lbann/contrib/lc/launcher.py | 4 +- python/lbann/launcher/__init__.py | 4 +- python/lbann/model.py | 10 - python/lbann/trainer.py | 25 + src/CMakeLists.txt | 3 + src/base.cpp | 4 +- src/callbacks/check_dataset.cpp | 10 +- src/callbacks/check_gradients.cpp | 8 +- src/callbacks/check_init.cpp | 3 +- src/callbacks/check_metric.cpp | 3 +- src/callbacks/check_nan.cpp | 9 +- src/callbacks/check_small.cpp | 9 +- src/callbacks/checkpoint.cpp | 302 ++++++++--- src/callbacks/confusion_matrix.cpp | 13 +- src/callbacks/debug.cpp | 25 +- src/callbacks/debug_io.cpp | 19 +- src/callbacks/dump_error_signals.cpp | 5 +- src/callbacks/dump_gradients.cpp | 5 +- .../dump_minibatch_sample_indices.cpp | 13 +- src/callbacks/dump_outputs.cpp | 7 +- src/callbacks/dump_weights.cpp | 6 +- src/callbacks/early_stopping.cpp | 5 +- src/callbacks/imcomm.cpp | 10 +- src/callbacks/learning_rate.cpp | 52 
+- src/callbacks/ltfb.cpp | 37 +- src/callbacks/mixup.cpp | 3 +- src/callbacks/monitor_io.cpp | 6 +- src/callbacks/perturb_adam.cpp | 3 +- src/callbacks/print_statistics.cpp | 8 +- src/callbacks/profiler.cpp | 32 +- src/callbacks/replace_weights.cpp | 3 +- src/callbacks/save_images.cpp | 3 +- src/callbacks/save_model.cpp | 20 +- src/callbacks/save_topk_models.cpp | 3 +- src/callbacks/summary.cpp | 36 +- src/callbacks/timer.cpp | 24 +- src/callbacks/variable_minibatch.cpp | 28 +- src/data_readers/data_reader.cpp | 57 +- src/data_readers/data_reader_image.cpp | 6 +- src/data_readers/data_reader_jag_conduit.cpp | 8 +- .../data_reader_numpy_npz_conduit.cpp | 3 +- src/data_readers/data_reader_python.cpp | 2 +- src/execution_contexts/CMakeLists.txt | 8 + src/execution_contexts/execution_context.cpp | 100 ++++ .../sgd_execution_context.cpp | 68 +++ src/io/persist.cpp | 258 ++++----- src/layers/layer.cpp | 7 +- .../learning/channelwise_scale_bias.cpp | 4 +- src/layers/learning/channelwise_scale_bias.cu | 4 +- src/layers/learning/embedding.cpp | 4 +- src/layers/learning/entrywise_scale_bias.cpp | 7 +- src/layers/learning/entrywise_scale_bias.cu | 7 +- src/layers/learning/fully_connected.cpp | 13 +- .../regularizers/batch_normalization.cpp | 8 +- .../regularizers/batch_normalization.cu | 8 +- .../entrywise_batch_normalization.cpp | 13 +- .../entrywise_batch_normalization.cu | 13 +- src/metrics/metric.cpp | 58 +- src/models/model.cpp | 501 +++--------------- src/optimizers/hypergradient_adam.cpp | 12 +- src/proto/CMakeLists.txt | 2 + src/proto/factories/CMakeLists.txt | 1 + src/proto/factories/layer_graph_factory.cpp | 3 +- src/proto/factories/model_factory.cpp | 2 + src/proto/factories/trainer_factory.cpp | 46 ++ src/proto/lbann.proto | 4 + src/proto/model.proto | 4 - src/proto/proto_common.cpp | 48 +- src/proto/trainer.proto | 38 ++ src/proto/training_algorithm.proto | 34 ++ src/trainers/CMakeLists.txt | 7 + src/trainers/trainer.cpp | 216 ++++++++ src/training_algorithms/CMakeLists.txt | 7 + .../sgd_training_algorithm.cpp | 278 ++++++++++ src/utils/graph.cpp | 2 +- src/utils/lbann_library.cpp | 206 ++++--- src/utils/random.cpp | 28 +- src/weights/weights.cpp | 2 +- 210 files changed, 3667 insertions(+), 1955 deletions(-) create mode 100644 docs/lbann.rst create mode 100644 include/lbann/execution_contexts/CMakeLists.txt create mode 100644 include/lbann/execution_contexts/execution_context.hpp create mode 100644 include/lbann/execution_contexts/sgd_execution_context.hpp create mode 100644 include/lbann/trainers/CMakeLists.txt create mode 100644 include/lbann/trainers/trainer.hpp create mode 100644 include/lbann/training_algorithms/CMakeLists.txt create mode 100644 include/lbann/training_algorithms/sgd_training_algorithm.hpp create mode 100644 include/lbann/training_algorithms/training_algorithm.hpp create mode 100644 include/lbann/utils/enum_iterator.hpp create mode 100644 python/lbann/trainer.py create mode 100644 src/execution_contexts/CMakeLists.txt create mode 100644 src/execution_contexts/execution_context.cpp create mode 100644 src/execution_contexts/sgd_execution_context.cpp create mode 100644 src/proto/factories/trainer_factory.cpp create mode 100644 src/proto/trainer.proto create mode 100644 src/proto/training_algorithm.proto create mode 100644 src/trainers/CMakeLists.txt create mode 100644 src/trainers/trainer.cpp create mode 100644 src/training_algorithms/CMakeLists.txt create mode 100644 src/training_algorithms/sgd_training_algorithm.cpp diff --git a/bamboo/common_python/tools.py 
b/bamboo/common_python/tools.py index 4cc5839ae8e..153e447968f 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -423,6 +423,7 @@ def get_command(cluster, 'super_node', 'write_sample_list', 'ltfb_verbose', + 'ckpt_dir', # DataReaders: # 'data_filedir', @@ -442,7 +443,6 @@ def get_command(cluster, 'no_im_comm', # Not listed by `lbann --help`: - # 'ckpt_dir', # 'exit_after_setup', # 'procs_per_model' ] diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index bae6b789ef3..d898b62e209 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -16,27 +16,29 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, # No checkpointing, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) + os.system('mkdir ckpt_lenet_shared') + no_ckpt_dir = 'ckpt_lenet_shared/no_ckpt_{c}'.format(c=compiler_name) command = tools.get_command( cluster=cluster, executable=exe, num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', + data_reader_name='mnist', data_reader_percent=1.0, + ckpt_dir=no_ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code_nockpt = os.system(command) tools.assert_success(return_code_nockpt, error_file_name) - os.system('mkdir ckpt_lenet_shared') - no_ckpt_dir = 'ckpt_lenet_shared/no_ckpt_{c}'.format(c=compiler_name) - os.system('mv ckpt {c}'.format(c=no_ckpt_dir)) # Run to checkpoint, printing weights to files. 
output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_checkpoint_%s_error.txt' % (dir_name, compiler_name) + ckpt_dir = 'ckpt_lenet_shared/ckpt_{c}'.format(c=compiler_name) command = tools.get_command( cluster=cluster, executable=exe, num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', + data_reader_name='mnist', data_reader_percent=1.0, + ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=1, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code_ckpt_1 = os.system(command) @@ -49,15 +51,14 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, cluster=cluster, executable=exe, num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', + data_reader_name='mnist', data_reader_percent=1.0, + ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) return_code_ckpt_2 = os.system(command) tools.assert_success(return_code_ckpt_2, error_file_name) - diff_test = os.system('diff -rq ckpt {c}'.format(c=no_ckpt_dir)) - ckpt_dir = 'ckpt_lenet_shared/ckpt_{c}'.format(c=compiler_name) - os.system('mv ckpt {c}'.format(c=ckpt_dir)) + diff_test = os.system('diff -rq {ckpt} {no_ckpt}'.format(ckpt=ckpt_dir, no_ckpt=no_ckpt_dir)) path_prefix = '{d}/bamboo/unit_tests/'.format(d=dir_name) if diff_test !=0: raise AssertionError('diff_test={dt}\nCompare {ncd} and {cd} in {p}'.format( @@ -66,62 +67,63 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_checkpoint_lenet_distributed: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - exe = executables[compiler_name] - - # No checkpointing, printing weights to files. - output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=exe, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', + if compiler_name not in executables: + e = 'skeleton_checkpoint_lenet_distributed: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) + exe = executables[compiler_name] + + # No checkpointing, printing weights to files. 
+ output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) + os.system('mkdir ckpt_lenet_distributed') + no_ckpt_dir = 'ckpt_lenet_distributed/no_ckpt_{c}'.format(c=compiler_name) + command = tools.get_command( + cluster=cluster, executable=exe, num_nodes=1, num_processes=2, + dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', data_reader_percent=1.0, + ckpt_dir=no_ckpt_dir, model_folder='tests', + model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) - return_code_nockpt = os.system(command) - tools.assert_success(return_code_nockpt, error_file_name) - os.system('mkdir ckpt_lenet_distributed') - no_ckpt_dir = 'ckpt_lenet_distributed/no_ckpt_{c}'.format(c=compiler_name) - os.system('mv ckpt {c}'.format(c=no_ckpt_dir)) - - # Run to checkpoint, printing weights to files. - output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_checkpoint_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_checkpoint_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=exe, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='lenet_mnist_dist_ckpt', num_epochs=1, optimizer_name='sgd', + return_code_nockpt = os.system(command) + tools.assert_success(return_code_nockpt, error_file_name) + + # Run to checkpoint, printing weights to files. + output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_checkpoint_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_checkpoint_%s_error.txt' % (dir_name, compiler_name) + ckpt_dir = 'ckpt_lenet_distributed/ckpt_{c}'.format(c=compiler_name) + command = tools.get_command( + cluster=cluster, executable=exe, num_nodes=1, num_processes=2, + dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', data_reader_percent=1.0, + ckpt_dir=ckpt_dir, model_folder='tests', + model_name='lenet_mnist_dist_ckpt', num_epochs=1, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) - return_code_ckpt_1 = os.system(command) - tools.assert_success(return_code_ckpt_1, error_file_name) - - # Pick up from checkpoint, printing weights to files. - output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_restart_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_restart_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=exe, num_nodes=1, num_processes=2, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', - model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', + return_code_ckpt_1 = os.system(command) + tools.assert_success(return_code_ckpt_1, error_file_name) + + # Pick up from checkpoint, printing weights to files. 
+ output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_restart_%s_output.txt' % (dir_name, compiler_name) + error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_restart_%s_error.txt' % (dir_name, compiler_name) + command = tools.get_command( + cluster=cluster, executable=exe, num_nodes=1, num_processes=2, + dir_name=dir_name, + data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', + data_reader_name='mnist', data_reader_percent=1.0, + ckpt_dir=ckpt_dir, model_folder='tests', + model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', output_file_name=output_file_name, error_file_name=error_file_name) - return_code_ckpt_2 = os.system(command) - tools.assert_success(return_code_ckpt_2, error_file_name) - - diff_test = os.system('diff -rq ckpt {c}'.format(c=no_ckpt_dir)) - ckpt_dir = 'ckpt_lenet_distributed/ckpt_{c}'.format(c=compiler_name) - os.system('mv ckpt {c}'.format(c=ckpt_dir)) - path_prefix = '{d}/bamboo/unit_tests'.format(d=dir_name) - if diff_test != 0: - raise AssertionError( - 'diff_test={dt}\nCompare {ncd} and {cd} in {p}'.format( - dt=diff_test, ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix)) + return_code_ckpt_2 = os.system(command) + tools.assert_success(return_code_ckpt_2, error_file_name) + + diff_test = os.system('diff -rq {ckpt} {no_ckpt}'.format(ckpt=ckpt_dir, no_ckpt=no_ckpt_dir)) + path_prefix = '{d}/bamboo/unit_tests'.format(d=dir_name) + if diff_test != 0: + raise AssertionError( + 'diff_test={dt}\nCompare {ncd} and {cd} in {p}'.format( + dt=diff_test, ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix)) def test_unit_checkpoint_lenet_shared_clang6(cluster, exes, dirname): diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index 6bd3aced4e1..554c1e8dce0 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -16,33 +16,33 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): model_path = '{../../model_zoo/models/lenet_mnist/model_lenet_mnist.prototext,../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext}' output_file_name = '%s/bamboo/unit_tests/output/lbann2_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/lbann2_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) + no_ckpt_dir = 'ckpt_lbann2_reload/lbann2_no_ckpt_{c}'.format(c=compiler_name) command = tools.get_command( cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2, data_reader_name='mnist', data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', dir_name=dir_name, + data_reader_percent=1.0, + ckpt_dir=no_ckpt_dir, model_path=model_path, optimizer_name='sgd', num_epochs=2, output_file_name=output_file_name, error_file_name=error_file_name) - os.mkdir('lbann2_ckpt') return_code_no_ckpt = os.system(command) tools.assert_success(return_code_no_ckpt, error_file_name) - os.system('mkdir ckpt_lbann2_reload') - no_ckpt_dir = 'ckpt_lbann2_reload/lbann2_no_ckpt_{c}'.format(c=compiler_name) - os.system('mv lbann2_ckpt {c}'.format(c=no_ckpt_dir)) - # Run to checkpoint, printing weights to files. 
     output_file_name = '%s/bamboo/unit_tests/output/lbann2_checkpoint_%s_output.txt' % (dir_name, compiler_name)
     error_file_name = '%s/bamboo/unit_tests/error/lbann2_checkpoint_%s_error.txt' % (dir_name, compiler_name)
+    ckpt_dir = 'ckpt_lbann2_reload/lbann2_ckpt_{c}'.format(c=compiler_name)
     command = tools.get_command(
         cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2,
         dir_name=dir_name,
         data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
-        data_reader_name='mnist', model_folder='tests',
+        data_reader_name='mnist', data_reader_percent=1.0,
+        ckpt_dir=ckpt_dir, model_folder='tests',
         model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd',
         output_file_name=output_file_name,
         error_file_name=error_file_name)
@@ -52,22 +52,23 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name):
     # Pick up from checkpoint, printing weights to files.
     output_file_name = '%s/bamboo/unit_tests/output/lbann2_restart_%s_output.txt' % (dir_name, compiler_name)
     error_file_name = '%s/bamboo/unit_tests/error/lbann2_restart_%s_error.txt' % (dir_name, compiler_name)
-    os.mkdir('lbann2_ckpt')
     command = tools.get_command(
         cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2,
         dir_name=dir_name,
         data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
         data_reader_name='mnist',
+        data_reader_percent=1.0,
+        ckpt_dir=ckpt_dir,
         model_path='../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext',
-        num_epochs=2, optimizer_name='sgd', ckpt_dir='ckpt/',
+        num_epochs=2, optimizer_name='sgd',
         output_file_name=output_file_name,
         error_file_name=error_file_name)
     return_code_ckpt_2 = os.system(command)
     tools.assert_success(return_code_ckpt_2, error_file_name)
-    os.system('rm lbann2_ckpt/model0-epoch*')
-    os.system('rm lbann2_nockpt/model0-epoch*')
+#    os.system('rm lbann2_ckpt/model0-epoch*')
+#    os.system('rm lbann2_nockpt/model0-epoch*')
 
-    diff_result = os.system('diff -rq lbann2_ckpt/ {c}'.format(c=no_ckpt_dir))
+    diff_result = os.system('diff -rq {ckpt} {no_ckpt}'.format(ckpt=ckpt_dir, no_ckpt=no_ckpt_dir))
     allow_epsilon_diff = False
     if allow_epsilon_diff and (diff_result != 0):
         equal_within_epsilon = True
@@ -101,8 +102,6 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name):
             print(error_string)
         if equal_within_epsilon:
             diff_result = 0
-    ckpt_dir = 'ckpt_lbann2_reload/lbann2_ckpt_{c}'.format(c=compiler_name)
-    os.system('mv lbann2_ckpt {c}'.format(c=ckpt_dir))
     path_prefix = '{d}/bamboo/unit_tests'.format(d=dir_name)
     if diff_result != 0:
         raise AssertionError(
diff --git a/docs/lbann.rst b/docs/lbann.rst
new file mode 100644
index 00000000000..81b5eb73b88
--- /dev/null
+++ b/docs/lbann.rst
@@ -0,0 +1,85 @@
+**********************************************
+LBANN Software Architecture and Class Overview
+**********************************************
+
+Trainers (i.e. execution environment)
+******************************************
+
+A trainer is a collection of compute resources and defines an explicit
+communication domain. It provides the execution environment for both
+training and inference of a trained model. Once constructed, a trainer
+owns an LBANN comm object that defines both intra- and inter-trainer
+communication domains. Additionally, a trainer will contain an I/O
+thread pool that is used to fetch and pre-process data that will be
+provided to the trainer's models.
+
+A trainer owns:
+
+* comm object
+* I/O thread pool
+* One or more models
+* Execution context for each model
+* In the future, it will also contain the data readers.
+
+Execution Context
+******************************************
+
+When a model is attached to a trainer, the execution context of the
+training algorithm is stored in an execution_context class (or
+sub-class) per execution mode. Thus there is one execution context
+per model and mode that contains all of the state with respect to the
+training algorithm being applied to the model.
+
+For example, it tracks the current:
+
+* step
+* execution mode
+* epoch
+* and a pointer back to the trainer
+
+Termination Criteria (Pending)
+******************************************
+
+(Pending feature) When a model is going to be trained or evaluated,
+the termination criteria are specified in an object that is passed
+into the training algorithm. (Note that this feature is under
+development; currently, training terminates once the training
+algorithm has executed a fixed number of epochs.)
+
+Training Algorithms
+******************************************
+
+The training algorithm defines the optimization that is to be
+applied to the model(s) being trained. Additionally, it can
+specify how to evaluate the model.
+
+Model
+******************************************
+
+A model is a collection of operations with dependencies encoded as a
+directed acyclic graph (DAG). In a typical formulation, these
+operations form a neural network that will be either trained or used
+for inference. Each operation in the model is an instance of the
+layer class. The model is then a collection of layers that perform
+transformations and mathematical operations on data that is passed
+between layers. The model's DAG is executed in topological order.
+Inside of some layer types are weight matrices that define a trained
+model. (Note that LBANN should be able to support non-DNN models, but
+this is a subject for future work.)
+
+Each layer in the graph contains a set of tensors that holds the
+inputs, computed outputs, gradients with respect to the outputs, and
+gradients with respect to the inputs. Furthermore, for each layer in
+the graph with learnable parameters, there is an associated weight
+tensor that forms the learned weights of the model. The model also
+owns the objective function, since that is integrally tied into the
+model's computational graph. Additionally, the model owns the default
+optimizer, which provides a standard optimizer for the model's weight
+tensors. Once each weight tensor is instantiated, it will own an
+instance of an optimizer.
+
+The model also owns the max_mini_batch_size that is supported by the
+model, because it changes the size and shape of the input, output,
+and gradient tensors. Additionally, the model owns a field that
+controls whether background I/O is allowed for this model and its
+associated data reader.
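For illustration, a minimal sketch of the per-model, per-mode state
described above; this is a simplified, self-contained stand-in, not
the actual LBANN class, and the member and accessor names here are
assumptions made for the example.

// Illustrative sketch of the state an execution context tracks: step
// and epoch counters, the execution mode, and a non-owning pointer
// back to the trainer. Counters are size_t, matching this commit's
// change from int.
#include <cstddef>

enum class execution_mode { training, validation, testing, prediction, invalid };

class trainer;  // the owning trainer (forward declaration only)

class execution_context {
public:
  execution_context(trainer& t, execution_mode m)
    : m_trainer(&t), m_mode(m) {}
  execution_mode get_execution_mode() const noexcept { return m_mode; }
  size_t get_step() const noexcept { return m_step; }    // optimization steps taken
  size_t get_epoch() const noexcept { return m_epoch; }
  void inc_step() noexcept { ++m_step; }
private:
  trainer* m_trainer;     // pointer back to the trainer (non-owning)
  execution_mode m_mode;  // which phase this context tracks
  size_t m_step = 0;
  size_t m_epoch = 0;
};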
diff --git a/include/lbann/CMakeLists.txt b/include/lbann/CMakeLists.txt
index 3bff6abb1b3..4c12e546287 100644
--- a/include/lbann/CMakeLists.txt
+++ b/include/lbann/CMakeLists.txt
@@ -10,6 +10,7 @@ set_full_path(THIS_DIR_HEADERS
 add_subdirectory(callbacks)
 add_subdirectory(data_readers)
 add_subdirectory(data_store)
+add_subdirectory(execution_contexts)
 add_subdirectory(io)
 add_subdirectory(layers)
 add_subdirectory(metrics)
@@ -17,6 +18,8 @@ add_subdirectory(models)
 add_subdirectory(objective_functions)
 add_subdirectory(optimizers)
 add_subdirectory(proto)
+add_subdirectory(trainers)
+add_subdirectory(training_algorithms)
 add_subdirectory(transforms)
 add_subdirectory(utils)
 add_subdirectory(weights)
diff --git a/include/lbann/base.hpp b/include/lbann/base.hpp
index 97f33e93583..f2448ddf7d6 100644
--- a/include/lbann/base.hpp
+++ b/include/lbann/base.hpp
@@ -31,6 +31,7 @@
 #include "lbann/Elemental_extensions.hpp"
 #include "lbann/utils/cyg_profile.hpp"
 #include "lbann/utils/file_utils.hpp"
+#include "lbann/utils/enum_iterator.hpp"
 
 // Defines, among other things, DataType.
 #include "lbann_config.hpp"
@@ -48,6 +49,9 @@ namespace lbann {
 // Forward-declaration.
 class lbann_comm;
 
+/// Creating an observer_ptr to complement the unique_ptr and shared_ptr
+template <typename T> using observer_ptr = typename std::add_pointer<T>::type;
+
 // Note that this should only be used to wrap the thing coming out of
 // initialize()! This will be removed when we have proper RAII around
 // these things.
@@ -126,9 +130,10 @@ matrix_format data_layout_to_matrix_format(data_layout layout);
 /// Neural network execution mode
 enum class execution_mode {training, validation, testing, prediction, invalid};
 std::string to_string(execution_mode m);
+using execution_mode_iterator = enum_iterator<execution_mode, execution_mode::training, execution_mode::invalid>;
 
 /** @brief Convert a string to an execution_mode. */
-execution_mode exe_mode_from_string(std::string const& str);
+execution_mode exec_mode_from_string(std::string const& str);
 
 /** @brief Extract an execution_mode from a stream.
*/ std::istream& operator>>(std::istream& os, execution_mode& e); diff --git a/include/lbann/callbacks/callback.hpp b/include/lbann/callbacks/callback.hpp index fcdc0295aad..def733089cf 100644 --- a/include/lbann/callbacks/callback.hpp +++ b/include/lbann/callbacks/callback.hpp @@ -29,11 +29,13 @@ #ifndef LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED #define LBANN_CALLBACKS_CALLBACK_HPP_INCLUDED +#include "lbann/trainers/trainer.hpp" #include "lbann/layers/layer.hpp" #include "lbann/models/model.hpp" #include "lbann/utils/description.hpp" #include "lbann/utils/memory.hpp" #include "lbann/utils/summary.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" #include diff --git a/include/lbann/callbacks/checkpoint.hpp b/include/lbann/callbacks/checkpoint.hpp index 5107966dd79..65ed972e168 100644 --- a/include/lbann/callbacks/checkpoint.hpp +++ b/include/lbann/callbacks/checkpoint.hpp @@ -34,6 +34,14 @@ namespace lbann { namespace callback { +enum class callback_phase { + batch, + epoch, + validation, + inference, + invalid +}; + /** @brief Checkpoint at given interval in given directory */ class checkpoint : public callback_base { public: @@ -71,14 +79,19 @@ class checkpoint : public callback_base { checkpoint& operator=(const checkpoint&) = default; checkpoint* copy() const override { return new checkpoint(*this); } void setup(model *m) override; + void on_train_begin(model *m) override; void on_epoch_end(model *m) override; void on_batch_end(model *m) override; void on_validation_end(model *m) override; - inline void set_checkpoint_dir(std::string dir){ + inline void set_checkpoint_dir(const std::string& dir){ m_checkpoint_dir= dir; } + inline const std::string& get_checkpoint_dir(){ + return m_checkpoint_dir; + } + inline void set_checkpoint_epochs(int epochs){ m_checkpoint_epochs= epochs; } @@ -103,7 +116,14 @@ class checkpoint : public callback_base { m_ckpt_dist_steps = ckpt_dist_steps; } - bool need_checkpoint(model *m); + bool need_checkpoint(model *m, callback_phase phase); + std::string find_latest_checkpoint(model *m, std::string& latest_file, size_t& epoch, size_t& step, int& shared); + std::string find_latest_checkpoint(model *m, std::string& latest_file, execution_mode& mode, size_t &epoch, size_t& step, int& shared); + bool open_latest_checkpoint(model *m, + const std::string& task_label, + std::function reload_shared_ckpt, + std::function reload_distributed_ckpt); + bool reload_model(model *m); bool restart(model *m); std::string name() const override { return "checkpoint"; } protected: @@ -123,6 +143,7 @@ class checkpoint : public callback_base { template struct header_t { + execution_mode mode; int epoch; int step; int shared; @@ -130,52 +151,54 @@ class checkpoint : public callback_base { }; }; -static inline std::string get_last_shared_checkpoint_filename(model *m, std::string dir) { +inline std::string get_last_shared_checkpoint_filename(model *m, const std::string& dir) { lbann_comm *comm = m->get_comm(); - std::stringstream ss; + std::ostringstream ss; ss << dir << "/"; - ss << m->get_name().c_str() << "."; + ss << m->get_name().c_str() << ".trainer."; ss << comm->get_trainer_rank() << ".last.shared.checkpoint"; return ss.str(); } -static inline std::string get_shared_checkpoint_dirname(model *m, std::string dir, int epoch, int step) { +inline std::string get_shared_checkpoint_dirname(model *m, const std::string& dir, execution_mode mode, size_t epoch, size_t step) { lbann_comm *comm = m->get_comm(); - std::stringstream ss; + std::ostringstream ss; ss << dir << "/" 
     << m->get_name().c_str();
-  ss << "." << comm->get_trainer_rank();
-  ss << ".shared.epoch." << epoch;
+  ss << ".trainer." << comm->get_trainer_rank();
+  ss << ".shared." << to_string(mode);
+  ss << ".epoch." << epoch;
   ss << ".step."<< step << "/";
   return ss.str();
 }
 
-static inline std::string get_last_distributed_checkpoint_filename(model *m, std::string dir) {
+inline std::string get_last_distributed_checkpoint_filename(model *m, const std::string& dir) {
   lbann_comm *comm = m->get_comm();
-  std::stringstream ss;
+  std::ostringstream ss;
   ss << dir << "/";
-  ss << m->get_name().c_str() << ".";
+  ss << m->get_name().c_str() << ".trainer.";
   ss << comm->get_trainer_rank() << ".last.distributed.checkpoint";
   return ss.str();
 }
 
-static inline std::string get_distributed_checkpoint_dirname(model *m, std::string dir, int epoch, int step) {
+inline std::string get_distributed_checkpoint_dirname(model *m, const std::string& dir, execution_mode mode, size_t epoch, size_t step) {
   lbann_comm *comm = m->get_comm();
-  std::stringstream ss;
+  std::ostringstream ss;
   ss << dir << "/" << m->get_name().c_str();
-  ss << "." << comm->get_trainer_rank();
+  ss << ".trainer." << comm->get_trainer_rank();
   ss << ".rank." << comm->get_rank_in_trainer();
+  ss << ".distributed." << to_string(mode);
   ss << ".epoch." << epoch;
   ss << ".step."<< step << "/";
   return ss.str();
 }
 
 // Print last checkpoint to file, used to determine which checkpoint to load from.
-static inline bool write_latest(std::string filename, int epoch, int train) {
+inline bool write_latest(std::string filename, execution_mode mode, size_t epoch, size_t train) {
   // open the file for writing
   int fd = openwrite(filename.c_str());
   if (fd != -1) {
     char field[256];
-    sprintf(field, "epoch=%d step=%d\n", epoch, train);
+    sprintf(field, "mode=%s epoch=%ld step=%ld\n", to_string(mode).c_str(), epoch, train);
     write_string(fd, filename.c_str(), field, strlen(field));
     // close our file
     closewrite(fd, filename.c_str());
@@ -186,22 +209,26 @@ static inline bool write_latest(std::string filename, int epoch, int train) {
 /** \brief Reads the "latest" file and returns the epoch number and
  *  sample offset for most recent checkpoint */
-static inline bool read_latest(std::string filename, int *epochLast, int *trainLast) {
+inline bool read_latest(std::string filename, execution_mode *mode, size_t *epochLast, size_t *trainLast) {
   // assume we don't have a file, we'll return -1 in that case
   *epochLast = -1;
   *trainLast = -1;
+  *mode = execution_mode::invalid;
   // open the file for reading
   int fd = openread(filename.c_str());
   if (fd != -1) {
     // read epoch from file
     char field[256];
     read_string(fd, filename.c_str(), field, sizeof(field));
-    int ret = sscanf(field, "epoch=%d step=%d\n", epochLast, trainLast);
+    char modeStr[64];
+    int ret = sscanf(field, "mode=%s epoch=%ld step=%ld\n", modeStr, epochLast, trainLast);
+    *mode = exec_mode_from_string(modeStr);
     // close our file
     closeread(fd, filename.c_str());
-    if(ret != 2) { return false; }
+    if(ret != 3) { return false; }
+    return true;
   }
-  return true;
+  return false;
 }
 
 // Builder function
diff --git a/include/lbann/callbacks/dump_outputs.hpp b/include/lbann/callbacks/dump_outputs.hpp
index 939de823c10..34610896f3b 100644
--- a/include/lbann/callbacks/dump_outputs.hpp
+++ b/include/lbann/callbacks/dump_outputs.hpp
@@ -83,7 +83,8 @@ class dump_outputs : public callback_base {
       do_dump_outputs(*m, *l);
   }
   void on_evaluate_forward_prop_end(model* m, Layer* l) override {
-    if(m->get_step() % m_batch_interval == 0) {
+    const auto& c =
      static_cast<const sgd_execution_context&>(m->get_execution_context());
+    if(c.get_step() % m_batch_interval == 0) {
       do_dump_outputs(*m, *l);
     }
   }
diff --git a/include/lbann/callbacks/dump_weights.hpp b/include/lbann/callbacks/dump_weights.hpp
index 85bf7d1b2af..603c07ca7b2 100644
--- a/include/lbann/callbacks/dump_weights.hpp
+++ b/include/lbann/callbacks/dump_weights.hpp
@@ -59,6 +59,8 @@ class dump_weights : public callback_base {
   void on_train_begin(model *m) override;
   void on_epoch_end(model *m) override;
   std::string name() const override { return "dump weights"; }
+  void set_target_dir(const std::string& basename) { m_basename = basename; }
+  const std::string& get_target_dir() { return m_basename; }
 private:
   /** Basename for writing files. */
   std::string m_basename;
diff --git a/include/lbann/callbacks/learning_rate.hpp b/include/lbann/callbacks/learning_rate.hpp
index 20b0de08c17..c7a4d763959 100644
--- a/include/lbann/callbacks/learning_rate.hpp
+++ b/include/lbann/callbacks/learning_rate.hpp
@@ -113,9 +113,9 @@ class learning_rate : public callback_base {
 class step_learning_rate : public learning_rate {
  public:
   /** Decrease the learning rate by amt every step epochs. */
-  step_learning_rate(int step, float amt);
-  step_learning_rate(int step, float amt,
-                     std::vector<std::string> weights_names);
+  step_learning_rate(size_t step, float amt);
+  step_learning_rate(size_t step, float amt,
+                     std::vector<std::string> weights_names);
   step_learning_rate(
     const step_learning_rate&) = default;
   step_learning_rate& operator=(
@@ -128,7 +128,7 @@ class step_learning_rate : public learning_rate {
   float global_schedule(model *m) override;
  private:
   /** Number of epochs between each learning rate decrease. */
-  int m_step;
+  size_t m_step;
   /** Amount to decrease the learning rate by. */
   float m_amt;
 };
@@ -148,8 +148,8 @@ class adaptive_learning_rate : public learning_rate {
   * Decrease the learning rate by amt if accuracy does not improve for patience
   * epochs.
   */
-  adaptive_learning_rate(int64_t patience, float amt);
-  adaptive_learning_rate(int64_t patience, float amt,
+  adaptive_learning_rate(size_t patience, float amt);
+  adaptive_learning_rate(size_t patience, float amt,
                          std::vector<std::string> weights_names);
   adaptive_learning_rate(
     const adaptive_learning_rate&) = default;
@@ -163,15 +163,15 @@
   float global_schedule(model *m) override;
  private:
   /** Number of epochs to wait for improvements. */
-  int64_t m_patience;
+  size_t m_patience;
   /** Amount to decrease the learning rate by. */
   float m_amt;
   /** Current epoch. */
-  int m_cur_epoch = -1;
+  size_t m_cur_epoch = std::numeric_limits<size_t>::max();
   /** Last recorded score. */
   EvalType m_last_score = std::numeric_limits<EvalType>::max();
   /** Current number of epochs without improvement. */
-  int64_t m_wait = 0;
+  size_t m_wait = 0;
   /** Whether to adjust learning rate for current epoch. */
   bool m_adjust_learning_rate = false;
 };
@@ -192,9 +192,9 @@ class drop_fixed_learning_rate :
   * reached.
   */
   drop_fixed_learning_rate(
-    std::vector<int64_t> drop_epochs, float amt);
+    std::vector<size_t> drop_epochs, float amt);
   drop_fixed_learning_rate(
-    std::vector<int64_t> drop_epochs, float amt,
+    std::vector<size_t> drop_epochs, float amt,
     std::vector<std::string> weights_names);
   drop_fixed_learning_rate(
     const drop_fixed_learning_rate&) = default;
@@ -213,7 +213,7 @@
   * Epochs to drop learning rate at. This is stored in reverse sorted order,
   * so that the end can be examined and then popped in constant time.
*/ - std::vector m_drop_epochs; + std::vector m_drop_epochs; }; // Builder function @@ -235,11 +235,11 @@ class linear_growth_learning_rate : * Linearly increase the learning rate to reach target after num_epochs. */ linear_growth_learning_rate( - float target, int64_t num_epochs); + float target, size_t num_epochs); linear_growth_learning_rate( - float target, int64_t num_epochs, int64_t delay); + float target, size_t num_epochs, size_t delay); linear_growth_learning_rate( - float target, int64_t num_epochs, int64_t delay, + float target, size_t num_epochs, size_t delay, std::vector weights_names); linear_growth_learning_rate( const linear_growth_learning_rate&) = default; @@ -259,9 +259,9 @@ class linear_growth_learning_rate : /// Amount to increase each epoch. float m_inc; /// Number of epochs over which to scale the learning rate. - int64_t m_num_epochs; + size_t m_num_epochs; /// Number of epochs to delay before starting growth. - int64_t m_delay; + size_t m_delay; }; // Builder function @@ -277,8 +277,8 @@ build_linear_growth_learning_rate_callback_from_pbuf( */ class poly_learning_rate : public learning_rate { public: - poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter); - poly_learning_rate(double p, uint64_t n_epochs, uint64_t max_iter, double endl_r, + poly_learning_rate(double p, size_t n_epochs, size_t max_iter); + poly_learning_rate(double p, size_t n_epochs, size_t max_iter, double endl_r, std::vector weights_names); poly_learning_rate( const poly_learning_rate&) = default; @@ -296,9 +296,9 @@ class poly_learning_rate : public learning_rate { /// The exponent to compute new learning rate in poly policy double m_p; /// The number of epochs for training - uint64_t m_num_epochs; + size_t m_num_epochs; /// The maximum number of iterations until which the learning rate changes - uint64_t m_max_iter; + size_t m_max_iter; /// The minimum learning rate float m_end_lr; /// The current rate to scale the base learning rate diff --git a/include/lbann/callbacks/save_model.hpp b/include/lbann/callbacks/save_model.hpp index 67a8bb9603f..5c9e1cc696e 100644 --- a/include/lbann/callbacks/save_model.hpp +++ b/include/lbann/callbacks/save_model.hpp @@ -76,6 +76,8 @@ class save_model : public callback_base { bool ckptdir_is_fullpath=false); std::string name() const override { return "save model"; } + void set_target_dir(const std::string& dir) { m_dir = dir; } + const std::string& get_target_dir() { return m_dir; } protected: friend class lbann::model; diff --git a/include/lbann/callbacks/variable_minibatch.hpp b/include/lbann/callbacks/variable_minibatch.hpp index ce187cfb039..5bc5c37318b 100644 --- a/include/lbann/callbacks/variable_minibatch.hpp +++ b/include/lbann/callbacks/variable_minibatch.hpp @@ -41,7 +41,7 @@ namespace callback { */ class variable_minibatch : public callback_base { public: - variable_minibatch(int starting_mbsize); + variable_minibatch(size_t starting_mbsize); variable_minibatch( const variable_minibatch&) = default; variable_minibatch& operator=( @@ -64,24 +64,24 @@ class variable_minibatch : public callback_base { * behavior; also be aware of interactions with other learning rate * schedules. */ - virtual bool schedule(model *m, int& new_mbsize, float& new_lr, - int& ramp_time) = 0; + virtual bool schedule(model *m, size_t& new_mbsize, float& new_lr, + size_t& ramp_time) = 0; /// Change the learning rate of every layer in m to new_lr. 
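+  /// A sketch (assumed behavior, matching the members below) of how a ramp
+  /// requested by schedule() plays out through this helper:
+  /// @code
+  ///   m_lr_incr = (new_lr - get_current_learning_rate(m)) / ramp_time;
+  ///   m_ramp_count = ramp_time;
+  ///   // then, once per epoch while m_ramp_count > 0:
+  ///   change_learning_rate(m, get_current_learning_rate(m) + m_lr_incr);
+  ///   --m_ramp_count;
+  /// @endcode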
void change_learning_rate(model *m, float new_lr) const; /// Get the current learning rate (assumes every layer has the same one). float get_current_learning_rate(model *m) const; /// Initial mini-batch size. - int m_starting_mbsize; + size_t m_starting_mbsize; /** * The current mini-batch size for this epoch. * This is kept separately from the model's get_current_mini_batch_size() * method, as calling that in on_epoch_end returns the size of the last mini- * batch, not the "base" mini-batch. */ - int m_current_mini_batch_size; + size_t m_current_mini_batch_size; /// Current number of epochs left to ramp the learning rate. - int m_ramp_count = 0; + size_t m_ramp_count = 0; /// Amount to increment the learning rate by when ramping. float m_lr_incr = 0.0f; }; @@ -92,8 +92,8 @@ class variable_minibatch : public callback_base { */ class step_minibatch : public variable_minibatch { public: - step_minibatch(int starting_mbsize, int step, - int ramp_time = 0); + step_minibatch(size_t starting_mbsize, size_t step, + size_t ramp_time = 0); step_minibatch(const step_minibatch&) = default; step_minibatch& operator=( const step_minibatch&) = delete; @@ -102,13 +102,13 @@ class step_minibatch : public variable_minibatch { } std::string name() const override { return "step minibatch"; } protected: - bool schedule(model *m, int& new_mbsize, float& new_lr, int& ramp_time) override; + bool schedule(model *m, size_t& new_mbsize, float& new_lr, size_t& ramp_time) override; private: /// Number of epochs between mini-batch size increases. - int m_step; + size_t m_step; /// Number of steps to ramp the learning rate over. - int m_ramp_time; + size_t m_ramp_time; }; // Builder function @@ -121,19 +121,19 @@ class minibatch_schedule : public variable_minibatch { /// Represents a step in a schedule of mini-batch sizes. struct minibatch_step { /// Epoch for this schedule to start. - int epoch; + size_t epoch; /// Mini-batch size to use. - int mbsize; + size_t mbsize; /// Learning rate to use. float lr; /// Number of epochs to ramp the learning rate over. - int ramp_time; - minibatch_step(int _epoch, int _mbsize, float _lr, int _ramp_time) : + size_t ramp_time; + minibatch_step(size_t _epoch, size_t _mbsize, float _lr, size_t _ramp_time) : epoch(_epoch), mbsize(_mbsize), lr(_lr), ramp_time(_ramp_time) {} }; minibatch_schedule( - int starting_mbsize, std::vector steps); + size_t starting_mbsize, std::vector steps); minibatch_schedule( const minibatch_schedule&) = default; minibatch_schedule& operator=( @@ -143,7 +143,7 @@ class minibatch_schedule : public variable_minibatch { } std::string name() const override { return "minibatch schedule"; } protected: - bool schedule(model *m, int& new_mbsize, float& new_lr, int& ramp_time) override; + bool schedule(model *m, size_t& new_mbsize, float& new_lr, size_t& ramp_time) override; private: /// Steps in the mini-batch schedule, stored in reverse sorted order. 
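+  /// For example (hypothetical values), a schedule that grows the
+  /// mini-batch from 64 to 128 at epoch 20 and to 256 at epoch 40, ramping
+  /// the learning rate over 5 epochs each time, could be built as:
+  /// @code
+  ///   minibatch_schedule cb(64, {{20, 128, 0.2f, 5}, {40, 256, 0.4f, 5}});
+  /// @endcode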
std::vector m_steps; diff --git a/include/lbann/data_readers/data_reader.hpp b/include/lbann/data_readers/data_reader.hpp index c4470b47081..cfae9cee33c 100644 --- a/include/lbann/data_readers/data_reader.hpp +++ b/include/lbann/data_readers/data_reader.hpp @@ -44,7 +44,9 @@ #include #include #include - +#include +#include +#include #define NOT_IMPLEMENTED(n) { \ std::stringstream s; \ @@ -108,6 +110,13 @@ class generic_data_reader { virtual ~generic_data_reader() {} virtual generic_data_reader* copy() const = 0; + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_current_mini_batch_idx), + CEREAL_NVP(m_current_pos), + CEREAL_NVP(m_shuffled_indices)); + } + /// set the comm object void set_comm(lbann_comm *comm) { m_comm = comm; @@ -273,7 +282,7 @@ class generic_data_reader { * If the base offset is not specified set it to 0 * If the stride is not specified set it to batch size */ - virtual void setup(int num_io_threads, std::shared_ptr io_thread_pool); + virtual void setup(int num_io_threads, observer_ptr io_thread_pool); /** Return this data_reader's type */ virtual std::string get_type() const = 0; @@ -584,96 +593,15 @@ class generic_data_reader { /** \brief Given directory to store checkpoint files, write state to file and add to number of bytes written */ - bool save_to_checkpoint_shared(persist& p, const char *name); + bool save_to_checkpoint_shared(persist& p, execution_mode mode); /** \brief Given directory to store checkpoint files, read state from file and add to number of bytes read */ - bool load_from_checkpoint_shared(persist& p, const char *name); + bool load_from_checkpoint_shared(persist& p, execution_mode mode); - bool save_to_checkpoint_distributed(persist& p, const char *name); + bool save_to_checkpoint_distributed(persist& p, execution_mode mode); /** \brief Given directory to store checkpoint files, read state from file and add to number of bytes read */ - bool load_from_checkpoint_distributed(persist& p, const char *name); - - struct packing_header { - uint64_t current_pos; - uint64_t current_mini_batch_idx; - uint64_t data_size; - }; - bool pack_scalars(persist& p, const char *name) { - char fieldname[1024]; - lbann::persist_type persist_value; - std::string s_name(name); - if(s_name.compare("data_reader_validation") == 0){ - persist_value = persist_type::validate; - } else { - persist_value= persist_type::train; - } - - - snprintf(fieldname, sizeof(fieldname), "%s_current_mini_batch_idx", name); - p.write_uint64(persist_value, fieldname, (uint64_t) m_current_mini_batch_idx); - - int size = m_shuffled_indices.size(); - snprintf(fieldname, sizeof(fieldname), "%s_data_size", name); - p.write_uint64(persist_value, fieldname, (uint64_t) size); - - snprintf(fieldname, sizeof(fieldname), "%s_data_position", name); - p.write_uint64(persist_value, fieldname, (uint64_t) m_current_pos); - - snprintf(fieldname, sizeof(fieldname), "%s_data_indices", name); - p.write_int32_contig(persist_value, fieldname, &m_shuffled_indices[0], (uint64_t) size); - - return true; - } - - bool unpack_scalars(persist& p, struct packing_header *header, const char *name){ - char fieldname[1024]; - lbann::persist_type persist_value; - std::string s_name(name); - if(s_name.compare("data_reader_validation") == 0){ - persist_value = persist_type::validate; - } else { - persist_value= persist_type::train; - } - // Closest to non checkpoint run only loads m_current_pos - - // record minibatch index - uint64_t val; - - snprintf(fieldname, 
sizeof(fieldname), "%s_current_mini_batch_idx", name); - p.read_uint64(persist_value, fieldname, &val); - m_current_mini_batch_idx = (int) val; - - snprintf(fieldname, sizeof(fieldname), "%s_data_size", name); - p.read_uint64(persist_value, fieldname, &val); - auto size = (int) val; - - // get current position within data - snprintf(fieldname, sizeof(fieldname), "%s_data_position", name); - p.read_uint64(persist_value, fieldname, &val); - m_current_pos = (int) val; - //resize shuffled index array to hold values - m_shuffled_indices.resize(size); - - //read list of indices - snprintf(fieldname, sizeof(fieldname), "%s_data_indices", name); - p.read_int32_contig(persist_value, fieldname, &m_shuffled_indices[0], (uint64_t) size); - - if(header != nullptr){ - //shuffled data indices array size, used for resize after broadcast. Not unpacked. - header->data_size = size; - // all else, unpacked and set in unpack header. - header->current_pos = m_current_pos; - header->current_mini_batch_idx = m_current_mini_batch_idx; - } - - return true; - } - - void unpack_header(struct packing_header& header){ - m_current_pos = (int) header.current_pos; - m_current_mini_batch_idx = (int) header.current_mini_batch_idx; - } + bool load_from_checkpoint_distributed(persist& p, execution_mode mode); /// returns a const ref to the data store virtual const data_store_conduit& get_data_store() const { @@ -749,7 +677,7 @@ class generic_data_reader { double get_use_percent() const; /** - * Returns the percent of the shuffled indices that are to be + * Returns the percent of the shuffled indices that are to be * used. Code in this method was formerly in select_subset_of_data() */ double get_percent_to_use() const; @@ -799,7 +727,7 @@ class generic_data_reader { } /// returns the percent of shuffled indices that are used; - /// the returned value depends on the values returned by + /// the returned value depends on the values returned by /// get_absolute_sample_count() and get_use_percent(). 
double get_percent_to_use(); @@ -905,7 +833,7 @@ class generic_data_reader { std::vector> m_thread_buffer; - std::shared_ptr m_io_thread_pool; + observer_ptr m_io_thread_pool; /// special handling for 1B jag; each reader /// owns a unique subset of the data diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index 3c9095af07c..0ac7b7740e5 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -55,7 +55,7 @@ class image_data_reader : public generic_data_reader { // dataset specific functions void load() override; - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; + void setup(int num_io_threads, observer_ptr io_thread_pool) override; int get_num_labels() const override { return m_num_labels; @@ -96,7 +96,7 @@ class image_data_reader : public generic_data_reader { return m_image_list.at(idx); } - void preload_data_store() override; + void preload_data_store() override; protected: void copy_members(const image_data_reader &rhs, const std::vector& ds_sample_move_list = std::vector()); diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index f178f5953f6..3caf527a4ad 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -94,7 +94,7 @@ class data_reader_jag_conduit : public generic_data_reader { ~data_reader_jag_conduit() override; data_reader_jag_conduit* copy() const override { return new data_reader_jag_conduit(*this); } - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; + void setup(int num_io_threads, observer_ptr io_thread_pool) override; std::string get_type() const override { return "data_reader_jag_conduit"; diff --git a/include/lbann/data_readers/data_reader_python.hpp b/include/lbann/data_readers/data_reader_python.hpp index 9d31503c648..09b0e05e928 100644 --- a/include/lbann/data_readers/data_reader_python.hpp +++ b/include/lbann/data_readers/data_reader_python.hpp @@ -54,7 +54,7 @@ class python_reader : public generic_data_reader { int get_linearized_data_size() const override; int get_linearized_label_size() const override; - void setup(int num_io_threads, std::shared_ptr io_thread_pool) override; + void setup(int num_io_threads, observer_ptr io_thread_pool) override; void load() override; protected: diff --git a/include/lbann/data_readers/sample_list_impl.hpp b/include/lbann/data_readers/sample_list_impl.hpp index c131bf814ac..364de74bb27 100644 --- a/include/lbann/data_readers/sample_list_impl.hpp +++ b/include/lbann/data_readers/sample_list_impl.hpp @@ -14,7 +14,6 @@ #include #include -#include #include namespace lbann { @@ -386,9 +385,11 @@ ::all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm) { std::string archive; - std::stringstream ss; - cereal::BinaryOutputArchive oarchive(ss); - oarchive(data); + std::ostringstream ss; + { + cereal::BinaryOutputArchive oarchive(ss); + oarchive(data); + } // archive goes out of scope, ensuring all contents are flushed archive = ss.str(); std::vector gathered_archive(comm.get_procs_per_trainer()); diff --git a/include/lbann/execution_contexts/CMakeLists.txt b/include/lbann/execution_contexts/CMakeLists.txt new file mode 100644 index 00000000000..79bd7243399 --- /dev/null +++ b/include/lbann/execution_contexts/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the headers for this directory 
+set_full_path(THIS_DIR_HEADERS
+  execution_context.hpp
+  sgd_execution_context.hpp
+  )
+
+# Propagate the files up the tree
+set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE)
diff --git a/include/lbann/execution_contexts/execution_context.hpp b/include/lbann/execution_contexts/execution_context.hpp
new file mode 100644
index 00000000000..456cf52c795
--- /dev/null
+++ b/include/lbann/execution_contexts/execution_context.hpp
@@ -0,0 +1,155 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_EXECUTION_CONTEXT_HPP
+#define LBANN_EXECUTION_CONTEXT_HPP
+
+#include "lbann/base.hpp"
+#include "lbann/comm.hpp"
+#include "lbann/io/persist.hpp"
+#include "lbann/utils/threads/thread_pool.hpp"
+#include
+
+namespace lbann {
+
+// Forward-declare this.
+class trainer;
+
+struct termination_criteria {
+  size_t num_steps;
+};
+
+class execution_context {
+public:
+  /** Constructor. */
+  execution_context(observer_ptr<trainer> trainer, lbann_comm *comm, execution_mode mode);
+  /** Destructor. */
+  virtual ~execution_context() = default;
+
+  /** Copy execution_context. */
+  virtual std::unique_ptr<execution_context> copy_execution_context() const {
+    // Use explicit construction of unique pointer since copy
+    // constructor is protected and cannot be accessed in make_unique
+    return std::unique_ptr<execution_context>{new execution_context(*this)};
+  }
+
+  /** Archive for checkpoint and restart */
+  template <class Archive> void serialize( Archive & ar ) {
+    ar(CEREAL_NVP(m_execution_mode),
+       CEREAL_NVP(m_terminate_training),
+       CEREAL_NVP(m_step));
+  }
+
+  /** @brief Current step in the training algorithm
+   *  @details Step counts the number of iterations in the training
+   *  algorithm's internal state
+   */
+  size_t get_step() const noexcept { return m_step; }
+
+  /** @brief Increment the current step in the training algorithm
+   *  @details Increment the step count in the training
+   *  algorithm's internal state
+   */
+  void inc_step() noexcept { ++m_step; }
+
+  /** Set the mode that the trainer is currently executing. */
+  inline void set_execution_mode(execution_mode mode) noexcept {
+    m_execution_mode = mode;
+  }
+
+  /** Get the mode that the trainer is currently executing. */
+  inline execution_mode get_execution_mode() const noexcept {
+    return m_execution_mode;
+  }
+
+  /** Return true if the flag to stop training is set.
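+   *
+   *  A sketch of the polling loop a training algorithm might run against
+   *  this flag and the step counter (run_one_step and criteria are
+   *  illustrative names, not part of this class):
+   *  @code
+   *    while (ctx.get_step() < criteria.num_steps &&
+   *           !ctx.get_terminate_training()) {
+   *      run_one_step();  // one forward/backward/update iteration
+   *      ctx.inc_step();
+   *    }
+   *  @endcode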
+   */
+  bool get_terminate_training() const {
+    return m_terminate_training;
+  }
+  /** Set the terminate training flag (on or off). */
+  void set_terminate_training(bool f) {
+    m_terminate_training = f;
+  }
+
+  /** Get the execution environment */
+  trainer& get_trainer() {
+    if (!m_trainer) { LBANN_ERROR("m_trainer is null"); }
+    return *m_trainer;
+  }
+
+  thread_pool& get_io_thread_pool() const;
+
+  lbann_comm& get_comm() const {
+    if (!m_comm) { LBANN_ERROR("m_comm is null"); }
+    return *m_comm;
+  };
+
+  /** Are background I/O activities enabled by the input layers */
+  bool background_io_activity_allowed();
+
+  /** Checkpoint training_algorithm to given file descriptor */
+  virtual void save_to_checkpoint_shared(persist& p);
+  /** Restore training_algorithm by reading checkpoint from given file descriptor */
+  virtual void load_from_checkpoint_shared(persist& p);
+  virtual void save_to_checkpoint_distributed(persist& p);
+  virtual void load_from_checkpoint_distributed(persist& p);
+
+protected:
+  /** Copy constructor. */
+  execution_context(const execution_context& other) = default;
+  /** Copy assignment operator. */
+  execution_context& operator=(const execution_context& other) = default;
+  /** Move constructor. */
+  execution_context(execution_context&& other) = default;
+  /** Move assignment operator. */
+  execution_context& operator=(execution_context&& other) = default;
+
+private:
+  /** Pointer to the training context (execution environment) for the training algorithm */
+  observer_ptr<trainer> m_trainer;
+
+  /** LBANN communicator. */
+  observer_ptr<lbann_comm> m_comm;
+
+  /** The trainer's current execution mode. */
+  execution_mode m_execution_mode = execution_mode::training;
+
+  /** @brief Current step in the training algorithm
+   *  @details Step counts the number of iterations in the training
+   *  algorithm's internal state
+   */
+  size_t m_step = 0;
+
+  /** @brief Whether to terminate training.
+   *  @details If true, training will terminate immediately before
+   *  the next epoch.
+   */
+  bool m_terminate_training = false;
+};
+
+} // namespace lbann
+
+#endif // LBANN_EXECUTION_CONTEXT_HPP
diff --git a/include/lbann/execution_contexts/sgd_execution_context.hpp b/include/lbann/execution_contexts/sgd_execution_context.hpp
new file mode 100644
index 00000000000..d3a5409dddc
--- /dev/null
+++ b/include/lbann/execution_contexts/sgd_execution_context.hpp
@@ -0,0 +1,119 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_SGD_EXECUTION_CONTEXT_HPP
+#define LBANN_SGD_EXECUTION_CONTEXT_HPP
+
+#include "lbann/execution_contexts/execution_context.hpp"
+#include
+namespace lbann {
+
+struct sgd_termination_criteria : public termination_criteria {
+  size_t num_epochs;
+};
+
+
+/** @brief SGD execution context; uses the step to track the current
+ *  mini-batch step for the execution mode.
+ *  @details Step counts are not reset after each epoch.
+ */
+class sgd_execution_context final : public execution_context {
+public:
+  /** Constructor. */
+  sgd_execution_context(observer_ptr<trainer> trainer, lbann_comm *comm, execution_mode mode, size_t mini_batch_size);
+  /** Destructor. */
+  virtual ~sgd_execution_context() = default;
+
+  /** Copy constructor. */
+  sgd_execution_context(const sgd_execution_context& other) = default;
+  /** Copy assignment operator. */
+  sgd_execution_context& operator=(const sgd_execution_context& other) = default;
+  /** Move constructor. */
+  sgd_execution_context(sgd_execution_context&& other) = default;
+  /** Move assignment operator. */
+  sgd_execution_context& operator=(sgd_execution_context&& other) = default;
+  /** Copy sgd_execution_context. */
+  virtual std::unique_ptr<execution_context> copy_execution_context() const { return make_unique<sgd_execution_context>(*this); }
+
+  /** Archive for checkpoint and restart */
+  template <class Archive> void serialize( Archive & ar ) {
+    ar(cereal::base_class<execution_context>( this ),
+       CEREAL_NVP(m_epoch),
+       CEREAL_NVP(m_current_mini_batch_size),
+       CEREAL_NVP(m_effective_mini_batch_size));
+  }
+
+  /** Number of times the training set has been traversed. */
+  inline size_t get_epoch() const noexcept { return m_epoch; }
+
+  /** @brief Increment the current epoch in the execution context
+   *  @details Increment the counter tracking the number of times
+   *  that the data set has been traversed.
+   */
+  void inc_epoch() noexcept { ++m_epoch; }
+
+  /** Set the trainer's current mini-batch size. */
+  inline void set_current_mini_batch_size(size_t mini_batch_size) {
+    m_current_mini_batch_size = mini_batch_size;
+  }
+  /** Get the trainer's current mini-batch size. */
+  inline size_t get_current_mini_batch_size() const {
+    return m_current_mini_batch_size;
+  }
+  /** Get the trainer's effective mini-batch size. */
+  inline size_t get_effective_mini_batch_size() const {
+    return m_effective_mini_batch_size;
+  }
+  /** Set the trainer's effective mini-batch size. */
+  inline void set_effective_mini_batch_size(size_t mini_batch_size) {
+    m_effective_mini_batch_size = mini_batch_size;
+  }
+
+  /** Checkpoint training_algorithm to given file descriptor */
+  virtual void save_to_checkpoint_shared(persist& p);
+  /** Restore training_algorithm by reading checkpoint from given file descriptor */
+  virtual void load_from_checkpoint_shared(persist& p);
+  virtual void save_to_checkpoint_distributed(persist& p);
+  virtual void load_from_checkpoint_distributed(persist& p);
+
+private:
+  /** Number of times the training data set has been traversed. */
+  size_t m_epoch = 0;
+
+  /** Size of the current mini-batch in the model. */
+  size_t m_current_mini_batch_size;
+
+  /** The "effective" size of a minibatch.
+   *
+   *  This is the size of the minibatch across all models and used for
+   *  e.g. correctly averaging gradients from multiple models.
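+   *
+   *  A sketch of the usual data-parallel convention assumed here (the
+   *  actual value is set by the input layer during setup; num_trainers
+   *  and gradient_scale are illustrative names only):
+   *  @code
+   *    m_effective_mini_batch_size = m_current_mini_batch_size * num_trainers;
+   *    gradient_scale = 1.0 / m_effective_mini_batch_size;  // for averaging
+   *  @endcode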
+ */ + size_t m_effective_mini_batch_size; +}; + +} // namespace lbann + +#endif // LBANN_SGD_EXECUTION_CONTEXT_HPP diff --git a/include/lbann/io/persist.hpp b/include/lbann/io/persist.hpp index 409dc5ddf89..3a2201b72bf 100644 --- a/include/lbann/io/persist.hpp +++ b/include/lbann/io/persist.hpp @@ -30,36 +30,90 @@ #define LBANN_PERSIST_H #include "lbann/base.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/enum_iterator.hpp" #include "El.hpp" +#include +#include +#include +#include +#include namespace lbann { enum class persist_type { train, // data should be saved in file with train data model, // data should be saved in file with model data - validate + metrics, + validate, + testing, + prediction_context, + training_context, + testing_context, + validation_context, }; +using persist_type_iterator = enum_iterator; + +inline persist_type execution_mode_to_persist_type(execution_mode m) { + switch(m) { + case execution_mode::training: + return persist_type::training_context; + case execution_mode::validation: + return persist_type::validation_context; + case execution_mode::testing: + return persist_type::testing_context; + case execution_mode::prediction: + return persist_type::prediction_context; + // case execution_mode::tournament: + // return persist_type::tournament; + case execution_mode::invalid: + default: + LBANN_ERROR("Invalid execution mode specified"); + } +} + +inline std::string to_string(persist_type pt) { + switch(pt) { + case persist_type::model: + return "model"; + case persist_type::metrics: + return "metrics"; + case persist_type::train: + return "train"; + case persist_type::validate: + return "validate"; + case persist_type::testing: + return "test"; + case persist_type::prediction_context: + return "prediction"; + case persist_type::training_context: + return "training"; + case persist_type::validation_context: + return "validation"; + case persist_type::testing_context: + return "testing"; + default: + LBANN_ERROR("Invalid persist type specified"); + } +} + +/// @todo Fix the callback types to properly track execution phases enum class callback_type { - batch, - epoch, - validation, - inference, + model_only, + execution_context_only, + full_checkpoint, invalid }; class persist { - protected: - uint64_t m_bytes; - int m_model_fd; - int m_train_fd; - int m_validate_fd; - char m_model_filename[1024]; - char m_train_filename[1024]; - char m_validate_filename[1024]; + private: + std::map m_bytes; + std::map m_FDs; + std::map m_filenames; callback_type ckpt_type; public: - char m_checkpoint_dir[1024]; + std::string m_checkpoint_dir; public: persist(); @@ -73,18 +127,24 @@ class persist { ckpt_type = type; } - void open_checkpoint(const char *dir); + void open_checkpoint(const std::string& dir); void close_checkpoint(); - void open_restart(const char *dir); + void open_restart(const std::string& dir); void close_restart(); uint64_t get_bytes() const { - return m_bytes; + uint64_t bytes = 0; + for(auto& pt : m_bytes) { + bytes += pt.second; + } + return bytes; } void reset_bytes() { - m_bytes = 0; + for(auto& pt : m_bytes) { + pt.second = 0; + } } bool write_rank_distmat(persist_type type, const char *name, const AbsDistMat& M); @@ -117,6 +177,7 @@ class persist { bool write_datatype(persist_type type, const char *name, DataType val); bool read_datatype (persist_type type, const char *name, DataType *val); + std::string get_filename(persist_type type) const; private: int get_fd(persist_type type) const; }; @@ -145,6 +206,95 @@ bool read_double (int fd, 
const char *name, double *val); bool write_string(int fd, const char *name, const char *buf, size_t size); bool read_string(int fd, const char *name, char *buf, size_t size); +class NonexistentArchiveFile : public std::runtime_error { +public: + NonexistentArchiveFile(std::string const& filename) : std::runtime_error(std::string("Archive file not found: ") + filename) {} +}; + +template +void write_cereal_archive(C& obj, persist& p, persist_type pt, const std::string& suffix) { + std::ofstream os(p.get_filename(pt) + suffix); + if(!os.is_open()) { + throw NonexistentArchiveFile(p.get_filename(pt) + suffix); + } + cereal::XMLOutputArchive archive(os); + archive(obj); +} + +template +void write_cereal_archive(C& obj, persist& p, execution_mode mode, const std::string& suffix) { + const persist_type pt = execution_mode_to_persist_type(mode); + write_cereal_archive(obj, p, pt, suffix); +} + +template +void read_cereal_archive(C& obj, persist& p, persist_type pt, const std::string& suffix) { + std::ifstream is(p.get_filename(pt) + suffix); + if(!is.is_open()) { + throw NonexistentArchiveFile(p.get_filename(pt) + suffix); + } + cereal::XMLInputArchive archive(is); + archive(obj); +} + +template +void read_cereal_archive(C& obj, persist& p, execution_mode mode, const std::string& suffix) { + const persist_type pt = execution_mode_to_persist_type(mode); + read_cereal_archive(obj, p, pt, suffix); +} + +template +std::string create_cereal_archive_binary_string(C& obj) { + std::ostringstream ss; + { + cereal::BinaryOutputArchive archive(ss); + archive(obj); + } // archive goes out of scope, ensuring all contents are flushed + return ss.str(); +} + +template +void unpack_cereal_archive_binary_string(C& obj, const std::string& buf) { + std::istringstream ss(buf); + { + cereal::BinaryInputArchive archive(ss); + archive(obj); + } // archive goes out of scope, ensuring all contents are flushed +} + +template +void load_from_shared_cereal_archive(C& obj, persist& p, persist_type pt, + lbann_comm& comm, + const std::string& suffix) { + std::string buf; + if (comm.am_trainer_master()) { + read_cereal_archive(obj, p, pt, suffix); + buf = create_cereal_archive_binary_string(obj); + }else { + // If you are not the trainer master, still check to see if the file exists + std::ifstream is(p.get_filename(pt) + suffix); + if(!is.is_open()) { + throw NonexistentArchiveFile(p.get_filename(pt) + suffix); + } + } + + // TODO: this assumes homogeneous processors + // broadcast state from rank 0 + comm.trainer_broadcast(0, buf); + + if (!comm.am_trainer_master()) { + unpack_cereal_archive_binary_string(obj, buf); + } +} + +template +void load_from_shared_cereal_archive(C& obj, persist& p, execution_mode mode, + lbann_comm& comm, + const std::string& suffix) { + const persist_type pt = execution_mode_to_persist_type(mode); + load_from_shared_cereal_archive(obj, p, pt, comm, suffix); +} + } // namespace lbann #endif // LBANN_PERSIST_H diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index 3bf4677c7b3..74beeb6482f 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -29,11 +29,17 @@ #include "lbann/layers/io/io_layer.hpp" //#include "lbann/utils/dataset.hpp" +#include "lbann/io/persist.hpp" #include "lbann/io/data_buffers/generic_io_buffer.hpp" #include "lbann/io/data_buffers/partitioned_io_buffer.hpp" #include "lbann/models/model.hpp" #include "lbann/callbacks/imcomm.hpp" 
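+// The cereal helpers from lbann/io/persist.hpp are used below through a
+// pack/broadcast/unpack round trip. A standalone sketch of that pattern,
+// assuming only cereal (the state struct is hypothetical):
+//
+//   struct state {
+//     size_t epoch = 0;
+//     template <class Archive> void serialize(Archive& ar) { ar(epoch); }
+//   };
+//
+//   std::string pack(const state& s) {
+//     std::ostringstream ss;
+//     { cereal::BinaryOutputArchive ar(ss); ar(s); }  // scope flushes archive
+//     return ss.str();
+//   }
+//
+//   void unpack(state& s, const std::string& buf) {
+//     std::istringstream ss(buf);
+//     cereal::BinaryInputArchive ar(ss);
+//     ar(s);
+//   }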
#include "lbann/utils/omp_diagnostics.hpp" +#include +#include +#include +#include +#include #include @@ -123,6 +129,16 @@ class generic_input_layer : public io_layer { return *this; } + /** Archive for checkpoint and restart */ + template void serialize( Archive & ar ) { + ar(/*CEREAL_NVP(m_io_buffer),*/ + CEREAL_NVP(m_training_dataset), + CEREAL_NVP(m_testing_dataset), + CEREAL_NVP(m_validation_dataset)/*, + CEREAL_NVP(m_data_readers), + CEREAL_NVP(m_data_set_processed)*/); + } + template inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers); @@ -131,7 +147,6 @@ class generic_input_layer : public io_layer { description get_description() const override { auto desc = io_layer::get_description(); desc.add("Buffer", m_io_buffers[0]->get_type()); - desc.add("Background I/O", this->m_model->background_io_activity_allowed()); return desc; } @@ -152,22 +167,6 @@ class generic_input_layer : public io_layer { output.Resize(output.Height(), max_mb_size); } - auto num_io_threads = this->m_model->get_io_thread_pool()->get_num_threads(); - /// BVE FIXME foreach data reader - // in case that target_layer gets initialized beforehand - if(m_data_readers[execution_mode::training] != nullptr) { - m_data_readers[execution_mode::training]->setup(num_io_threads, this->m_model->get_io_thread_pool()); - m_data_readers[execution_mode::training]->set_rank(Layer::m_comm->get_rank_in_trainer()); - } - if(m_data_readers[execution_mode::validation] != nullptr) { - m_data_readers[execution_mode::validation]->setup(num_io_threads, this->m_model->get_io_thread_pool()); - m_data_readers[execution_mode::validation]->set_rank(Layer::m_comm->get_rank_in_trainer()); - } - if(m_data_readers[execution_mode::testing] != nullptr) { - m_data_readers[execution_mode::testing]->setup(num_io_threads, this->m_model->get_io_thread_pool()); - m_data_readers[execution_mode::testing]->set_rank(Layer::m_comm->get_rank_in_trainer()); - } - if(io_layer::m_data_set_spans_models) { calculate_num_iterations_per_epoch_training_spans_models(max_mb_size); } else { @@ -200,23 +199,28 @@ class generic_input_layer : public io_layer { * Sets up the effective (global) mini-batch size. */ void fp_setup_outputs(El::Int mini_batch_size) override { - - // Determine model mini-batch size and effective mini-batch size - // Note: If inter-model communication is activated, the effective - // mini-batch is equal to the global mini-batch size. - /// @todo This functionality should probably be moved elsewhere - mini_batch_size = get_current_mini_batch_size(); - int effective_mini_batch_size = mini_batch_size; - for (auto&& cb : this->m_model->get_callbacks()) { - if (dynamic_cast(cb) != nullptr) { - effective_mini_batch_size = get_current_global_mini_batch_size(); - break; + /// During model setup there is no valid execution context, but + /// during execution there is a context + if(this->m_model->has_valid_execution_context()) { + // Determine model mini-batch size and effective mini-batch size + // Note: If inter-model communication is activated, the effective + // mini-batch is equal to the global mini-batch size. 
+ /// @todo This functionality should probably be moved elsewhere + mini_batch_size = get_current_mini_batch_size(); + + auto effective_mini_batch_size = mini_batch_size; + for (auto&& cb : this->m_model->get_callbacks()) { + if (dynamic_cast(cb) != nullptr) { + effective_mini_batch_size = get_current_global_mini_batch_size(); + break; + } } - } - // Set mini-batch size in model - this->m_model->set_current_mini_batch_size(mini_batch_size); - this->m_model->set_effective_mini_batch_size(effective_mini_batch_size); + auto& c = static_cast(this->m_model->get_execution_context()); + // Set mini-batch size in model + c.set_current_mini_batch_size(mini_batch_size); + c.set_effective_mini_batch_size(effective_mini_batch_size); + } // Initialize matrices io_layer::fp_setup_outputs(mini_batch_size); @@ -248,7 +252,7 @@ class generic_input_layer : public io_layer { } void fp_compute() override { - execution_mode mode = this->m_model->get_execution_mode(); + execution_mode mode = this->m_model->get_execution_context().get_execution_mode(); increment_active_buffer_idx(mode); @@ -257,7 +261,7 @@ class generic_input_layer : public io_layer { // If there is no valid data and there is not already a background // thread to fetch the data, queue up the background thread if(io_buffer->num_samples_ready(mode) == 0 && !io_buffer->is_data_fetched_in_background(mode)) { - std::future background_fetch_done = this->m_model->get_io_thread_pool()->submit_job( + std::future background_fetch_done = this->m_model->get_execution_context().get_io_thread_pool().submit_job( std::bind(&generic_input_layer::fetch_data_in_background, this, get_active_buffer_idx(mode), mode)); io_buffer->set_data_fetch_future(std::move(background_fetch_done), mode); io_buffer->set_fetch_data_in_background(true, mode); @@ -297,9 +301,9 @@ class generic_input_layer : public io_layer { m_data_set_processed = io_buffer->update_data_set(get_data_reader(mode), mode); - if(!m_data_set_processed && this->m_model->background_io_activity_allowed()) { + if(!m_data_set_processed && this->m_model->get_execution_context().background_io_activity_allowed()) { int next_active_buffer = get_active_buffer_idx(mode) + 1; - std::future background_fetch_done = this->m_model->get_io_thread_pool()->submit_job( + std::future background_fetch_done = this->m_model->get_execution_context().get_io_thread_pool().submit_job( std::bind(&generic_input_layer::fetch_data_in_background, this, next_active_buffer, mode)); generic_io_buffer* next_io_buffer = m_io_buffers[next_active_buffer % m_io_buffers.size()]; next_io_buffer->set_data_fetch_future(std::move(background_fetch_done), mode); @@ -345,7 +349,7 @@ class generic_input_layer : public io_layer { } generic_data_reader *get_data_reader() const { - return get_data_reader(this->m_model->get_execution_mode()); + return get_data_reader(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_num_parallel_readers(execution_mode mode) const { @@ -354,7 +358,7 @@ class generic_input_layer : public io_layer { } virtual int get_num_parallel_readers() const { - return get_num_parallel_readers(this->m_model->get_execution_mode()); + return get_num_parallel_readers(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_num_iterations_per_epoch(execution_mode mode) const { @@ -363,7 +367,7 @@ class generic_input_layer : public io_layer { } virtual int get_num_iterations_per_epoch() const { - return get_num_iterations_per_epoch(this->m_model->get_execution_mode()); + return 
get_num_iterations_per_epoch(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_current_step_in_epoch(execution_mode mode) const { @@ -372,7 +376,7 @@ class generic_input_layer : public io_layer { } virtual int get_current_step_in_epoch() const { - return get_current_step_in_epoch(this->m_model->get_execution_mode()); + return get_current_step_in_epoch(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_mini_batch_size(execution_mode mode) const { @@ -386,7 +390,7 @@ class generic_input_layer : public io_layer { } virtual int get_last_mini_batch_size() const { - return get_last_mini_batch_size(this->m_model->get_execution_mode()); + return get_last_mini_batch_size(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_current_mini_batch_size(execution_mode mode) const { @@ -395,7 +399,7 @@ class generic_input_layer : public io_layer { } virtual int get_current_mini_batch_size() const { - return get_current_mini_batch_size(this->m_model->get_execution_mode()); + return get_current_mini_batch_size(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_global_mini_batch_size(execution_mode mode) const { @@ -414,7 +418,7 @@ class generic_input_layer : public io_layer { } virtual int get_current_global_mini_batch_size() const { - return get_current_global_mini_batch_size(this->m_model->get_execution_mode()); + return get_current_global_mini_batch_size(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_world_master_mini_batch_adjustment(execution_mode mode) const { @@ -423,7 +427,7 @@ class generic_input_layer : public io_layer { } virtual int get_world_master_mini_batch_adjustment() const { - return get_world_master_mini_batch_adjustment(this->m_model->get_execution_mode()); + return get_world_master_mini_batch_adjustment(this->m_model->get_execution_context().get_execution_mode()); } virtual int get_current_world_master_mini_batch_adjustment(execution_mode mode, int model_rank) const { @@ -432,7 +436,7 @@ class generic_input_layer : public io_layer { } virtual int get_current_world_master_mini_batch_adjustment(int model_rank) const { - return get_current_world_master_mini_batch_adjustment(this->m_model->get_execution_mode(), model_rank); + return get_current_world_master_mini_batch_adjustment(this->m_model->get_execution_context().get_execution_mode(), model_rank); } /** Calculate how many iterations are required for training, testing, @@ -519,8 +523,8 @@ class generic_input_layer : public io_layer { /** * Return the dataset associated with the current execution mode. */ - dataset& select_dataset() override { return get_dataset(m_model->get_execution_mode()); } - const dataset& select_dataset() const override { return get_dataset(m_model->get_execution_mode()); } + dataset& select_dataset() override { return get_dataset(m_model->get_execution_context().get_execution_mode()); } + const dataset& select_dataset() const override { return get_dataset(m_model->get_execution_context().get_execution_mode()); } /** * Return the first dataset with a valid (non-null) datareader. @@ -558,16 +562,22 @@ class generic_input_layer : public io_layer { * Return the sample indices fetched in the current mini-batch. 
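+   *
+   * For example (hypothetical caller; input points to this layer, and each
+   * column is assumed to correspond to one fetched sample):
+   * @code
+   *   auto* indices = input->get_sample_indices_per_mb();
+   *   for (El::Int col = 0; col < indices->Width(); ++col) {
+   *     // inspect the indices fetched for the col-th sample
+   *   }
+   * @endcode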
*/ El::Matrix* get_sample_indices_per_mb() override { - execution_mode mode = this->m_model->get_execution_mode(); + execution_mode mode = this->m_model->get_execution_context().get_execution_mode(); generic_io_buffer* io_buffer = m_io_buffers[get_active_buffer_idx(mode) % m_io_buffers.size()]; - return io_buffer->get_sample_indices_fetched_per_mb(this->m_model->get_execution_mode()); + return io_buffer->get_sample_indices_fetched_per_mb(this->m_model->get_execution_context().get_execution_mode()); } /** * Get the dimensions of the underlying data. */ const std::vector get_data_dims(int child_index = 0) const override { - const generic_data_reader *dr = get_data_reader(); + // Check the training and testing execution modes for data dimensions + const generic_data_reader *dr = get_data_reader(execution_mode::training); + // If there isn't a training data reader, use the testing data reader + if(dr == nullptr) { + dr = get_data_reader(execution_mode::testing); + } + if(dr == nullptr) { LBANN_ERROR("unable to call get_data_dims -- no valid execution mode"); } // dataset* ds = select_first_valid_dataset(); if (dr) { if(child_index == 0) { @@ -715,52 +725,30 @@ class generic_input_layer : public io_layer { bool save_to_checkpoint_shared(persist& p) const override { // save state of data readers from input layer data_reader_map_t::const_iterator it; - if(p.get_cb_type() != callback_type::validation){ + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint){ + it = this->m_data_readers.find(execution_mode::training); if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_shared(p, "data_reader_training"); + (it->second)->save_to_checkpoint_shared(p, execution_mode::training); } it = this->m_data_readers.find(execution_mode::testing); if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_shared(p, "data_reader_testing"); - } - if (m_comm->am_trainer_master()) { - p.write_uint64(persist_type::train, "reader_train_processed", - (uint64_t) m_training_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_train_total", - (uint64_t) m_training_dataset.get_total_samples()); - - p.write_uint64(persist_type::train, "reader_test_processed", - (uint64_t) m_testing_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_test_total", - (uint64_t) m_testing_dataset.get_total_samples()); - - } - } - if(p.get_cb_type() == callback_type::validation || p.get_cb_type() == callback_type::batch){ - if (m_comm->am_trainer_master()) { - p.write_uint64(persist_type::validate, "reader_validate_processed", - (uint64_t) m_validation_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::validate, "reader_validate_total", - (uint64_t) m_validation_dataset.get_total_samples()); + (it->second)->save_to_checkpoint_shared(p, execution_mode::testing); } it = this->m_data_readers.find(execution_mode::validation); if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_shared(p, "data_reader_validation"); + (it->second)->save_to_checkpoint_shared(p, execution_mode::validation); } + + if (get_comm()->am_trainer_master()) { + write_cereal_archive(*this, p, execution_mode::training, "_io.xml"); + } + } return true; } - struct dataset_header { - uint64_t train_proc; - uint64_t train_total; - uint64_t test_proc; - uint64_t test_total; - uint64_t validate_proc; - uint64_t validate_total; - }; - // 
reload state of IO from a checkpoint bool load_from_checkpoint_shared(persist& p) override { // save state of data readers from input layer @@ -768,80 +756,52 @@ class generic_input_layer : public io_layer { it = this->m_data_readers.find(execution_mode::training); if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_shared(p, "data_reader_training"); + (it->second)->load_from_checkpoint_shared(p, execution_mode::training); } it = this->m_data_readers.find(execution_mode::testing); if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_shared(p, "data_reader_testing"); - } - - // save our own state - // rank 0 reads the file - dataset_header header; - // Assume we are loading from a epoch end checkpoint - if (m_comm->am_trainer_master()) { - p.read_uint64(persist_type::train, "reader_train_processed", &header.train_proc); - p.read_uint64(persist_type::train, "reader_train_total", &header.train_total); - p.read_uint64(persist_type::train, "reader_test_processed", &header.test_proc); - p.read_uint64(persist_type::train, "reader_test_total", &header.test_total); - if(m_data_readers[execution_mode::validation] != nullptr){ - p.read_uint64(persist_type::validate, "reader_validate_processed", &header.validate_proc); - p.read_uint64(persist_type::validate, "reader_validate_total", &header.validate_total); - } + (it->second)->load_from_checkpoint_shared(p, execution_mode::testing); } - it = this->m_data_readers.find(execution_mode::validation); if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_shared(p, "data_reader_validation"); - } - // TODO: assumes homogeneous hardware - // broadcast data from rank 0 - MPI_Bcast(&header, sizeof(header), MPI_BYTE, 0, MPI_COMM_WORLD); - // set our fields - m_training_dataset.num_samples_processed() = (long) header.train_proc; - m_training_dataset.total_samples() = (long) header.train_total; - m_testing_dataset.num_samples_processed() = (long) header.test_proc; - m_testing_dataset.total_samples() = (long) header.test_total; - if(m_data_readers[execution_mode::validation] != nullptr){ - m_validation_dataset.num_samples_processed() = (long) header.validate_proc; - m_validation_dataset.total_samples() = (long) header.validate_total; + (it->second)->load_from_checkpoint_shared(p, execution_mode::validation); + } + + std::string buf; + if (get_comm()->am_trainer_master()) { + read_cereal_archive(*this, p, execution_mode::training, "_io.xml"); + buf = create_cereal_archive_binary_string(*this); + } + + // TODO: this assumes homogeneous processors + // broadcast state from rank 0 + get_comm()->trainer_broadcast(0, buf); + + if (!get_comm()->am_trainer_master()) { + unpack_cereal_archive_binary_string(*this, buf); } + return true; } bool save_to_checkpoint_distributed(persist& p) const override { // save state of data readers from input layer data_reader_map_t::const_iterator it; - if(p.get_cb_type() != callback_type::validation){ + if(p.get_cb_type() == callback_type::execution_context_only || p.get_cb_type() == callback_type::full_checkpoint) { it = this->m_data_readers.find(execution_mode::training); if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_distributed(p, "data_reader_training"); + (it->second)->save_to_checkpoint_distributed(p, execution_mode::training); } it = this->m_data_readers.find(execution_mode::testing); if ((it != this->m_data_readers.end()) && it->second) { - 
(it->second)->save_to_checkpoint_distributed(p, "data_reader_testing"); + (it->second)->save_to_checkpoint_distributed(p, execution_mode::testing); } - p.write_uint64(persist_type::train, "reader_train_processed", - (uint64_t) m_training_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_train_total", - (uint64_t) m_training_dataset.get_total_samples()); - - p.write_uint64(persist_type::train, "reader_test_processed", - (uint64_t) m_testing_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::train, "reader_test_total", - (uint64_t) m_testing_dataset.get_total_samples()); - - } - if(p.get_cb_type() == callback_type::validation || p.get_cb_type() == callback_type::batch){ - p.write_uint64(persist_type::validate, "reader_validate_processed", - (uint64_t) m_validation_dataset.get_num_samples_processed()); - p.write_uint64(persist_type::validate, "reader_validate_total", - (uint64_t) m_validation_dataset.get_total_samples()); it = this->m_data_readers.find(execution_mode::validation); if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->save_to_checkpoint_distributed(p, "data_reader_validation"); + (it->second)->save_to_checkpoint_distributed(p, execution_mode::validation); } + write_cereal_archive(*this, p, execution_mode::training, "_io.xml"); } return true; } @@ -851,37 +811,18 @@ class generic_input_layer : public io_layer { data_reader_map_t::const_iterator it; it = this->m_data_readers.find(execution_mode::training); if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_distributed(p, "data_reader_training"); + (it->second)->load_from_checkpoint_distributed(p, execution_mode::training); } it = this->m_data_readers.find(execution_mode::testing); if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_distributed(p, "data_reader_testing"); - } - // save our own state - // rank 0 reads the file - dataset_header header; - p.read_uint64(persist_type::train, "reader_train_processed", &header.train_proc); - p.read_uint64(persist_type::train, "reader_train_total", &header.train_total); - p.read_uint64(persist_type::train, "reader_test_processed", &header.test_proc); - p.read_uint64(persist_type::train, "reader_test_total", &header.test_total); - if(m_data_readers[execution_mode::validation] != nullptr){ - p.read_uint64(persist_type::validate, "reader_validate_processed", &header.validate_proc); - p.read_uint64(persist_type::validate, "reader_validate_total", &header.validate_total); + (it->second)->load_from_checkpoint_distributed(p, execution_mode::testing); } it = this->m_data_readers.find(execution_mode::validation); if ((it != this->m_data_readers.end()) && it->second) { - (it->second)->load_from_checkpoint_distributed(p, "data_reader_validation"); + (it->second)->load_from_checkpoint_distributed(p, execution_mode::validation); } - // set our fields - m_training_dataset.num_samples_processed() = (long) header.train_proc; - m_training_dataset.total_samples() = (long) header.train_total; - m_testing_dataset.num_samples_processed() = (long) header.test_proc; - m_testing_dataset.total_samples() = (long) header.test_total; - if(m_data_readers[execution_mode::validation] != nullptr){ - m_validation_dataset.num_samples_processed() = (long) header.validate_proc; - m_validation_dataset.total_samples() = (long) header.validate_total; - } + read_cereal_archive(*this, p, execution_mode::training, "_io.xml"); return true; } diff --git 
a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp index a431c98cd37..6f210b7eccc 100644 --- a/include/lbann/layers/learning/base_convolution.hpp +++ b/include/lbann/layers/learning/base_convolution.hpp @@ -37,6 +37,7 @@ #include "lbann/utils/random.hpp" #include "lbann/utils/timer.hpp" #include "lbann/utils/im2col.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -643,8 +644,8 @@ class base_convolution_layer : public Layer { const auto& local_input = get_local_prev_activations(); const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - // Useful constants - const int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto& c = static_cast(this->m_model->get_execution_context()); + const auto effective_mini_batch_size = c.get_effective_mini_batch_size(); const bool has_local_data = (local_input.Height() > 0 && local_input.Width() > 0 && local_gradient_wrt_output.Height() > 0 @@ -917,7 +918,8 @@ class base_convolution_layer : public Layer { const int num_input_channels = input_dims[0]; const int num_output_channels = output_dims[0]; const int num_per_output_channel = get_output_size() / num_output_channels; - const int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto& c = static_cast(this->m_model->get_execution_context()); + const auto effective_mini_batch_size = c.get_effective_mini_batch_size(); const auto& kernel_dims = get_kernel_dims(); const auto& kernel_size = std::accumulate(kernel_dims.begin(), kernel_dims.end(), diff --git a/include/lbann/layers/regularizers/dropout.hpp b/include/lbann/layers/regularizers/dropout.hpp index 2dfde7b21a0..9b93423cc0b 100644 --- a/include/lbann/layers/regularizers/dropout.hpp +++ b/include/lbann/layers/regularizers/dropout.hpp @@ -180,7 +180,7 @@ class dropout : public regularizer_layer { auto& output = get_activations(); // Do nothing if dropout is disabled - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(input, output); return; @@ -213,7 +213,7 @@ class dropout : public regularizer_layer { void bp_compute_cpu() { const auto& gradient_wrt_output = get_prev_error_signals(); auto& gradient_wrt_input = get_error_signals(); - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(gradient_wrt_output, gradient_wrt_input); } else { @@ -233,7 +233,7 @@ class dropout : public regularizer_layer { auto& local_output = output.Matrix(); // Do nothing if dropout is disabled or there is no local data - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(input, output); return; @@ -272,7 +272,7 @@ class dropout : public regularizer_layer { auto& local_gradient_wrt_input = gradient_wrt_input.Matrix(); // Copy error signal if dropout is disabled - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(gradient_wrt_output, gradient_wrt_input); 
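+      // A sketch of the inverted-dropout backward rule the else branch
+      // below is assumed to apply through its mask: entries kept in the
+      // forward pass propagate dy / m_keep_prob and dropped entries get 0,
+      //   dx = keep ? dy / m_keep_prob : 0;
+      // with dropout disabled, the gradient is copied through unchanged.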
} else { diff --git a/include/lbann/layers/regularizers/selu_dropout.hpp b/include/lbann/layers/regularizers/selu_dropout.hpp index a2b3d6475a3..8c7b837729f 100644 --- a/include/lbann/layers/regularizers/selu_dropout.hpp +++ b/include/lbann/layers/regularizers/selu_dropout.hpp @@ -109,7 +109,7 @@ class selu_dropout : public regularizer_layer { protected: /** Drop out units in forward propagation. */ void fp_compute() override { - if (this->m_model->get_execution_mode() != execution_mode::training || + if (this->m_model->get_execution_context().get_execution_mode() != execution_mode::training || m_keep_prob < 0.0f) { // Do nothing if dropout is disabled El::Copy(get_prev_activations(), get_activations()); @@ -141,7 +141,7 @@ class selu_dropout : public regularizer_layer { /** Adjust gradients for dropout in backprop. */ void bp_compute() override { - if (this->m_model->get_execution_mode() != execution_mode::training + if (this->m_model->get_execution_context().get_execution_mode() != execution_mode::training || m_keep_prob < 0.0f) { El::Copy(get_prev_error_signals(), get_error_signals()); } else { diff --git a/include/lbann/layers/transform/bernoulli.hpp b/include/lbann/layers/transform/bernoulli.hpp index 2f6fc0d4077..f7216d75421 100644 --- a/include/lbann/layers/transform/bernoulli.hpp +++ b/include/lbann/layers/transform/bernoulli.hpp @@ -65,7 +65,7 @@ class bernoulli_layer : public transform_layer { void fp_compute() override { auto& output = get_activations(); - if (this->m_model->get_execution_mode() == execution_mode::training) { + if (this->m_model->get_execution_context().get_execution_mode() == execution_mode::training) { bernoulli_fill(output, output.Height(), output.Width(), m_prob); } else { El::Zero(output); diff --git a/include/lbann/layers/transform/categorical_random.hpp b/include/lbann/layers/transform/categorical_random.hpp index ac756dbeb5f..555b44ff67e 100644 --- a/include/lbann/layers/transform/categorical_random.hpp +++ b/include/lbann/layers/transform/categorical_random.hpp @@ -69,7 +69,7 @@ class categorical_random_layer : public transform_layer { const auto& local_width = local_input.Width(); // Initialize output and random numbers - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); El::Zero(local_output); StarVCMat rand_mat(input.Grid(), input.Root()); if (mode == execution_mode::training) { diff --git a/include/lbann/layers/transform/discrete_random.hpp b/include/lbann/layers/transform/discrete_random.hpp index c668971726f..1da49cd16d7 100644 --- a/include/lbann/layers/transform/discrete_random.hpp +++ b/include/lbann/layers/transform/discrete_random.hpp @@ -86,7 +86,7 @@ class discrete_random_layer : public transform_layer { const auto& local_width = input.LocalWidth(); // Initialize random numbers - const auto& mode = this->m_model->get_execution_mode(); + const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode == execution_mode::training) { uniform_fill(output, 1, width, DataType(0.5), DataType(0.5)); } diff --git a/include/lbann/layers/transform/gaussian.hpp b/include/lbann/layers/transform/gaussian.hpp index 2de8e7e9af3..8d679fd4a91 100644 --- a/include/lbann/layers/transform/gaussian.hpp +++ b/include/lbann/layers/transform/gaussian.hpp @@ -70,7 +70,7 @@ class gaussian_layer : public transform_layer { void fp_compute() override { auto& output = get_activations(); - if (this->m_model->get_execution_mode() == execution_mode::training) { + 
if (this->m_model->get_execution_context().get_execution_mode() == execution_mode::training) { gaussian_fill(output, output.Height(), output.Width(), m_mean, m_stdev); } else { El::Fill(output, m_mean); } diff --git a/include/lbann/layers/transform/uniform.hpp b/include/lbann/layers/transform/uniform.hpp index acaf26952a5..b7423cb0295 100644 --- a/include/lbann/layers/transform/uniform.hpp +++ b/include/lbann/layers/transform/uniform.hpp @@ -74,7 +74,7 @@ class uniform_layer : public transform_layer { const auto& mean = (m_max + m_min) / 2; const auto& radius = (m_max - m_min) / 2; auto& output = get_activations(); - if (this->m_model->get_execution_mode() == execution_mode::training) { + if (this->m_model->get_execution_context().get_execution_mode() == execution_mode::training) { uniform_fill(output, output.Height(), output.Width(), mean, radius); } else { El::Fill(output, mean); } diff --git a/include/lbann/layers/transform/weights.hpp b/include/lbann/layers/transform/weights.hpp index bdf0de3a5e4..0b4aee13672 100644 --- a/include/lbann/layers/transform/weights.hpp +++ b/include/lbann/layers/transform/weights.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_WEIGHTS_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -185,8 +186,9 @@ class weights_layer : public transform_layer { m_workspace->Resize(local_gradient_wrt_output.Width(), 1); El::Fill(*m_workspace, one); + const auto& c = static_cast<const sgd_execution_context&>(this->m_model->get_execution_context()); // Compute gradient contribution and accumulate - const auto& scale = one / this->m_model->get_effective_mini_batch_size(); + const auto& scale = one / c.get_effective_mini_batch_size(); El::Gemv(El::NORMAL, scale, local_gradient_wrt_output, *m_workspace, zero, m_gradient->Matrix()); diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index a3cd67f45f7..094eab8e2b8 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -27,6 +27,12 @@ #ifndef LBANN_LBANN_HPP_INCLUDED #define LBANN_LBANN_HPP_INCLUDED +/// Trainers +#include "lbann/trainers/trainer.hpp" + +/// Training Algorithms +#include "lbann/training_algorithms/training_algorithm.hpp" + /// Models #include "lbann/models/directed_acyclic_graph.hpp" diff --git a/include/lbann/metrics/metric.hpp b/include/lbann/metrics/metric.hpp index d270c361bb5..c7786dce602 100644 --- a/include/lbann/metrics/metric.hpp +++ b/include/lbann/metrics/metric.hpp @@ -31,6 +31,8 @@ #include "lbann/comm.hpp" #include "lbann/utils/exception.hpp" #include "lbann/io/persist.hpp" +#include <cereal/cereal.hpp> +#include <cereal/types/utility.hpp> namespace lbann { @@ -56,6 +58,13 @@ struct metric_statistics { metric_statistics& operator=(const metric_statistics& other) = default; /** Destructor. */ ~metric_statistics() = default; + + /** Archive for checkpoint and restart */ + template <class Archive> void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_sum), + CEREAL_NVP(m_num_samples)); + } + /** Add metric value to statistics. */ void add_value(EvalType value, int num_samples = 1); /** Get mean metric value. * * If mini-batch sizes are not identical, the mean is over the * sample values rather than over the mini-batch mean values. */ EvalType get_mean() const; /** Get number of samples. */ int get_num_samples() const { return m_num_samples; } /** Reset statistics.
*/ void reset(); - - //************************************************************************ - // Checkpointing - //************************************************************************ - /** struct used to serialize mode fields in file and MPI transfer */ - struct packing_header { - double sum; - uint64_t num_samples; - }; - bool pack_scalars(persist& p); - bool unpack_scalars(persist& p, struct packing_header *header); - void unpack_header(struct packing_header& header); - }; /** Abstract base class for metric functions. @@ -102,6 +98,11 @@ class metric { /** Copy function. */ virtual metric* copy() const = 0; + /** Archive for checkpoint and restart */ + template <class Archive> void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_statistics)); + } + /** Return a string name for this metric. */ virtual std::string name() const = 0; /** Return a display unit for this metric. diff --git a/include/lbann/models/directed_acyclic_graph.hpp b/include/lbann/models/directed_acyclic_graph.hpp index a47c6a8f123..1485b80d40f 100644 --- a/include/lbann/models/directed_acyclic_graph.hpp +++ b/include/lbann/models/directed_acyclic_graph.hpp @@ -43,7 +43,7 @@ class directed_acyclic_graph_model : public model { directed_acyclic_graph_model(const directed_acyclic_graph_model& other) = default; directed_acyclic_graph_model& operator=(const directed_acyclic_graph_model& other) = default; ~directed_acyclic_graph_model() override = default; - directed_acyclic_graph_model* copy() const override { return new directed_acyclic_graph_model(*this); } + std::unique_ptr<model> copy_model() const override { return make_unique<directed_acyclic_graph_model>(*this); } std::string get_type() const override { return "directed acyclic graph"; } protected: diff --git a/include/lbann/models/model.hpp b/include/lbann/models/model.hpp index 8448f37786e..b0440a49ea9 100644 --- a/include/lbann/models/model.hpp +++ b/include/lbann/models/model.hpp @@ -30,6 +30,7 @@ #include "lbann/base.hpp" #include "lbann/comm.hpp" #include "lbann/layers/layer.hpp" +#include "lbann/execution_contexts/execution_context.hpp" #include "lbann/utils/summary.hpp" #include "lbann/utils/graph.hpp" #include "lbann/io/file_io.hpp" @@ -52,6 +53,8 @@ class Model; namespace lbann { // Forward declarations +class lbann_callback; +class training_algorithm; class callback_base; /** @brief Abstract base class for neural network models. */ @@ -63,13 +66,13 @@ class model { // =========================================== model(lbann_comm* comm, - El::Int mini_batch_size, + size_t mini_batch_size, objective_function* obj_fn, optimizer* default_optimizer = nullptr); model(const model& other); model& operator=(const model& other); virtual ~model(); - virtual model* copy() const = 0; + virtual std::unique_ptr<model> copy_model() const = 0; // =========================================== // Access functions // =========================================== @@ -129,59 +132,27 @@ class model { return m_callbacks; } - /** @brief Return the I/O thread pool */ - std::shared_ptr<thread_pool> get_io_thread_pool() { return m_io_thread_pool; } - /** @brief Get the model's comm. */ - inline lbann_comm *get_comm() const { + lbann_comm *get_comm() const { return m_comm; } - void set_execution_mode(execution_mode mode); - execution_mode get_execution_mode() const noexcept; - - /** @brief Number of times the training set has been traversed. */ - inline El::Int get_epoch() const noexcept { return m_epoch; } - - /** @brief Current mini-batch step for current execution mode. - * @details Step counts are not reset after each epoch.
- */ - El::Int get_step() const noexcept; - - /** @brief Current mini-batch step for given execution mode. - * @details Step counts are not reset after each epoch. - */ - El::Int get_step(execution_mode mode) const noexcept; - - /** @brief Set the model's current mini-batch size. */ - inline void set_current_mini_batch_size(int mini_batch_size) { - m_current_mini_batch_size = mini_batch_size; - } - /** @brief Get the model's current mini-batch size. */ - inline int get_current_mini_batch_size() const { - return m_current_mini_batch_size; + /** Check to see if there is a valid training context for the model */ + bool has_valid_execution_context() const { + return (m_execution_context != nullptr); } - /** @brief Get the model's maximum mini-batch size. */ - inline int get_max_mini_batch_size() const { - return m_max_mini_batch_size; - } - /** @brief Get the model's effective mini-batch size. */ - inline int get_effective_mini_batch_size() const { - return m_effective_mini_batch_size; - } - /** @brief Set the model's effective mini-batch size. */ - inline void set_effective_mini_batch_size(int mini_batch_size) { - m_effective_mini_batch_size = mini_batch_size; - } - int get_num_iterations_per_epoch(execution_mode mode) const; - /** @brief Return true if the flag to stop training is set. */ - bool get_terminate_training() const { - return m_terminate_training; + /** Grab the training context of the model */ + const execution_context& get_execution_context() const { + if(m_execution_context == nullptr) { + LBANN_ERROR("execution context is not set"); + } + return *m_execution_context; } - /** @brief Set the terminate training flag (on or off). */ - void set_terminate_training(bool f) { - m_terminate_training = f; + + /** Grab the training context of the model */ + execution_context& get_execution_context() { + return const_cast<execution_context&>(static_cast<const model&>(*this).get_execution_context()); } // =========================================== @@ -216,6 +187,11 @@ */ optimizer* create_optimizer() const; + /** Get the trainer's maximum mini-batch size. */ + inline size_t get_max_mini_batch_size() const { + return m_max_mini_batch_size; + } + /** @brief Set a flag that can be used to enable / disable the * background I/O activities */ @@ -224,27 +200,15 @@ /** @brief Are background I/O activities enabled by the input layers */ bool background_io_activity_allowed() { return m_background_io_allowed; } + size_t get_num_iterations_per_epoch(execution_mode mode) const; + // =========================================== // Setup // =========================================== /** @details Must be called after model specification and before * execution. */ - virtual void setup(std::shared_ptr<thread_pool> io_thread_pool); - - // =========================================== - // Execution - // =========================================== - - /** @brief Evaluate model. */ - virtual void evaluate(execution_mode mode, int num_batches=0); - - /** @brief Train model. */ - virtual void train(int num_epochs, int num_batches=0); - - /** @brief Complete any background I/O data fetch for the execution - mode requested */ - virtual void collect_background_data_fetch(execution_mode mode); + virtual void setup(); virtual void make_data_store_preloaded(execution_mode mode); @@ -292,9 +256,6 @@ protected: - /** @brief Check if the model execution mode is valid. */ - virtual bool is_execution_mode_valid(execution_mode mode) const; - /** @brief Reorder layer list with a gather.
* * The new layer list is the same length as @c gather_indices and @@ -353,19 +314,31 @@ */ virtual void setup_weights(); +public: + // =========================================== + // Execution + // =========================================== + /** @brief Reset model pointer and execution mode. */ - virtual void reset_mode_and_model(execution_mode mode); + virtual void reset_mode(execution_context& context, execution_mode mode); /** @brief Reset model statistics for an epoch. */ virtual void reset_epoch_statistics(execution_mode mode); - /** @brief Evaluate model on a mini-batch */ - virtual bool evaluate_mini_batch(execution_mode mode); - /** @brief Train model on a mini-batch. */ - virtual bool train_mini_batch(); + + /** @brief Check if the trainer execution mode is valid for this model. @todo this should be moved to the trainer when the data readers move. */ + virtual bool is_execution_mode_valid(execution_mode mode) const; + + /** @brief Complete any background I/O data fetch for the execution mode requested */ + virtual void collect_background_data_fetch(execution_mode mode); /** @brief Forward propagation step. */ virtual void forward_prop(execution_mode mode); /** @brief Backward propagation step. */ virtual void backward_prop(); + /** Evaluate any metrics in the model */ + virtual void evaluate_metrics(execution_mode mode, + size_t current_mini_batch_size); /** @brief Clear each optimizer's gradient. * * This must be called before training forward prop since layers @@ -389,22 +362,6 @@ /** @brief Execute callbacks at end of setup. */ virtual void do_setup_end_cbs(); - /** @brief Execute callbacks at start of training. */ - virtual void do_train_begin_cbs(); - /** @brief Execute callbacks at end of training. */ - virtual void do_train_end_cbs(); - /** @brief Execute callbacks at start of evaluation. */ - virtual void do_evaluate_begin_cbs(execution_mode mode); - /** @brief Execute callbacks at end of evaluation. */ - virtual void do_evaluate_end_cbs(execution_mode mode); - /** @brief Execute callbacks at start of epoch. */ - virtual void do_epoch_begin_cbs(); - /** @brief Execute callbacks at end of epoch. */ - virtual void do_epoch_end_cbs(); - /** @brief Execute callbacks at start of mini-batch. */ - virtual void do_batch_begin_cbs(execution_mode mode); - /** @brief Execute callbacks at end of mini-batch. */ - virtual void do_batch_end_cbs(execution_mode mode); /** @brief Execute callbacks at start of model forward propagation. */ virtual void do_model_forward_prop_begin_cbs(execution_mode mode); /** @brief Execute callbacks at end of model forward propagation. */ @@ -432,6 +389,9 @@ private: + /** Pointer to the execution context object used for training or evaluating this model */ + observer_ptr<execution_context> m_execution_context; + /** @brief LBANN communicator. */ lbann_comm* m_comm; @@ -441,37 +401,6 @@ */ std::string m_name; - /** @brief Current execution mode. */ - execution_mode m_execution_mode = execution_mode::training; - - /** @brief Number of times the training data set has been traversed. */ - El::Int m_epoch = 0; - - /** @brief Number of mini-batch steps performed. - * @details Step counts are not reset after each epoch. - */ - std::map<execution_mode, El::Int> m_step; - - /** @brief Whether to terminate training. - * @details If true, training will terminate immediately before - * the next epoch. - */ - bool m_terminate_training = false; - - /** @brief Size of the current mini-batch in the model.
*/ - int m_current_mini_batch_size; /** @details Maximum possible minibatch size supported by layers in * this model. Note that this is local to the particular model, * not across multiple models. */ - int m_max_mini_batch_size; - /** @brief The "effective" size of a minibatch. - * - * This is the size of the minibatch across all models and used for - * e.g. correctly averaging gradients from multiple models. - */ - int m_effective_mini_batch_size; - /** @brief Tensor operations. * @details The list is in execution order for forward propagation. */ @@ -480,6 +409,12 @@ /** @brief Trainable parameters. */ std::vector<weights*> m_weights; + /** @details Maximum possible minibatch size supported by layers in + * this model. Note that this is local to the particular model, + * not across multiple models. + */ + size_t m_max_mini_batch_size; + /** @details If a layer needs to construct an optimizer during * setup, it will make a copy of the default optimizer. This object * is just used to create copies and is not actually used for @@ -498,9 +433,6 @@ /** @brief Current callbacks to process. */ std::vector<lbann_callback*> m_callbacks; - /** @brief Threads available for I/O */ - std::shared_ptr<thread_pool> m_io_thread_pool; - /** @brief Flag that allows input layers to fetch data in the background */ bool m_background_io_allowed = true; diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index 2ec5fa0bf82..94e7bb6309a 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -33,6 +33,7 @@ #include "lbann/proto/proto_common.hpp" #include "lbann/transforms/transform.hpp" #include "lbann/transforms/transform_pipeline.hpp" +#include "lbann/trainers/trainer.hpp" #include #include namespace lbann_data { class Weights; namespace lbann { namespace proto { +/** Construct a trainer specified with a prototext. */ +std::unique_ptr<trainer> construct_trainer(lbann_comm* comm, + const lbann_data::Trainer& proto_trainer); + /** Construct a model specified with a prototext. */ std::unique_ptr<model> construct_model( lbann_comm* comm, const std::map<execution_mode, generic_data_reader*>& data_readers, const lbann_data::Optimizer& proto_opt, + const lbann_data::Trainer& proto_trainer, const lbann_data::Model& proto_model); /** Construct a layer graph specified with a prototext. */ std::vector<std::unique_ptr<Layer>> construct_layer_graph( lbann_comm* comm, const std::map<execution_mode, generic_data_reader*>& data_readers, + const lbann_data::Trainer& proto_trainer, const lbann_data::Model& proto_model); /** Construct a layer specified with prototext.
*/ diff --git a/include/lbann/proto/proto_common.hpp b/include/lbann/proto/proto_common.hpp index 72fc2397651..158bd284961 100644 --- a/include/lbann/proto/proto_common.hpp +++ b/include/lbann/proto/proto_common.hpp @@ -32,6 +32,7 @@ // Forward declaration of protobuf classes namespace lbann_data { class LbannPB; +class Trainer; } namespace lbann { diff --git a/include/lbann/trainers/CMakeLists.txt b/include/lbann/trainers/CMakeLists.txt new file mode 100644 index 00000000000..827647c3c7a --- /dev/null +++ b/include/lbann/trainers/CMakeLists.txt @@ -0,0 +1,7 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + trainer.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/trainers/trainer.hpp b/include/lbann/trainers/trainer.hpp new file mode 100644 index 00000000000..3b90b9cd648 --- /dev/null +++ b/include/lbann/trainers/trainer.hpp @@ -0,0 +1,162 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRAINER_HPP +#define LBANN_TRAINER_HPP + +#include "lbann/base.hpp" +#include "lbann/comm.hpp" +#include "lbann/models/model.hpp" +#include "lbann/execution_contexts/execution_context.hpp" +#include "lbann/io/persist.hpp" +#include "lbann/utils/threads/thread_pool.hpp" +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +namespace lbann { + +// Forward-declare this. +class lbann_callback; +class training_algorithm; +class termination_criteria; + +/** Create a hash function for hashing a std::pair type */ +struct pair_hash +{ + template <class T1, class T2> + std::size_t operator() (const std::pair<T1, T2> &pair) const + { + using underlying_t = typename std::underlying_type<T2>::type; + return std::hash<T1>()(pair.first) ^ std::hash<underlying_t>()(static_cast<underlying_t>(pair.second)); + } +}; + +/** Represents an LBANN trainer and its context. */ +class trainer { +public: + + /** Constructor. */ + trainer(lbann_comm *comm); + + /** Copy constructor. */ + trainer(const trainer& other); + /** Copy assignment operator. */ + trainer& operator=(const trainer& other); + /** Destructor.
*/ + ~trainer(); + + /** Set the trainer's name; this is an arbitrary string + * that may be useful in multi-trainer scenarios, e.g, + * LTFB, jag + */ + void set_name(std::string const& name); + + /** Return the trainer's name; this is an arbitrary string + * that may be useful in multi-trainer scenarios, e.g, + * LTFB, jag + */ + std::string get_name() const { + return m_name; + } + + /** Human-readable description. */ + description get_description() const; + + /** Set up the trainer. */ + void setup(std::unique_ptr<thread_pool> io_thread_pool); + + using execution_context_key_pair_t = typename std::pair<observer_ptr<model>, execution_mode>; + + execution_context_key_pair_t + check_and_build_execution_context(training_algorithm& alg, + observer_ptr<model> model, + execution_mode mode); + + execution_context_key_pair_t + check_and_build_execution_context(const execution_context& c, + model& model, + execution_mode mode); + + execution_context& get_execution_context(observer_ptr<model> model, + execution_mode mode); + + execution_context& get_execution_context(execution_context_key_pair_t key); + + void delete_execution_context(execution_context_key_pair_t key); + + void for_each_execution_context(std::function<void(observer_ptr<execution_context>)> fn); + + void apply(training_algorithm& alg, + observer_ptr<model> model, + execution_mode mode, + termination_criteria const& term_criteria); + + void train(observer_ptr<model> model, El::Int num_epochs, El::Int num_batches=0); + + void evaluate(observer_ptr<model> model, execution_mode mode, El::Int num_batches=0); + + /** Return the I/O thread pool */ + thread_pool& get_io_thread_pool() const { + if (!m_io_thread_pool) { LBANN_ERROR("m_io_thread_pool is null"); } + return *(m_io_thread_pool.get()); + } + + /** Get the trainer's comm. */ + inline lbann_comm *get_comm() const { + return m_comm; + } + + /** Set a flag that can be used to enable / disable the background I/O activities */ + void allow_background_io_activity(bool enable) { m_background_io_allowed = enable; } + + /** Are background I/O activities enabled by the input layers */ + bool background_io_activity_allowed() { return m_background_io_allowed; } + +private: + + /** Give trainer a name. */ + std::string m_name; + + /** Communicator for the trainer. */ + lbann_comm *m_comm; + + /** Threads available for I/O */ + std::unique_ptr<thread_pool> m_io_thread_pool; + + /** Flag that allows input layers to fetch data in the background */ + bool m_background_io_allowed; + + /** @brief Map from model and execution mode to its execution context */ + std::unordered_map<std::pair<observer_ptr<model>, execution_mode>, + std::unique_ptr<execution_context>, + pair_hash> m_model_execution_context; +}; + +} // namespace lbann + +#endif // LBANN_TRAINER_HPP diff --git a/include/lbann/training_algorithms/CMakeLists.txt b/include/lbann/training_algorithms/CMakeLists.txt new file mode 100644 index 00000000000..2240711572d --- /dev/null +++ b/include/lbann/training_algorithms/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the headers for this directory +set_full_path(THIS_DIR_HEADERS + training_algorithm.hpp + sgd_training_algorithm.hpp + ) + +# Propagate the files up the tree +set(HEADERS "${HEADERS}" "${THIS_DIR_HEADERS}" PARENT_SCOPE) diff --git a/include/lbann/training_algorithms/sgd_training_algorithm.hpp b/include/lbann/training_algorithms/sgd_training_algorithm.hpp new file mode 100644 index 00000000000..a974f9e06a5 --- /dev/null +++ b/include/lbann/training_algorithms/sgd_training_algorithm.hpp @@ -0,0 +1,106 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_SGD_TRAINING_ALGORITHM_HPP +#define LBANN_SGD_TRAINING_ALGORITHM_HPP + +#include "lbann/training_algorithms/training_algorithm.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" + +namespace lbann { + +/** @brief Base class for LBANN SGD-family training algorithms. */ +class sgd_training_algorithm : public training_algorithm { +public: + + /** Constructor. */ + sgd_training_algorithm() {}; + /** Copy constructor. */ + sgd_training_algorithm(const sgd_training_algorithm& other) = default; + /** Copy assignment operator. */ + sgd_training_algorithm& operator=(const sgd_training_algorithm& other) = default; + /** Move constructor. */ + sgd_training_algorithm(sgd_training_algorithm&& other) = default; + /** Move assignment operator. */ + sgd_training_algorithm& operator=(sgd_training_algorithm&& other) = default; + /** Destructor. */ + virtual ~sgd_training_algorithm() = default; + /** Copy training_algorithm. */ + // virtual sgd_training_algorithm* copy() const = default; + + // =========================================== + // Execution + // =========================================== + + /** Apply the training algorithm to the model with the provided + context and execution mode */ + void apply(execution_context& c, + model& model, + execution_mode mode, + termination_criteria const& term_criteria) override; + + /** Train a model using an iterative SGD solver. */ + void train(sgd_execution_context& c, + model& model, + size_t num_epochs, size_t num_batches=0); + + /** Evaluate a model using the forward pass of an SGD solver. */ + void evaluate(sgd_execution_context& c, + model& model, + execution_mode mode, size_t num_batches=0); + +protected: + /** Train model on one step / mini-batch of an SGD forward pass */ + virtual bool train_mini_batch(sgd_execution_context& c, model& model); + + /** Evaluate model on one step / mini-batch of an SGD forward pass */ + virtual bool evaluate_mini_batch(sgd_execution_context& c, model& model, execution_mode mode); + + //////////////////////////////////////////////////////////// + // Callbacks + //////////////////////////////////////////////////////////// + + /** Execute callbacks at start of training. */ + virtual void do_train_begin_cbs(model& model); + /** Execute callbacks at end of training. */ + virtual void do_train_end_cbs(model& model); + /** Execute callbacks at start of evaluation. 
*/ + virtual void do_evaluate_begin_cbs(model& model, execution_mode mode); + /** Execute callbacks at end of evaluation. */ + virtual void do_evaluate_end_cbs(model& model, execution_mode mode); + /** Execute callbacks at start of epoch. */ + virtual void do_epoch_begin_cbs(model& model); + /** Execute callbacks at end of epoch. */ + virtual void do_epoch_end_cbs(model& model); + /** Execute callbacks at start of mini-batch. */ + virtual void do_batch_begin_cbs(model& model, execution_mode mode); + /** Execute callbacks at end of mini-batch. */ + virtual void do_batch_end_cbs(model& model, execution_mode mode); +}; + +} // namespace lbann + +#endif // LBANN_SGD_TRAINING_ALGORITHM_HPP diff --git a/include/lbann/training_algorithms/training_algorithm.hpp b/include/lbann/training_algorithms/training_algorithm.hpp new file mode 100644 index 00000000000..79ec661ef7a --- /dev/null +++ b/include/lbann/training_algorithms/training_algorithm.hpp @@ -0,0 +1,66 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_TRAINING_ALGORITHM_HPP +#define LBANN_TRAINING_ALGORITHM_HPP + +#include "lbann/base.hpp" +#include "lbann/execution_contexts/execution_context.hpp" + +namespace lbann { + +// Forward-declare this. +class execution_context; +class model; + +/** Base class for LBANN training_algorithms. */ +class training_algorithm { +public: + + /** Constructor. */ + training_algorithm() {}; + /** Copy constructor. */ + training_algorithm(const training_algorithm& other) = default; + /** Copy assignment operator. */ + training_algorithm& operator=(const training_algorithm& other) = default; + /** Move constructor. */ + training_algorithm(training_algorithm&& other) = default; + /** Move assignment operator. */ + training_algorithm& operator=(training_algorithm&& other) = default; + /** Destructor. */ + virtual ~training_algorithm() = default; + /** Copy training_algorithm. 
*/ + // virtual training_algorithm* copy() const = default; + + virtual void apply(execution_context& context, + model& model, + execution_mode mode, + termination_criteria const& term_criteria) = 0; +}; + +} // namespace lbann + +#endif // LBANN_TRAINING_ALGORITHM_HPP diff --git a/include/lbann/utils/CMakeLists.txt b/include/lbann/utils/CMakeLists.txt index c510021d3ff..230ff88b506 100644 --- a/include/lbann/utils/CMakeLists.txt +++ b/include/lbann/utils/CMakeLists.txt @@ -8,6 +8,7 @@ set_full_path(THIS_DIR_HEADERS dataset.hpp description.hpp entrywise_operator.hpp + enum_iterator.hpp exception.hpp factory.hpp factory_error_policies.hpp diff --git a/include/lbann/utils/dataset.hpp b/include/lbann/utils/dataset.hpp index 2c1373f3807..154a47c8d44 100644 --- a/include/lbann/utils/dataset.hpp +++ b/include/lbann/utils/dataset.hpp @@ -28,6 +28,7 @@ #define LBANN_DATASET_HPP_INCLUDED #include "lbann/data_readers/data_reader.hpp" +#include <cereal/cereal.hpp> namespace lbann { @@ -38,6 +39,11 @@ class dataset { // the data reader. dataset(const dataset& other) = default; dataset& operator=(const dataset& other) = default; + template <class Archive> void serialize( Archive & ar ) { + ar(CEREAL_NVP(m_num_samples_processed), + CEREAL_NVP(m_total_samples)); + } + long get_num_samples_processed() const { return m_num_samples_processed; } long& num_samples_processed() { return m_num_samples_processed; } long get_total_samples() const { return m_total_samples; } diff --git a/include/lbann/utils/enum_iterator.hpp b/include/lbann/utils/enum_iterator.hpp new file mode 100644 index 00000000000..247420f0e90 --- /dev/null +++ b/include/lbann/utils/enum_iterator.hpp @@ -0,0 +1,57 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license.
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_ENUM_ITERATOR_H +#define LBANN_ENUM_ITERATOR_H + +#include <type_traits> + +namespace lbann { + +/** @brief Create an iterator that goes over a contiguous (unit-step) + enum class */ +template < typename C, C beginVal, C endVal> +class enum_iterator { + typedef typename std::underlying_type<C>::type val_t; + int val; +public: + enum_iterator(const C & f) : val(static_cast<val_t>(f)) {} + enum_iterator() : val(static_cast<val_t>(beginVal)) {} + enum_iterator operator++() { + ++val; + return *this; + } + C operator*() { return static_cast<C>(val); } + enum_iterator begin() { return *this; } //default ctor is good + enum_iterator end() { + static const enum_iterator endIter=++enum_iterator(endVal); // cache it + return endIter; + } + bool operator!=(const enum_iterator& i) { return val != i.val; } +}; + +} +#endif // LBANN_ENUM_ITERATOR_H diff --git a/include/lbann/utils/graph.hpp b/include/lbann/utils/graph.hpp index f9d4522762a..d81e66ab3a7 100644 --- a/include/lbann/utils/graph.hpp +++ b/include/lbann/utils/graph.hpp @@ -36,7 +36,7 @@ namespace graph { /** Print the nodes and edges of a graph to an output stream. */ void print(const std::set<El::Int>& nodes, const std::map<El::Int,std::set<El::Int>>& edges, - std::ostream& os = std::cout); + std::ostream& os); /** Get nodes adjacent to a given node. */ std::set<El::Int> get_neighbors(El::Int node, diff --git a/include/lbann/utils/lbann_library.hpp b/include/lbann/utils/lbann_library.hpp index 215a63ad542..5456cfac2f3 100644 --- a/include/lbann/utils/lbann_library.hpp +++ b/include/lbann/utils/lbann_library.hpp @@ -34,18 +34,24 @@ namespace lbann { const int lbann_default_random_seed = 42; -std::unique_ptr<thread_pool> construct_io_thread_pool(lbann_comm *comm); +std::unique_ptr<trainer> construct_trainer(lbann_comm *comm, + lbann_data::Trainer* pb_trainer, + options *opts); + +std::unique_ptr<thread_pool> construct_io_thread_pool(lbann_comm *comm, options *opts); std::unique_ptr<model> build_model_from_prototext( int argc, char **argv, + const lbann_data::Trainer* pb_trainer, lbann_data::LbannPB &pb, lbann_comm *comm, - std::shared_ptr<thread_pool> io_thread_pool, + options *opts, + thread_pool& io_thread_pool, bool first_model); -void print_lbann_configuration( - lbann_data::Model *pb_model, lbann_comm *comm, - int io_threads_per_process, int io_threads_offset); +void print_lbann_configuration(lbann_comm *comm, + int io_threads_per_process, + int io_threads_offset); } // namespace lbann diff --git a/include/lbann/utils/random.hpp b/include/lbann/utils/random.hpp index 25cd5c1a0aa..92074e30847 100644 --- a/include/lbann/utils/random.hpp +++ b/include/lbann/utils/random.hpp @@ -247,8 +247,8 @@ void bernoulli_fill_procdet(AbsDistMat& mat, El::Int m, El::Int n, double p = 0.5);
void uniform_fill_procdet(AbsDistMat& mat, El::Int m, El::Int n, DataType center = 0.0f, DataType radius = 1.0f); -bool save_rng_to_checkpoint_shared(persist& p, const lbann_comm* comm); -bool load_rng_from_checkpoint_shared(persist& p, const lbann_comm* comm); +bool save_rng_to_checkpoint(persist& p, const lbann_comm* comm); +bool load_rng_from_checkpoint(persist& p, const lbann_comm* comm); template class rng { diff --git a/model_zoo/lbann.cpp b/model_zoo/lbann.cpp index 7b0580a73b2..07f23433c8a 100644 --- a/model_zoo/lbann.cpp +++ b/model_zoo/lbann.cpp @@ -71,16 +71,25 @@ int main(int argc, char *argv[]) { //to activate, must specify --st_on on cmd line stack_profiler::get()->activate(comm->get_rank_in_world()); - // Initalize a global I/O thread pool - std::shared_ptr<thread_pool> io_thread_pool = construct_io_thread_pool(comm.get()); - + // Load the prototexts specified on the command line auto pbs = protobuf_utils::load_prototext(master, argc, argv); - lbann_data::LbannPB pb = *(pbs[0]); + // Optionally over-ride some values in the prototext for each model + for(size_t i = 0; i < pbs.size(); i++) { + get_cmdline_overrides(*comm, *(pbs[i])); + } + + lbann_data::LbannPB& pb = *(pbs[0]); + lbann_data::Trainer *pb_trainer = pb.mutable_trainer(); + + // Construct the trainer + std::unique_ptr<trainer> trainer = construct_trainer(comm.get(), pb_trainer, opts); + + thread_pool& io_thread_pool = trainer->get_io_thread_pool(); lbann_data::Model *pb_model = pb.mutable_model(); - auto model = build_model_from_prototext(argc, argv, pb, - comm.get(), io_thread_pool, true); + auto model = build_model_from_prototext(argc, argv, pb_trainer, pb, + comm.get(), opts, io_thread_pool, true); if (opts->has_string("create_tarball")) { return EXIT_SUCCESS; @@ -89,10 +98,10 @@ if (!
opts->get_bool("exit_after_setup")) { // Train model - model->train(pb_model->num_epochs()); + trainer->train(model.get(), pb_model->num_epochs()); // Evaluate model on test set - model->evaluate(execution_mode::testing); + trainer->evaluate(model.get(), execution_mode::testing); //has no affect unless option: --st_on was given stack_profiler::get()->print(); diff --git a/model_zoo/lbann2.cpp b/model_zoo/lbann2.cpp index 6dcd3272cd3..b9147096c44 100644 --- a/model_zoo/lbann2.cpp +++ b/model_zoo/lbann2.cpp @@ -54,17 +54,26 @@ int main(int argc, char *argv[]) { std::ostringstream err; - // Initalize a global I/O thread pool - std::shared_ptr<thread_pool> io_thread_pool = construct_io_thread_pool(comm.get()); - auto pbs = protobuf_utils::load_prototext(master, argc, argv); + // Optionally over-ride some values in the prototext for each model + for(size_t i = 0; i < pbs.size(); i++) { + get_cmdline_overrides(*comm, *(pbs[i])); + } + + lbann_data::LbannPB& pb = *(pbs[0]); + lbann_data::Trainer *pb_trainer = pb.mutable_trainer(); + + // Construct the trainer + std::unique_ptr<trainer> trainer = construct_trainer(comm.get(), pb_trainer, opts); + + thread_pool& io_thread_pool = trainer->get_io_thread_pool(); - auto model_1 = build_model_from_prototext(argc, argv, *(pbs[0]), - comm.get(), io_thread_pool, true); + auto model_1 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[0]), + comm.get(), opts, io_thread_pool, true); std::unique_ptr<model> model_2; if (pbs.size() > 1) { - model_2 = build_model_from_prototext(argc, argv, *(pbs[1]), - comm.get(), io_thread_pool, false); + model_2 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[1]), + comm.get(), opts, io_thread_pool, false); } // Load layer weights from checkpoint if checkpoint directory given if(opts->has_string("ckpt_dir")){ @@ -80,11 +89,11 @@ // When using checkpoint states, skip training as those could be the result // of checkpointing by steps.
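// Condensed sketch of the driver flow the model_zoo mains share after this
// patch (error handling and option parsing omitted; `pb` is the first loaded
// prototext and `opts` the parsed command-line options):
//
//   lbann_data::Trainer *pb_trainer = pb.mutable_trainer();
//   std::unique_ptr<trainer> trainer = construct_trainer(comm.get(), pb_trainer, opts);
//   thread_pool& io_thread_pool = trainer->get_io_thread_pool();
//   auto model = build_model_from_prototext(argc, argv, pb_trainer, pb,
//                                           comm.get(), opts, io_thread_pool, true);
//   trainer->train(model.get(), pb.mutable_model()->num_epochs());
//   trainer->evaluate(model.get(), execution_mode::testing);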
if (!opts->has_string("no_model1_train")){ - model_1->train( pb_model.num_epochs() ); + trainer->train(model_1.get(), pb_model.num_epochs() ); } // Evaluate model 1 unless it is set to skip if (!opts->has_string("no_model1_eval")){ - model_1->evaluate(execution_mode::testing); + trainer->evaluate(model_1.get(), execution_mode::testing); } if (model_2 != nullptr) { @@ -108,8 +117,8 @@ std::cerr << "\n STARTING train - model 2\n\n"; } const lbann_data::Model pb_model_2 = pbs[1]->model(); - model_2->train( pb_model_2.num_epochs() ); - model_2->evaluate(execution_mode::testing); + trainer->train(model_2.get(), pb_model_2.num_epochs() ); + trainer->evaluate(model_2.get(), execution_mode::testing); } } catch (std::exception& e) { diff --git a/model_zoo/lbann_aecycgan.cpp b/model_zoo/lbann_aecycgan.cpp index 6b2a7eb6d2d..ed150eb9e4d 100644 --- a/model_zoo/lbann_aecycgan.cpp +++ b/model_zoo/lbann_aecycgan.cpp @@ -53,26 +53,35 @@ int main(int argc, char *argv[]) { std::ostringstream err; - // Initalize a global I/O thread pool - std::shared_ptr<thread_pool> io_thread_pool = construct_io_thread_pool(comm.get()); - auto pbs = protobuf_utils::load_prototext(master, argc, argv); + // Optionally over-ride some values in the prototext for each model + for(size_t i = 0; i < pbs.size(); i++) { + get_cmdline_overrides(*comm, *(pbs[i])); + } + + lbann_data::LbannPB& pb = *(pbs[0]); + lbann_data::Trainer *pb_trainer = pb.mutable_trainer(); + + // Construct the trainer + std::unique_ptr<trainer> trainer = construct_trainer(comm.get(), pb_trainer, opts); + + thread_pool& io_thread_pool = trainer->get_io_thread_pool(); - auto model_1 = build_model_from_prototext(argc, argv, *(pbs[0]), - comm.get(), io_thread_pool, true); //ae + auto model_1 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[0]), + comm.get(), opts, io_thread_pool, true); //ae std::unique_ptr<model> model_2, //cycgan model_3; //ae+cycgan if (pbs.size() > 1) { - model_2 = build_model_from_prototext(argc, argv, *(pbs[1]), - comm.get(), io_thread_pool, false); + model_2 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[1]), + comm.get(), opts, io_thread_pool, false); } if (pbs.size() > 2) { - model_3 = build_model_from_prototext(argc, argv, *(pbs[2]), - comm.get(), io_thread_pool, false); + model_3 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[2]), + comm.get(), opts, io_thread_pool, false); } @@ -81,16 +90,16 @@ const lbann_data::Model pb_model_3 = pbs[2]->model(); if(master) std::cout << " Pre-train autoencoder " << std::endl; - model_1->train(pb_model_1.num_epochs()); - model_1->evaluate(execution_mode::testing); + trainer->train(model_1.get(), pb_model_1.num_epochs()); + trainer->evaluate(model_1.get(), execution_mode::testing); auto ae_weights = model_1->get_weights(); model_2->copy_trained_weights_from(ae_weights); model_3->copy_trained_weights_from(ae_weights); //Train cycle GAN if (master) std::cerr << "\nSTARTING train - cycle GAN \n\n"; - model_2->train(pb_model_2.num_epochs()); - model_2->evaluate(execution_mode::testing); + trainer->train(model_2.get(), pb_model_2.num_epochs()); + trainer->evaluate(model_2.get(), execution_mode::testing); auto model2_weights = model_2->get_weights(); //Evaluate on pretrained autoencoder @@ -99,7 +108,7 @@ if(master) std::cout << " Save AE + cycleGAN" << std::endl; model_3->save_model(); if(master) std::cout << " Evaluate cycleGAN model on pretrained autoencoder" << std::endl; -
model_3->evaluate(execution_mode::testing); + trainer->evaluate(model_3.get(), execution_mode::testing); } catch (std::exception& e) { El::ReportException(e); diff --git a/model_zoo/lbann_cycgan.cpp b/model_zoo/lbann_cycgan.cpp index 9b2c061f957..c701ab11969 100644 --- a/model_zoo/lbann_cycgan.cpp +++ b/model_zoo/lbann_cycgan.cpp @@ -72,13 +72,22 @@ int main(int argc, char *argv[]) { std::ostringstream err; - // Initalize a global I/O thread pool - std::shared_ptr<thread_pool> io_thread_pool = construct_io_thread_pool(comm.get()); - auto pbs = protobuf_utils::load_prototext(master, argc, argv); + // Optionally over-ride some values in the prototext for each model + for(size_t i = 0; i < pbs.size(); i++) { + get_cmdline_overrides(*comm, *(pbs[i])); + } + + lbann_data::LbannPB& pb = *(pbs[0]); + lbann_data::Trainer *pb_trainer = pb.mutable_trainer(); + + // Construct the trainer + std::unique_ptr<trainer> trainer = construct_trainer(comm.get(), pb_trainer, opts); + + thread_pool& io_thread_pool = trainer->get_io_thread_pool(); - auto model_1 = build_model_from_prototext(argc, argv, *(pbs[0]), - comm.get(), io_thread_pool, true); //D1 solver + auto model_1 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[0]), + comm.get(), opts, io_thread_pool, true); //D1 solver //hack, overide model name to make reporting easy, what can break?" std::unique_ptr<model> model_2, //G1 solver model_3, //G2 solver ae_model, ae_cycgan_model; //contain layer(s) from (cyc)GAN if (pbs.size() > 1) { - model_2 = build_model_from_prototext(argc, argv, *(pbs[1]), - comm.get(), io_thread_pool, false); + model_2 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[1]), + comm.get(), opts, io_thread_pool, false); } if (pbs.size() > 2) { - model_3 = build_model_from_prototext(argc, argv, *(pbs[2]), - comm.get(), io_thread_pool, false); + model_3 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[2]), + comm.get(), opts, io_thread_pool, false); } if (pbs.size() > 3) { - ae_model = build_model_from_prototext(argc, argv, *(pbs[3]), - comm.get(), io_thread_pool, false); + ae_model = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[3]), + comm.get(), opts, io_thread_pool, false); } if (pbs.size() > 4) { - ae_cycgan_model = build_model_from_prototext(argc, argv, *(pbs[4]), - comm.get(), io_thread_pool, false); + ae_cycgan_model = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[4]), + comm.get(), opts, io_thread_pool, false); } const lbann_data::Model pb_model = pbs[0]->model(); @@ -116,7 +125,7 @@ if(ae_model != nullptr) { if(master) std::cout << " Pre-train autoencoder " << std::endl; const lbann_data::Model pb_model_4 = pbs[3]->model(); - ae_model->train(pb_model_4.num_epochs()); + trainer->train(ae_model.get(), pb_model_4.num_epochs()); auto ae_weights = ae_model->get_weights(); model_1->copy_trained_weights_from(ae_weights); model_2->copy_trained_weights_from(ae_weights); @@ -129,20 +138,20 @@ int max_super_step = pb_model.super_steps(); while (super_step <= max_super_step) { if (master) std::cerr << "\nSTARTING train - discriminator (D1 & D2) models at step " << super_step <<"\n\n"; - model_1->train( super_step*pb_model.num_epochs(),pb_model.num_batches()); + trainer->train(model_1.get(), super_step*pb_model.num_epochs(),pb_model.num_batches()); if(master) std::cout << " Copy all trained weights from discriminator to G1 and train/freeze as appropriate " << std::endl; auto model1_weights =
model_1->get_weights(); model_2->copy_trained_weights_from(model1_weights); if (master) std::cerr << "\n STARTING train - G1 solver model at step " << super_step << " \n\n"; - model_2->train( super_step*pb_model_2.num_epochs(),pb_model_2.num_batches()); + trainer->train(model_2.get(), super_step*pb_model_2.num_epochs(),pb_model_2.num_batches()); // Evaluate model on test set // model_2->evaluate(execution_mode::testing,pb_model_2.num_batches()); if(master) std::cout << " Copy all trained weights from discriminator to G2 and train/freeze as appropriate " << std::endl; model_3->copy_trained_weights_from(model1_weights); if (master) std::cerr << "\n STARTING train - G2 solver model at step " << super_step << " \n\n"; - model_3->train( super_step*pb_model_3.num_epochs(),pb_model_3.num_batches()); + trainer->train(model_3.get(), super_step*pb_model_3.num_epochs(),pb_model_3.num_batches()); // Evaluate model on test set // model_3->evaluate(execution_mode::testing,pb_model_3.num_batches()); @@ -171,7 +180,7 @@ model_3->save_model(); ae_cycgan_model->save_model(); if(master) std::cout << " Evaluate pretrained autoencoder" << std::endl; - ae_cycgan_model->evaluate(execution_mode::testing); + trainer->evaluate(ae_cycgan_model.get(), execution_mode::testing); //has no affect unless option: --st_on was given stack_profiler::get()->print(); diff --git a/model_zoo/lbann_gan.cpp b/model_zoo/lbann_gan.cpp index 8ee5dcbbfa3..f8dec401657 100644 --- a/model_zoo/lbann_gan.cpp +++ b/model_zoo/lbann_gan.cpp @@ -53,16 +53,25 @@ int main(int argc, char *argv[]) { std::ostringstream err; - // Initalize a global I/O thread pool - std::shared_ptr<thread_pool> io_thread_pool = construct_io_thread_pool(comm.get()); - auto pbs = protobuf_utils::load_prototext(master, argc, argv); + // Optionally over-ride some values in the prototext for each model + for(size_t i = 0; i < pbs.size(); i++) { + get_cmdline_overrides(*comm, *(pbs[i])); + } + + lbann_data::LbannPB& pb = *(pbs[0]); + lbann_data::Trainer *pb_trainer = pb.mutable_trainer(); + + // Construct the trainer + std::unique_ptr<trainer> trainer = construct_trainer(comm.get(), pb_trainer, opts); + + thread_pool& io_thread_pool = trainer->get_io_thread_pool(); - auto model_1 = build_model_from_prototext(argc, argv, *(pbs[0]), comm.get(), io_thread_pool, true); //discriminator + auto model_1 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[0]), comm.get(), opts, io_thread_pool, true); //discriminator //model std::unique_ptr<model> model_2 = nullptr; //adversarial model if (pbs.size() > 1) { - model_2 = build_model_from_prototext(argc, argv, *(pbs[1]), comm.get(), io_thread_pool, false); + model_2 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[1]), comm.get(), opts, io_thread_pool, false); } const lbann_data::Model pb_model = pbs[0]->model(); @@ -75,7 +84,7 @@ while (super_step <= max_super_step) { if (master) std::cerr << "\nSTARTING train - discriminator model at step " << super_step <<"\n\n"; //@todo freeze generator layers in this step - model_1->train( super_step*pb_model.num_epochs() ); + trainer->train(model_1.get(), super_step*pb_model.num_epochs() ); //Replace/copy "proxy" layer in adversarial model (model2) with its "equivalent" layer in discriminator model (model1) //@todo freeze layers after replacement @@ -96,7 +105,7 @@ } if (master) std::cerr << "\n STARTING train - adversarial model at step " << super_step << " \n\n"; - model_2->train(
super_step*pb_model_2.num_epochs() ); + trainer->train(model_2.get(), super_step*pb_model_2.num_epochs() ); super_step++; } diff --git a/model_zoo/lbann_inf.cpp b/model_zoo/lbann_inf.cpp index 50639024e1b..14cd1eace90 100644 --- a/model_zoo/lbann_inf.cpp +++ b/model_zoo/lbann_inf.cpp @@ -55,16 +55,24 @@ int main(int argc, char *argv[]) { std::ostringstream err; - // Initalize a global I/O thread pool - std::shared_ptr<thread_pool> io_thread_pool - = construct_io_thread_pool(comm.get()); - auto pbs = protobuf_utils::load_prototext(master, argc, argv); + // Optionally over-ride some values in the prototext for each model + for(size_t i = 0; i < pbs.size(); i++) { + get_cmdline_overrides(*comm, *(pbs[i])); + } + + lbann_data::LbannPB& pb = *(pbs[0]); + lbann_data::Trainer *pb_trainer = pb.mutable_trainer(); + + // Construct the trainer + std::unique_ptr<trainer> trainer = construct_trainer(comm.get(), pb_trainer, opts); + + thread_pool& io_thread_pool = trainer->get_io_thread_pool(); std::vector<std::unique_ptr<model>> models; for(auto&& pb_model : pbs) { models.emplace_back( - build_model_from_prototext(argc, argv, *pb_model, - comm.get(), io_thread_pool, models.size() == 0)); + build_model_from_prototext(argc, argv, pb_trainer, *pb_model, - comm.get(), opts, io_thread_pool, models.size() == 0)); } // Load layer weights from checkpoint if checkpoint directory given @@ -85,7 +93,7 @@ El::Int num_samples = models[0]->get_num_iterations_per_epoch(execution_mode::testing); for(El::Int s = 0; s < num_samples; s++) { for(auto&& m : models) { - m->evaluate(execution_mode::testing, 1); + trainer->evaluate(m.get(), execution_mode::testing, 1); } } diff --git a/model_zoo/models/alexnet/model_alexnet.prototext b/model_zoo/models/alexnet/model_alexnet.prototext index 58e8edfb22d..a620616b698 100644 --- a/model_zoo/models/alexnet/model_alexnet.prototext +++ b/model_zoo/models/alexnet/model_alexnet.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 256 - block_size: 256 num_epochs: 72 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext index 6e40b9f3328..31be0dc7969 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext @@ -1,13 +1,15 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 1 +} model { ### Model description and network architecture taken from: ### https://lc.llnl.gov/bitbucket/projects/BIOM/repos/molresp/browse/tf_model.py?at=TensorFlow_chemClass ### This network description is anologous to AutoEncoder_Chem_ECFP data_layout: "model_parallel" mini_batch_size: 128 - block_size: 256 num_epochs: 4 - num_parallel_readers: 1 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_200x150x100x100x100.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_200x150x100x100x100.prototext index c25231de172..e322dccfa01 100644 ---
b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_200x150x100x100x100.prototext @@ -1,13 +1,15 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 1 +} model { ### Model description and network architecture taken from: ### https://lc.llnl.gov/bitbucket/projects/BIOM/repos/molresp/browse/tf_model.py?at=TensorFlow_chemClass ### This network description is anologous to AutoEncoder_Chem_ECFP data_layout: "model_parallel" mini_batch_size: 1024 - block_size: 256 num_epochs: 4 - num_parallel_readers: 1 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext index 576d5a3c402..715ac13b195 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext @@ -1,13 +1,15 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 1 +} model { ### Model description and network architecture taken from: ### https://lc.llnl.gov/bitbucket/projects/BIOM/repos/molresp/browse/tf_model.py?at=TensorFlow_chemClass ### This network description is anologous to AutoEncoder_Chem_ECFP data_layout: "model_parallel" mini_batch_size: 1024 - block_size: 256 num_epochs:20 - num_parallel_readers: 1 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext index 67fbf7dda3e..62bb76d84d1 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext @@ -1,13 +1,15 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 1 +} model { ### Model description and network architecture taken from: ### https://lc.llnl.gov/bitbucket/projects/BIOM/repos/molresp/browse/tf_model.py?at=TensorFlow_chemClass ### This network description is anologous to AutoEncoder_Chem_Sigmoid data_layout: "model_parallel" mini_batch_size: 128 - block_size: 256 num_epochs: 4 - num_parallel_readers: 1 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext index 47e375a2e25..9c8dc907072 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext @@ -1,13 +1,15 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 1 +} model { ### Model description and network architecture taken from: ### https://lc.llnl.gov/bitbucket/projects/BIOM/repos/molresp/browse/tf_model.py?at=TensorFlow_chemClass ### This network description is anologous to AutoEncoder_Chem_ECFP data_layout: "model_parallel" mini_batch_size: 128 - block_size: 256 num_epochs: 4 - num_parallel_readers: 1 - procs_per_trainer: 0 ################################################### # Objective function diff --git 
a/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext b/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext index c8f1e32b091..29dbbbcc6ba 100644 --- a/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext +++ b/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "model_parallel" mini_batch_size: 32 - block_size: 256 num_epochs: 100 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext b/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext index 1107d1f2bfd..b829a4af5f3 100644 --- a/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext +++ b/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + #procs_per_trainer: 12 + num_parallel_readers: 1 +} model { data_layout: "data_parallel" mini_batch_size: 128 - block_size: 256 num_epochs: 10 - num_parallel_readers: 1 - #procs_per_trainer: 12 procs_per_trainer: 0 disable_cuda: true diff --git a/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext b/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext index 97d0ee18f3f..95acabd67f7 100644 --- a/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext +++ b/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 1 +} model { data_layout: "data_parallel" mini_batch_size: 128 - block_size: 256 num_epochs: 4 - num_parallel_readers: 1 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext b/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext index 81d11fbce37..7a7a9ba3fe0 100644 --- a/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext +++ b/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "model_parallel" mini_batch_size: 10 - block_size: 256 num_epochs: 10 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext b/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext index 0bd522e79a7..a7f0b9ba513 100644 --- a/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext +++ b/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 1 +} model { data_layout: "data_parallel" mini_batch_size: 128 - block_size: 256 num_epochs: 4 - num_parallel_readers: 1 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/autoencoder_mnist/vae_mnist.prototext b/model_zoo/models/autoencoder_mnist/vae_mnist.prototext index bcba455a50b..9f22ed365a8 100644 --- 
a/model_zoo/models/autoencoder_mnist/vae_mnist.prototext +++ b/model_zoo/models/autoencoder_mnist/vae_mnist.prototext @@ -1,9 +1,11 @@ # LBANN implementation of MNIST VAE in Doersch's autoencoder tutorial # See https://github.com/cdoersch/vae_tutorial/blob/master/mnist_vae.prototxt +trainer { + block_size: 256 +} model { data_layout: "data_parallel" mini_batch_size: 100 - block_size: 256 num_epochs: 50 ############################################## diff --git a/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext b/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext index dfd926c0548..b1c68492b9e 100644 --- a/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext +++ b/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "model_parallel" mini_batch_size: 50 - block_size: 256 num_epochs: 20 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/candle/pilot1/combo.prototext b/model_zoo/models/candle/pilot1/combo.prototext index 1d6aeb6a9d1..0ceac3cfec4 100644 --- a/model_zoo/models/candle/pilot1/combo.prototext +++ b/model_zoo/models/candle/pilot1/combo.prototext @@ -1,12 +1,14 @@ #Example taken from:https://github.com/ECP-CANDLE/Benchmarks/tree/frameworks/Pilot1/Combo #Timestamp 03/07/2018 8:30PM +trainer{ + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "model_parallel" mini_batch_size: 256 - block_size: 256 num_epochs: 10 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/cosmoflow/model_cosmoflow.prototext b/model_zoo/models/cosmoflow/model_cosmoflow.prototext index f4b6829a637..782682f97bf 100644 --- a/model_zoo/models/cosmoflow/model_cosmoflow.prototext +++ b/model_zoo/models/cosmoflow/model_cosmoflow.prototext @@ -1,11 +1,13 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { type: "directed_acyclic_graph_model" data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 18 - num_parallel_readers: 0 - procs_per_trainer: 0 objective_function { layer_term { layer: "mean_absolute_error" } diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext index bcc80c98e3f..f2a919a628c 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext @@ -1,3 +1,6 @@ +trainer { + block_size: 256 +} model { objective_function { l2_weight_regularization { @@ -569,7 +572,6 @@ model { # execution_modes: "test" # } #} - block_size: 256 super_steps: 10000 num_batches: 1 } diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext index 98a6745c3da..b77befee088 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext @@ -1,12 +1,14 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 super_steps: 10000 num_batches: 1 num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git 
a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext index 45c673ea736..053dae3b448 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext @@ -1,3 +1,6 @@ +trainer { + block_size: 256 +} model { objective_function { l2_weight_regularization { @@ -544,7 +547,6 @@ model { execution_modes: "test" } } - block_size: 256 super_steps: 10000 num_batches: 1 } diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext index b834fb30db2..758d7d353e1 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext @@ -1,12 +1,14 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 super_steps: 10000 num_batches: 1 num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext index 7277121a832..81560f62300 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext @@ -1,3 +1,6 @@ +trainer { + block_size: 256 +} model { objective_function { l2_weight_regularization { @@ -605,7 +608,6 @@ model { execution_modes: "test" } } - block_size: 256 super_steps: 10000 num_batches: 1 } diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext index 71fc11174c2..d6d8ed35499 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext @@ -1,12 +1,14 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 super_steps: 10000 num_batches: 1 num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/gan/mnist/adversarial_model.prototext b/model_zoo/models/gan/mnist/adversarial_model.prototext index 8d35b438717..348e618a66e 100644 --- a/model_zoo/models/gan/mnist/adversarial_model.prototext +++ b/model_zoo/models/gan/mnist/adversarial_model.prototext @@ -1,12 +1,14 @@ #Adversarial Model +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "model_parallel" mini_batch_size: 32 - block_size: 256 super_steps: 100000 num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/gan/mnist/discriminator_model.prototext b/model_zoo/models/gan/mnist/discriminator_model.prototext index 75cc0425b9e..fde26792721 100644 --- a/model_zoo/models/gan/mnist/discriminator_model.prototext +++ b/model_zoo/models/gan/mnist/discriminator_model.prototext @@ -1,12 +1,14 @@ #Discriminator Model +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "model_parallel" mini_batch_size: 32 - block_size: 256 super_steps: 100000 num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 
################################################### # Objective function diff --git a/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext b/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext index a0d04525790..59f04954428 100644 --- a/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext +++ b/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext @@ -1,5 +1,8 @@ -model { +trainer { + block_size: 256 procs_per_trainer:0 +} +model { objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -742,5 +745,4 @@ model { batch_interval: 1 } } - block_size: 256 } diff --git a/model_zoo/models/jag/gan/cyclic/model_template.prototext b/model_zoo/models/jag/gan/cyclic/model_template.prototext index 3c130a16aa8..d41b06eecab 100644 --- a/model_zoo/models/jag/gan/cyclic/model_template.prototext +++ b/model_zoo/models/jag/gan/cyclic/model_template.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 10 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/jag/gan/vanilla/gan.prototext b/model_zoo/models/jag/gan/vanilla/gan.prototext index 976151dd318..8a4a408fabf 100644 --- a/model_zoo/models/jag/gan/vanilla/gan.prototext +++ b/model_zoo/models/jag/gan/vanilla/gan.prototext @@ -1,3 +1,7 @@ +trainer { + block_size: 256 + procs_per_trainer:0 +} model { random_init_models_differently: true objective_function { @@ -482,6 +486,4 @@ model { # } # } - block_size: 256 - procs_per_trainer:0 } diff --git a/model_zoo/models/jag/gan/vanilla/gan_template.prototext b/model_zoo/models/jag/gan/vanilla/gan_template.prototext index 89c1f949691..f3cfbe4cb55 100644 --- a/model_zoo/models/jag/gan/vanilla/gan_template.prototext +++ b/model_zoo/models/jag/gan/vanilla/gan_template.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/jag/vae_fcn.prototext b/model_zoo/models/jag/vae_fcn.prototext index 8f2528984df..ed560d473f9 100644 --- a/model_zoo/models/jag/vae_fcn.prototext +++ b/model_zoo/models/jag/vae_fcn.prototext @@ -1,14 +1,16 @@ #Example taken from: https://lc.llnl.gov/bitbucket/users/jjayaram/repos/deep-latent-spaces/browse/codes/dev/VAE-FCN/vae_fcn.py and #https://lc.llnl.gov/bitbucket/users/jjayaram/repos/deep-latent-spaces/browse/codes/dev/VAE-FCN/run_vae.py #Timestamp 02/26/2018 8:45AM +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "model_parallel" #mini_batch_size: 128 mini_batch_size: 100 #more last minibatch images to save - block_size: 256 num_epochs: 40 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/jag/wae.prototext b/model_zoo/models/jag/wae.prototext index 8d204e7a62a..9a87ecfffba 100644 --- a/model_zoo/models/jag/wae.prototext +++ b/model_zoo/models/jag/wae.prototext @@ -1,3 +1,7 @@ +trainer { + block_size: 256 + procs_per_trainer:0 +} model { random_init_models_differently: true serialize_io: true @@ -570,6 +574,4 @@ model { # } # } - block_size: 256 - procs_per_trainer:0 } diff 
--git a/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext b/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext index 92db21e4ca1..a5ce5742c6b 100644 --- a/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext @@ -1,8 +1,11 @@ +trainer { + block_size: 256 + procs_per_trainer:0 +} model { name: "cycgan_model" shareable_training_data_reader:false serialize_io: true - procs_per_trainer:0 objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -1026,7 +1029,6 @@ model { # } # } - block_size: 256 ####For metric, loss per individual sample layer { name: "fw_latent_loss" diff --git a/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext b/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext index adaeeb9456c..858cf6909ce 100644 --- a/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext @@ -1,8 +1,11 @@ +trainer { + block_size: 256 + procs_per_trainer:0 +} model { name: "cycgan_model" shareable_training_data_reader:false serialize_io: true - procs_per_trainer:0 objective_function { l2_weight_regularization { scale_factor: 0.0001 @@ -895,7 +898,6 @@ model { # } # } - block_size: 256 ####For metric, loss per individual sample layer { name: "fw_latent_loss" diff --git a/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext b/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext index 82ac04f28e5..4ae82349f50 100644 --- a/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext @@ -30,36 +30,6 @@ data_reader { num_labels: 5 - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 64 - raw_height: 64 - raw_num_channels: 4 - - normalizer { - disable: true - scale: false - subtract_mean: false - unit_variance: false - z_score: true - } - - subtractor { - disable: true - } - - cropper { - disable: true - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - } } reader { @@ -75,42 +45,12 @@ data_reader { index_list_per_model: false validation_percent: 0 - absolute_sample_count: 0 + absolute_sample_count: 0 percent_of_data_to_use: 0.0005 disable_responses: true disable_labels: true num_labels: 5 - image_preprocessor { - # assume fixed size of input images if cropper is not used - raw_width: 64 - raw_height: 64 - raw_num_channels: 4 - - normalizer { - disable: true - scale: false - subtract_mean: false - unit_variance: false - z_score: true - } - - subtractor { - disable: true - } - - cropper { - disable: true - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - } } } diff --git a/model_zoo/models/jag/wae_cycle_gan/wae.prototext b/model_zoo/models/jag/wae_cycle_gan/wae.prototext index aa937b9ab7c..5234bbb6625 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae.prototext @@ -1,3 +1,7 @@ +trainer { + block_size: 256 + procs_per_trainer:0 +} model { random_init_models_differently: true serialize_io: true @@ -675,7 +679,7 @@ model { name: "decode3bias" } - + #Discriminator (shared) weights { name: "wae_d1fc1linearity" @@ -755,6 +759,4 @@ model { # } callback { save_model { dir: "model" } } - block_size: 256 - procs_per_trainer:0 } diff --git a/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext b/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext index 
5864a694d85..aa0e545a486 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext @@ -1,14 +1,16 @@ #Augumented version of ae_cyc.prototext so we can we ae_loss, fw_latent_loss and fw_out_loss all in the same file instead of 3 files, a request from MLSI ML team. This augmentation involves replicating blocks for fw_model from cycle gan and encode from autoencoder. +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { name: "wae_fw_inv_model" - shareable_training_data_reader:false + shareable_training_data_reader:false serialize_io: true data_layout: "data_parallel" - mini_batch_size: 16384 - block_size: 256 + mini_batch_size: 16384 num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function @@ -208,7 +210,7 @@ model { weights: "gen1fc4linearity gen1fc4bias" parents: "gen1leaky_relu3_1" } - + weights { name: "gen1fc1linearity" initializer { @@ -357,7 +359,7 @@ model { data_layout: "data_parallel" identity {} } - ####output of encoder goes to decoder and cycGAN duplicates + ####output of encoder goes to decoder and cycGAN duplicates ###################### # Decoder for foward output loss @@ -540,7 +542,7 @@ model { data_layout: "data_parallel" weights: "decode3linearity decode3bias" fully_connected { - num_neurons: 32 + num_neurons: 32 has_bias: true } } @@ -727,7 +729,7 @@ model { weights: "gen1fc4linearity gen1fc4bias" parents: "latent_gen1leaky_relu3_1" } - + #layer { # name: "gsample_minus_latentsample" # data_layout: "data_parallel" @@ -745,7 +747,7 @@ model { #parents: "gsample_minus_latentsample" parents: "latent_gen1fc4 image_data_dummy" } - + #####Inverse loss from cycle GAN #### latent space (image_data_dummy) -> pred X'(gen2fc4) layer { @@ -811,7 +813,7 @@ model { weights: "gen2fc4linearity gen2fc4bias" parents: "gen2leaky_relu3" } - + #layer { # name: "gsample2_minus_x" diff --git a/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext b/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext index b0a68861ed1..3bfe7e17686 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext @@ -1,3 +1,6 @@ +trainer { + block_size: 256 +} model { random_init_models_differently: true name: "wae_model" @@ -655,7 +658,7 @@ model { name: "decode3bias" } - + #Discriminator (shared) weights { name: "wae_d1fc1linearity" @@ -734,6 +737,5 @@ model { # } #callback { save_model { dir: "model" } } - block_size: 256 - procs_per_model:0 + procs_per_model:0 } diff --git a/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext b/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext index d41b5c90339..9d2c4bcf368 100644 --- a/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext +++ b/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext @@ -1,10 +1,12 @@ +trainer{ + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 20 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext index 3791b5c4e0c..a6e7b0f6442 100644 --- 
a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext @@ -1,9 +1,11 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 1 +} model { mini_batch_size: 1024 - block_size: 256 num_epochs: 4 - num_parallel_readers: 1 - procs_per_trainer: 0 ############################################## # Objective function diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext index d3827897409..8603f2bd397 100644 --- a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext @@ -1,9 +1,11 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 1 +} model { mini_batch_size: 32 - block_size: 256 num_epochs: 4 - num_parallel_readers: 1 - procs_per_trainer: 0 ############################################## # Objective function diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext index e519fe44b87..99a392e63d8 100644 --- a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext @@ -1,9 +1,11 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 1 +} model { mini_batch_size: 512 - block_size: 256 num_epochs: 4 - num_parallel_readers: 1 - procs_per_trainer: 0 ############################################## # Objective function diff --git a/model_zoo/models/resnet50/model_resnet50.prototext b/model_zoo/models/resnet50/model_resnet50.prototext index 6ad463158cf..e617beef9b5 100644 --- a/model_zoo/models/resnet50/model_resnet50.prototext +++ b/model_zoo/models/resnet50/model_resnet50.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 256 - block_size: 256 num_epochs: 10 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/siamese/finetune-cub/model_cub.prototext b/model_zoo/models/siamese/finetune-cub/model_cub.prototext index 0f139ba4ec2..df6b9335943 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 50 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext index c7ab48dd13e..034d1cd41c6 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext @@ -1,10 +1,12 @@ +trainer { + 
block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 50 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext index ec2f478b8b1..3164f38097f 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 50 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext index e6a87ad92f5..3c9d16d8d25 100644 --- a/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext +++ b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 128 - block_size: 256 num_epochs: 1 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext b/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext index 638368f5fb7..1727cb235d8 100644 --- a/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext +++ b/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 3 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext b/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext index ea600bd2646..4a7b090d43a 100644 --- a/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext +++ b/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 3 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/tests/layer_tests/model_channelwise_mean.prototext b/model_zoo/tests/layer_tests/model_channelwise_mean.prototext index 5d73378bfbb..01e489066ce 100644 --- a/model_zoo/tests/layer_tests/model_channelwise_mean.prototext +++ b/model_zoo/tests/layer_tests/model_channelwise_mean.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 
################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_clamp.prototext b/model_zoo/tests/layer_tests/model_clamp.prototext index 96b68c24a19..f0120282b07 100644 --- a/model_zoo/tests/layer_tests/model_clamp.prototext +++ b/model_zoo/tests/layer_tests/model_clamp.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_covariance.prototext b/model_zoo/tests/layer_tests/model_covariance.prototext index c081c8f9261..ad224af6739 100644 --- a/model_zoo/tests/layer_tests/model_covariance.prototext +++ b/model_zoo/tests/layer_tests/model_covariance.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_elu.prototext b/model_zoo/tests/layer_tests/model_elu.prototext index ca38a049fea..e045dea1d11 100644 --- a/model_zoo/tests/layer_tests/model_elu.prototext +++ b/model_zoo/tests/layer_tests/model_elu.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_identity.prototext b/model_zoo/tests/layer_tests/model_identity.prototext index 1ee188c4e92..606803d8721 100644 --- a/model_zoo/tests/layer_tests/model_identity.prototext +++ b/model_zoo/tests/layer_tests/model_identity.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_l1_norm.prototext b/model_zoo/tests/layer_tests/model_l1_norm.prototext index 510e3510b99..c2175bcb852 100644 --- a/model_zoo/tests/layer_tests/model_l1_norm.prototext +++ b/model_zoo/tests/layer_tests/model_l1_norm.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_l2_norm2.prototext b/model_zoo/tests/layer_tests/model_l2_norm2.prototext index 623f374f02f..694d374536e 100644 --- a/model_zoo/tests/layer_tests/model_l2_norm2.prototext +++ b/model_zoo/tests/layer_tests/model_l2_norm2.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - 
procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_leaky_relu.prototext b/model_zoo/tests/layer_tests/model_leaky_relu.prototext index e1641781ca9..126c2962870 100644 --- a/model_zoo/tests/layer_tests/model_leaky_relu.prototext +++ b/model_zoo/tests/layer_tests/model_leaky_relu.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_log_sigmoid.prototext b/model_zoo/tests/layer_tests/model_log_sigmoid.prototext index b53d30b0029..8b559f9766d 100644 --- a/model_zoo/tests/layer_tests/model_log_sigmoid.prototext +++ b/model_zoo/tests/layer_tests/model_log_sigmoid.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_log_softmax.prototext b/model_zoo/tests/layer_tests/model_log_softmax.prototext index d9ae9fbc863..1de02b0342d 100644 --- a/model_zoo/tests/layer_tests/model_log_softmax.prototext +++ b/model_zoo/tests/layer_tests/model_log_softmax.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext b/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext index 78f637afb34..8728d2bcb80 100644 --- a/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext +++ b/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_relu.prototext b/model_zoo/tests/layer_tests/model_relu.prototext index 8e048caa515..4acdbe6aaf1 100644 --- a/model_zoo/tests/layer_tests/model_relu.prototext +++ b/model_zoo/tests/layer_tests/model_relu.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_selu.prototext b/model_zoo/tests/layer_tests/model_selu.prototext index 488aa6cb0d5..007859a7bac 100644 --- a/model_zoo/tests/layer_tests/model_selu.prototext +++ b/model_zoo/tests/layer_tests/model_selu.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: 
"data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_sigmoid.prototext b/model_zoo/tests/layer_tests/model_sigmoid.prototext index 5eda4e5e5e1..13323653add 100644 --- a/model_zoo/tests/layer_tests/model_sigmoid.prototext +++ b/model_zoo/tests/layer_tests/model_sigmoid.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_softmax.prototext b/model_zoo/tests/layer_tests/model_softmax.prototext index 4171f1c93bd..71ed61145fe 100644 --- a/model_zoo/tests/layer_tests/model_softmax.prototext +++ b/model_zoo/tests/layer_tests/model_softmax.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_softplus.prototext b/model_zoo/tests/layer_tests/model_softplus.prototext index 09622663b13..c2543bcc1a4 100644 --- a/model_zoo/tests/layer_tests/model_softplus.prototext +++ b/model_zoo/tests/layer_tests/model_softplus.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_softsign.prototext b/model_zoo/tests/layer_tests/model_softsign.prototext index cdc3adc9ade..3c83855f991 100644 --- a/model_zoo/tests/layer_tests/model_softsign.prototext +++ b/model_zoo/tests/layer_tests/model_softsign.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/layer_tests/model_squared_difference.prototext b/model_zoo/tests/layer_tests/model_squared_difference.prototext index ea81d630ee2..2142f7e5144 100644 --- a/model_zoo/tests/layer_tests/model_squared_difference.prototext +++ b/model_zoo/tests/layer_tests/model_squared_difference.prototext @@ -1,6 +1,8 @@ +trainer { + block_size: 256 +} model { mini_batch_size: 11 - block_size: 256 num_epochs: 0 ################################################### diff --git a/model_zoo/tests/layer_tests/model_tessellate.prototext b/model_zoo/tests/layer_tests/model_tessellate.prototext index 0ca5493bb6b..5da0b5fa989 100644 --- a/model_zoo/tests/layer_tests/model_tessellate.prototext +++ b/model_zoo/tests/layer_tests/model_tessellate.prototext @@ -1,6 +1,8 @@ +trainer { + block_size: 256 +} model { mini_batch_size: 11 - block_size: 256 num_epochs: 0 ################################################### diff --git 
a/model_zoo/tests/layer_tests/model_variance.prototext b/model_zoo/tests/layer_tests/model_variance.prototext index d1d6c8b8329..d01a9e9ce68 100644 --- a/model_zoo/tests/layer_tests/model_variance.prototext +++ b/model_zoo/tests/layer_tests/model_variance.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 11 - block_size: 256 num_epochs: 0 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function and metrics diff --git a/model_zoo/tests/model_jag_single_layer_ae.prototext b/model_zoo/tests/model_jag_single_layer_ae.prototext index 572017c8366..6ed218e7216 100644 --- a/model_zoo/tests/model_jag_single_layer_ae.prototext +++ b/model_zoo/tests/model_jag_single_layer_ae.prototext @@ -2,16 +2,18 @@ # Run time for this example is about 2s per epoch on 16 nodes (32 tasks) # Example on how to run: # srun --nodes=16 --ntasks=32 build/gnu.Release.catalyst.llnl.gov/lbann/build/model_zoo/lbann --model=model_zoo/tests/model_jag_single_layer_ae.prototext --optimizer=model_zoo/optimizers/opt_adam.prototext --reader=model_zoo/data_readers/data_reader_jag.prototext --metadata=model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext +trainer { + block_size: 256 + procs_per_trainer:0 + num_parallel_readers: 0 +} model { name: "ae_model" - shareable_training_data_reader:false + shareable_training_data_reader:false serialize_io: true data_layout: "data_parallel" - mini_batch_size: 128 - block_size: 256 + mini_batch_size: 128 num_epochs: 4 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function @@ -55,6 +57,12 @@ model { } } + callback { checkpoint { + checkpoint_dir: "ckpt" + checkpoint_epochs: 1 +# checkpoint_steps: 1 + } } + ################################################### # start of layers ################################################### @@ -110,7 +118,7 @@ model { data_layout: "data_parallel" elu {} } - #Y'(reconstructed images and scalar) + #Y'(reconstructed images and scalar) layer { parents: "encodeelu" name: "decode" diff --git a/model_zoo/tests/model_lenet_mnist_ckpt.prototext b/model_zoo/tests/model_lenet_mnist_ckpt.prototext index 060b2c223d3..a0aed7f7c83 100644 --- a/model_zoo/tests/model_lenet_mnist_ckpt.prototext +++ b/model_zoo/tests/model_lenet_mnist_ckpt.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 20 - num_parallel_readers: 0 - procs_per_trainer: 0 disable_cuda: true ################################################### # Objective function diff --git a/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext b/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext index 922e0f483a6..0c325acd036 100644 --- a/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext +++ b/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 20 - num_parallel_readers: 0 - procs_per_trainer: 0 disable_cuda: true ################################################### # Objective function diff --git a/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext b/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext index 7d17ecd18c4..b6c8b635ebb 
100644 --- a/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext +++ b/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 256 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 64 - block_size: 256 num_epochs: 20 - num_parallel_readers: 0 - procs_per_trainer: 0 disable_cuda: true ################################################### # Objective function diff --git a/model_zoo/tests/model_mnist_conv_graph.prototext b/model_zoo/tests/model_mnist_conv_graph.prototext index 28fa7809551..61ecd70e0dd 100644 --- a/model_zoo/tests/model_mnist_conv_graph.prototext +++ b/model_zoo/tests/model_mnist_conv_graph.prototext @@ -1,7 +1,9 @@ +trainer { + block_size: 257 +} model { data_layout: "data_parallel" mini_batch_size: 31 - block_size: 257 num_epochs: 4 ################################################### diff --git a/model_zoo/tests/model_mnist_ridge_regression.prototext b/model_zoo/tests/model_mnist_ridge_regression.prototext index 1b358e90faf..0edf0a32028 100644 --- a/model_zoo/tests/model_mnist_ridge_regression.prototext +++ b/model_zoo/tests/model_mnist_ridge_regression.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 257 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 131 - block_size: 257 num_epochs: 4 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/tests/model_mnist_softmax_classifier.prototext b/model_zoo/tests/model_mnist_softmax_classifier.prototext index ae75c319f7c..f776f5e89fb 100644 --- a/model_zoo/tests/model_mnist_softmax_classifier.prototext +++ b/model_zoo/tests/model_mnist_softmax_classifier.prototext @@ -1,10 +1,12 @@ +trainer { + block_size: 199 + procs_per_trainer: 0 + num_parallel_readers: 0 +} model { data_layout: "data_parallel" mini_batch_size: 103 - block_size: 199 num_epochs: 4 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/vision/alexnet.py b/model_zoo/vision/alexnet.py index a9319ba9c29..eca35721714 100755 --- a/model_zoo/vision/alexnet.py +++ b/model_zoo/vision/alexnet.py @@ -84,9 +84,13 @@ txtf.Merge(f.read(), data_reader_proto) data_reader_proto = data_reader_proto.data_reader +# Setup trainer +trainer = lbann.Trainer() + # Save prototext if args.prototext: lbann.proto.save_prototext(args.prototext, + trainer=trainer, model=model, optimizer=opt, data_reader=data_reader_proto) @@ -103,6 +107,6 @@ imagenet_labels(data_set='train', num_classes=classes), imagenet_dir(data_set='val', num_classes=classes), imagenet_labels(data_set='val', num_classes=classes))) - lbann.contrib.lc.launcher.run(model, data_reader_proto, opt, + lbann.contrib.lc.launcher.run(trainer, model, data_reader_proto, opt, job_name = 'lbann_alexnet', **kwargs) diff --git a/model_zoo/vision/lenet.py b/model_zoo/vision/lenet.py index 97cdb15ea14..ecab1daf54b 100755 --- a/model_zoo/vision/lenet.py +++ b/model_zoo/vision/lenet.py @@ -97,6 +97,9 @@ txtf.Merge(f.read(), data_reader_proto) data_reader_proto = data_reader_proto.data_reader +# Setup trainer +trainer = lbann.Trainer() + # ---------------------------------- # Run experiment # ---------------------------------- @@ -106,6 +109,6 @@ kwargs = {} if args.partition: kwargs['partition'] = args.partition if args.account: kwargs['account'] = args.account -lbann.run(model, data_reader_proto, opt, 
+lbann.run(trainer, model, data_reader_proto, opt, job_name='lbann_lenet', **kwargs) diff --git a/model_zoo/vision/resnet.py b/model_zoo/vision/resnet.py index 964ad3bb60c..3d4d0029a64 100755 --- a/model_zoo/vision/resnet.py +++ b/model_zoo/vision/resnet.py @@ -177,9 +177,13 @@ txtf.Merge(f.read(), data_reader_proto) data_reader_proto = data_reader_proto.data_reader +# Setup trainer +trainer = lbann.Trainer() + # Save prototext if args.prototext: lbann.proto.save_prototext(args.prototext, + trainer=trainer, model=model, optimizer=opt, data_reader=data_reader_proto) @@ -196,6 +200,6 @@ imagenet_labels(data_set='train', num_classes=classes), imagenet_dir(data_set='val', num_classes=classes), imagenet_labels(data_set='val', num_classes=classes))) - lbann.contrib.lc.launcher.run(model, data_reader_proto, opt, + lbann.contrib.lc.launcher.run(trainer, model, data_reader_proto, opt, job_name='lbann_resnet', **kwargs) diff --git a/python/lbann/__init__.py b/python/lbann/__init__.py index 1eeb8651e34..36a19e07b15 100644 --- a/python/lbann/__init__.py +++ b/python/lbann/__init__.py @@ -19,7 +19,7 @@ _lbann_exe = _config['Paths']['lbann_exe'] except: pass -import lbann_pb2, callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, reader_pb2, weights_pb2 +import lbann_pb2, callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, reader_pb2, weights_pb2, trainer_pb2, training_algorithm_pb2 def lbann_exe(): """LBANN executable.""" return _lbann_exe if _lbann_exe else 'lbann' @@ -28,6 +28,7 @@ def lbann_exe(): from lbann.callback import * from lbann.layer import * from lbann.metric import * +from lbann.trainer import * from lbann.model import * from lbann.objective_function import * from lbann.optimizer import * diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index d7c30645d41..3dd9718a176 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -2,7 +2,7 @@ from lbann.contrib.lc.systems import * import lbann.launcher -def run(model, data_reader, optimizer, +def run(trainer, model, data_reader, optimizer, lbann_exe=lbann_exe(), lbann_args='', experiment_dir=None, @@ -66,7 +66,7 @@ def run(model, data_reader, optimizer, environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = 2 # Run LBANN - return lbann.launcher.run(model, data_reader, optimizer, + return lbann.launcher.run(trainer, model, data_reader, optimizer, lbann_exe=lbann_exe, lbann_args=lbann_args, experiment_dir=experiment_dir, diff --git a/python/lbann/launcher/__init__.py b/python/lbann/launcher/__init__.py index b42f7311c99..f8acbf4e720 100644 --- a/python/lbann/launcher/__init__.py +++ b/python/lbann/launcher/__init__.py @@ -10,7 +10,7 @@ # Run experiments # ============================================== -def run(model, data_reader, optimizer, +def run(trainer, model, data_reader, optimizer, lbann_exe=lbann.lbann_exe(), lbann_args='', experiment_dir=None, @@ -39,6 +39,7 @@ def run(model, data_reader, optimizer, can be set with the environment variable `LBANN_EXPERIMENT_DIR`. Args: + trainer (lbann.Trainer): LBANN Trainer (resource manager). model (lbann.model.Model or lbann_pb2.Model): Neural network model. data_reader (lbann_pb2.DataReader): Data reader. 
@@ -94,6 +95,7 @@ def run(model, data_reader, optimizer, # Create experiment prototext file prototext_file = os.path.join(experiment_dir, 'experiment.prototext') lbann.proto.save_prototext(prototext_file, + trainer=trainer, model=model, data_reader=data_reader, optimizer=optimizer) diff --git a/python/lbann/model.py b/python/lbann/model.py index 4a86dee82fc..a6e4572e87b 100644 --- a/python/lbann/model.py +++ b/python/lbann/model.py @@ -16,9 +16,6 @@ def __init__(self, mini_batch_size, epochs, # Scalar fields self.mini_batch_size = mini_batch_size self.epochs = epochs - self.block_size = 256 # TODO: Make configurable - self.num_parallel_readers = 0 # TODO: Make configurable - self.procs_per_trainer = 0 # TODO: Make configurable self.random_seed = random_seed self.summary_dir = summary_dir # Get connected layers @@ -48,9 +45,6 @@ def export_proto(self): model = model_pb2.Model() model.mini_batch_size = self.mini_batch_size model.num_epochs = self.epochs - model.block_size = self.block_size - model.num_parallel_readers = self.num_parallel_readers - model.procs_per_trainer = self.procs_per_trainer if self.random_seed is not None: model.random_seed = self.random_seed if self.summary_dir is not None: @@ -64,7 +58,3 @@ def export_proto(self): model.callback.extend([c.export_proto() for c in self.callbacks]) return model - - def save_proto(self, filename): - """Export model to prototext file.""" - save_prototext(filename, model=self.export_proto()) diff --git a/python/lbann/trainer.py b/python/lbann/trainer.py new file mode 100644 index 00000000000..75b0ff994e1 --- /dev/null +++ b/python/lbann/trainer.py @@ -0,0 +1,25 @@ +"""LBANN Trainer.""" +import abc +from lbann import trainer_pb2 +from lbann.util import make_iterable + +class Trainer: + """LBANN Trainer.""" + + def __init__(self): + # Scalar fields + self.block_size = 256 # TODO: Make configurable + self.procs_per_trainer = 0 # TODO: Make configurable + self.num_parallel_readers = 0 # TODO: Make configurable + self.num_gpus = 1 # TODO: Make configurable + + def export_proto(self): + """Construct and return a protobuf message.""" + # Initialize protobuf message + trainer = trainer_pb2.Trainer() + trainer.block_size = self.block_size + trainer.procs_per_trainer = self.procs_per_trainer + trainer.num_parallel_readers = self.num_parallel_readers + trainer.num_gpus = self.num_gpus + + return trainer diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 133a095b0ca..3f839bb810f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,6 +9,7 @@ set_full_path(THIS_DIR_SOURCES add_subdirectory(callbacks) add_subdirectory(data_readers) add_subdirectory(data_store) +add_subdirectory(execution_contexts) add_subdirectory(io) add_subdirectory(layers) add_subdirectory(metrics) @@ -16,6 +17,8 @@ add_subdirectory(models) add_subdirectory(objective_functions) add_subdirectory(optimizers) add_subdirectory(proto) +add_subdirectory(trainers) +add_subdirectory(training_algorithms) add_subdirectory(transforms) add_subdirectory(utils) add_subdirectory(weights) diff --git a/src/base.cpp b/src/base.cpp index 89fe8abae0c..ce069824e2f 100644 --- a/src/base.cpp +++ b/src/base.cpp @@ -151,7 +151,7 @@ std::string to_string(execution_mode m) { } } -execution_mode exe_mode_from_string(std::string const& str) { +execution_mode exec_mode_from_string(std::string const& str) { if (str == "training" || str == "train") return execution_mode::training; else if (str == "validation" || str == "validate") @@ -169,7 +169,7 @@ execution_mode exe_mode_from_string(std::string 
const& str) { std::istream& operator>>(std::istream& is, execution_mode& m) { std::string tmp; is >> tmp; - m = exe_mode_from_string(tmp); + m = exec_mode_from_string(tmp); return is; } diff --git a/src/callbacks/check_dataset.cpp index 69a41b5df50..7f2a65562e2 100644 --- a/src/callbacks/check_dataset.cpp +++ b/src/callbacks/check_dataset.cpp @@ -59,16 +59,18 @@ void check_dataset::add_to_set(model *m, Layer *l, int64_t step, std::set<long>& } void check_dataset::on_forward_prop_end(model *m, Layer *l) { - add_to_set(m, l, m->get_step(), training_set); + const auto& c = m->get_execution_context(); + add_to_set(m, l, c.get_step(), training_set); } void check_dataset::on_evaluate_forward_prop_end(model *m, Layer *l) { - switch(m->get_execution_mode()) { + const auto& c = m->get_execution_context(); + switch(c.get_execution_mode()) { case execution_mode::validation: - add_to_set(m, l, m->get_step(), validation_set); + add_to_set(m, l, c.get_step(), validation_set); break; case execution_mode::testing: - add_to_set(m, l, m->get_step(), testing_set); + add_to_set(m, l, c.get_step(), testing_set); break; default: LBANN_ERROR("check_dataset: invalid execution phase"); diff --git a/src/callbacks/check_gradients.cpp index c3de1bc48e1..783742925d4 100644 --- a/src/callbacks/check_gradients.cpp +++ b/src/callbacks/check_gradients.cpp @@ -46,6 +46,7 @@ namespace { * layers. It is assumed that input layers have already loaded data. */ DataType compute_objective_function(model& m) { + const auto& c = static_cast<const sgd_execution_context&>(m.get_execution_context()); // Forward prop, skipping input layers for (auto&& l : m.get_layers()) { @@ -56,8 +57,8 @@ DataType compute_objective_function(model& m) { // Get objective function value auto&& obj = m.get_objective_function(); - const auto mode = m.get_execution_mode(); - const auto mini_batch_size = m.get_current_mini_batch_size(); + const auto mode = c.get_execution_mode(); + const auto mini_batch_size = c.get_current_mini_batch_size(); obj->start_evaluation(mode, mini_batch_size); return obj->finish_evaluation(mode, mini_batch_size); @@ -77,8 +78,9 @@ check_gradients::check_gradients(std::set<execution_mode> modes, void check_gradients::do_check_gradients(model& m) const { // Get objects from model + const auto& c = static_cast<const sgd_execution_context&>(m.get_execution_context()); auto& comm = *m.get_comm(); - const auto mode = m.get_execution_mode(); + const auto mode = c.get_execution_mode(); const auto& layers = m.get_layers(); // Return immediately if gradient check isn't currently needed
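
Each of these callback diffs applies one refactoring: per-run state (execution mode, step, epoch, mini-batch size) now lives on an execution context attached to the model rather than on the model itself. Mode and step are available on the base execution_context, while the epoch and current mini-batch size are SGD-specific and require a downcast to sgd_execution_context. A minimal sketch of the access pattern, assuming only the interface visible in these hunks (report_progress is a hypothetical helper, not part of the patch):

  // Hypothetical helper; illustrates the access pattern used by the
  // callbacks in this patch. Assumes <iostream> plus the model and
  // sgd_execution_context interfaces shown in the hunks above.
  void report_progress(const model& m) {
    // Mode and step live on the base execution_context.
    const auto& ctx = m.get_execution_context();
    const execution_mode mode = ctx.get_execution_mode();
    // Epoch and current mini-batch size are SGD-specific, hence the cast.
    const auto& sgd_ctx = static_cast<const sgd_execution_context&>(ctx);
    std::cout << to_string(mode)
              << " epoch=" << sgd_ctx.get_epoch()
              << " step=" << ctx.get_step() << std::endl;
  }

diff --git a/src/callbacks/check_init.cpp index c38bc95c6f2..0a1a16d6d48 100644 --- a/src/callbacks/check_init.cpp +++ b/src/callbacks/check_init.cpp @@ -33,8 +33,9 @@ namespace lbann { namespace callback { void check_init::on_train_begin(model *m) { + const auto& c = static_cast<const sgd_execution_context&>(m->get_execution_context()); // Skip after the first epoch.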
- if (m->get_epoch() != 0) { + if (c.get_epoch() != 0) { return; } lbann_comm *comm = m->get_comm(); diff --git a/src/callbacks/check_metric.cpp index f885a0ac19f..7aef22ea766 100644 --- a/src/callbacks/check_metric.cpp +++ b/src/callbacks/check_metric.cpp @@ -62,10 +62,11 @@ check_metric::check_metric(std::string metric_name, void check_metric::do_check_metric(const model& m) const { + const auto& c = m.get_execution_context(); std::stringstream err; // Return immediately if execution mode is invalid - const auto& mode = m.get_execution_mode(); + const auto& mode = c.get_execution_mode(); if (!m_modes.empty() && m_modes.count(mode) == 0) { return; } // Get metric diff --git a/src/callbacks/check_nan.cpp index e970c23e790..5bbbfc4f695 100644 --- a/src/callbacks/check_nan.cpp +++ b/src/callbacks/check_nan.cpp @@ -77,12 +77,13 @@ bool has_inf(const AbsDistMat& mat, El::Int& row, El::Int& col) { * necessarily have bad data, and the check is purely local. */ void dump_network(model *m) { + const auto& c = static_cast<const sgd_execution_context&>(m->get_execution_context()); for (const auto* l : m->get_layers()) { std::stringstream ss; ss << "model" << m->get_comm()->get_trainer_rank() << "-rank" << m->get_comm()->get_rank_in_trainer() - << "-epoch" << m->get_epoch() - << "-step" << m->get_step(execution_mode::training) + << "-epoch" << c.get_epoch() + << "-step" << c.get_step() << "-" << l->get_name() << "-"; const std::string prefix = ss.str(); for (int i = 0; i < l->get_num_children(); ++i) { @@ -100,8 +101,8 @@ void dump_network(model *m) { std::stringstream ss; ss << "model" << m->get_comm()->get_trainer_rank() << "-rank" << m->get_comm()->get_rank_in_trainer() - << "-epoch" << m->get_epoch() - << "-step" << m->get_step(execution_mode::training) + << "-epoch" << c.get_epoch() + << "-step" << c.get_step() << "-" << w->get_name() << "-"; const std::string prefix = ss.str(); El::Write(w->get_values().LockedMatrix(), diff --git a/src/callbacks/check_small.cpp index a91227bf143..0612c8666b6 100644 --- a/src/callbacks/check_small.cpp +++ b/src/callbacks/check_small.cpp @@ -31,18 +31,20 @@ namespace lbann { namespace callback { void check_small::on_forward_prop_end(model *m, Layer *l) { + const auto& c = m->get_execution_context(); const AbsDistMat& acts = l->get_activations(); if (!is_good(acts)) { std::stringstream ss; ss << name() << ": " << "[" << std::to_string(m->get_comm()->get_rank_in_world()) << "]: " << "error in activations of " << l->get_name() << " " - << "(step=" << std::to_string(m->get_step(execution_mode::training)) << ")"; + << "(step=" << std::to_string(c.get_step()) << ")"; throw lbann_exception(ss.str()); } } void check_small::on_backward_prop_end(model *m) { + const auto& c = m->get_execution_context(); for (weights *w : m->get_weights()) { optimizer *opt = w->get_optimizer(); if (opt != nullptr && !is_good(opt->get_gradient())) { @@ -50,20 +52,21 @@ void check_small::on_backward_prop_end(model *m) { ss << name() << ": " << "[" << std::to_string(m->get_comm()->get_rank_in_world()) << "]: " << "error in weights gradient of " << w->get_name() << " " - << "(step=" << std::to_string(m->get_step(execution_mode::training)) << ")"; + << "(step=" << std::to_string(c.get_step()) << ")"; throw lbann_exception(ss.str()); } } } void check_small::on_batch_end(model *m) { + const auto& c = m->get_execution_context(); for (weights *w : m->get_weights()) { if (!is_good(w->get_values())) { std::stringstream ss; ss << name() <<
": " << "[" << std::to_string(m->get_comm()->get_rank_in_world()) << "]: " << "error in weights of " << w->get_name() << " " - << "(step=" << std::to_string(m->get_step(execution_mode::training)-1) << ")"; + << "(step=" << std::to_string(c.get_step()-1) << ")"; throw lbann_exception(ss.str()); } } diff --git a/src/callbacks/checkpoint.cpp b/src/callbacks/checkpoint.cpp index a9fbe87fb5c..34816bbfd6b 100644 --- a/src/callbacks/checkpoint.cpp +++ b/src/callbacks/checkpoint.cpp @@ -41,35 +41,44 @@ namespace callback { // Load from checkpoint occurs during setup callbacks void checkpoint::setup(model *m) { p.set_cb_type(callback_type::invalid); + reload_model(m); +} + +// Restoring the execution context from checkpoint occurs during just +// before execution phase +void checkpoint::on_train_begin(model *m) { + p.set_cb_type(callback_type::full_checkpoint); restart(m); } + // Interval defined with checkpoint_epochs or ckpt_dist_epochs void checkpoint::on_epoch_end(model *m) { - p.set_cb_type(callback_type::epoch); - if(need_checkpoint(m)){ + p.set_cb_type(callback_type::full_checkpoint); + if(need_checkpoint(m, callback_phase::epoch)){ do_checkpoint(m); } p.set_cb_type(callback_type::invalid); } // Interval defined with checkpoint_epochs or ckpt_dist_epochs void checkpoint::on_validation_end(model *m) { - p.set_cb_type(callback_type::validation); - if(need_checkpoint(m)){ + p.set_cb_type(callback_type::full_checkpoint); + if(need_checkpoint(m, callback_phase::validation)){ do_checkpoint(m); } p.set_cb_type(callback_type::invalid); } // Interval defined with checkpoint_steps or ckpt_dist_steps void checkpoint::on_batch_end(model *m) { - p.set_cb_type(callback_type::batch); - if(need_checkpoint(m)){ + p.set_cb_type(callback_type::full_checkpoint); + if(need_checkpoint(m, callback_phase::batch)){ do_checkpoint(m); } p.set_cb_type(callback_type::invalid); } // Decide if we need to trigger a checkpoint for either mode, based on prototext defined intervals -bool checkpoint::need_checkpoint(model *m) { +bool checkpoint::need_checkpoint(model *m, callback_phase phase) { + const auto& c = static_cast(m->get_execution_context()); /* TODO: since we're using clocks, this requires a bcast for each call, * we could use number of samples processed to make a local decision */ // if none of our checkpoint conditions are set, assume we're not checkpointing @@ -84,23 +93,23 @@ bool checkpoint::need_checkpoint(model *m) { m_checkpoint_shared = false; m_checkpoint_dist = false; lbann_comm *comm = m->get_comm(); - int cur_epoch = m->get_epoch(); + int cur_epoch = c.get_epoch(); // If we are at the end of a training epoch and the training epoch lands on defined interval, ckpt - if (!m_checkpoint_shared && m_checkpoint_epochs > 0 && (p.get_cb_type() == callback_type::epoch || p.get_cb_type() == callback_type::validation)){ + if (!m_checkpoint_shared && m_checkpoint_epochs > 0 && (phase == callback_phase::epoch || phase == callback_phase::validation)){ m_checkpoint_shared = (cur_epoch > 0) && (cur_epoch % m_checkpoint_epochs == 0); } - if(!m_checkpoint_dist && m_ckpt_dist_epochs > 0 && (p.get_cb_type() == callback_type::epoch || p.get_cb_type() == callback_type::validation)){ + if(!m_checkpoint_dist && m_ckpt_dist_epochs > 0 && (phase == callback_phase::epoch || phase == callback_phase::validation)){ m_checkpoint_dist = (cur_epoch > 0) && (cur_epoch % m_ckpt_dist_epochs == 0); } // If we are at the end of a training mb step and the training mb step lands on defined interval, trigger checkpoint if (!m_checkpoint_shared 
&& m_checkpoint_steps > 0) { - m_checkpoint_shared = (m->get_step(execution_mode::training) > 0) && (m->get_step(execution_mode::training) % m_checkpoint_steps == 0); + m_checkpoint_shared = (c.get_step() > 0) && (c.get_step() % m_checkpoint_steps == 0); } if(!m_checkpoint_dist && m_ckpt_dist_steps > 0){ - m_checkpoint_dist = (m->get_step(execution_mode::training) > 0) && (m->get_step(execution_mode::training) % m_ckpt_dist_steps == 0); + m_checkpoint_dist = (c.get_step() > 0) && (c.get_step() % m_ckpt_dist_steps == 0); } // check the clock if time-based checkpoint is enabled @@ -123,6 +132,7 @@ bool checkpoint::need_checkpoint(model *m) { // Checkpoint Shared/Distributed bool checkpoint::do_checkpoint(model *m) { + auto& c = static_cast(m->get_execution_context()); // if the checkpoint directory is not defined, bail if (m_checkpoint_dir.length() == 0 && m_per_rank_dir.length() == 0) { return false; @@ -133,8 +143,8 @@ bool checkpoint::do_checkpoint(model *m) { char dir[1024]; std::string epochdir; std::string latest_file; - int epoch = -1; - int step = -1 ; + size_t epoch = std::numeric_limits::max(); + size_t step = std::numeric_limits::max(); lbann_comm *comm = m->get_comm(); // TODO: we would want to prepend dir with the model name and model rank: // m->get_name() + '.' + std::to_string(comm->get_trainer_rank()) + '.' @@ -143,10 +153,14 @@ bool checkpoint::do_checkpoint(model *m) { comm->trainer_barrier(); // let user know we're saving a checkpoint if (comm->am_trainer_master()) { - epoch = m->get_epoch(); - step = m->get_step(execution_mode::training); + epoch = c.get_epoch(); + step = c.get_step(); timer.Start(); - printf("Checkpoint: epoch %d step %d ...\n", epoch, step); + std::cout << "[" << m->get_name() + << "." << comm->get_trainer_rank() + << "] Checkpoint [" << to_string(c.get_execution_mode()) + << "] to " << m_checkpoint_dir + << " : epoch " << epoch << " step " << step << " ..." << std::endl; fflush(stdout); } comm->trainer_broadcast(0, epoch); @@ -163,33 +177,54 @@ bool checkpoint::do_checkpoint(model *m) { } makedir(dir); // create directories per ranks - epochdir = get_distributed_checkpoint_dirname(m, dir, epoch, step); + epochdir = get_distributed_checkpoint_dirname(m, dir, c.get_execution_mode(), epoch, step); + /** @todo BVE FIXME this should be refactored to only open the + checkpoints files that we care about */ p.open_checkpoint(epochdir.c_str()); // Call top level save to checkpoint function in model, in turn calls save to checkpoint functions for other model classes (weights, layers) - m->save_to_checkpoint_distributed(p); + if(p.get_cb_type() == callback_type::model_only || p.get_cb_type() == callback_type::full_checkpoint) { + m->save_to_checkpoint_distributed(p); + } + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint) { + auto save_checkpoint = [this](observer_ptr ctx) + ->void { ctx->save_to_checkpoint_distributed(this->p); }; + c.get_trainer().for_each_execution_context(save_checkpoint); + } p.close_checkpoint(); // Print latest checkpoint to file if (comm->am_trainer_master()) { latest_file = get_last_distributed_checkpoint_filename(m, dir); - write_latest(latest_file, epoch, step); + write_latest(latest_file, c.get_execution_mode(), epoch, step); } } - // Shared checkpoint, logic identical to Distributed.i + // Shared checkpoint, logic identical to Distributed. 
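   // Illustrative round trip of the new "latest" bookkeeping (both
   // signatures appear in this series; last.shared.checkpoint is the file
   // named in the restart logic further down):
   //
   //   write_latest(latest_file, c.get_execution_mode(), epoch, step);
   //   execution_mode mode; size_t e = 0, s = 0;
   //   read_latest(latest_file, &mode, &e, &s);  // recovers mode, epoch, step
   //
   // Recording the mode is what lets a restart reconstruct the checkpoint
   // directory name, which now takes the execution mode as a parameter.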
if(m_checkpoint_shared){ strcpy(dir, m_checkpoint_dir.c_str()); makedir(dir); - epochdir = get_shared_checkpoint_dirname(m, dir, epoch, step); + epochdir = get_shared_checkpoint_dirname(m, dir, c.get_execution_mode(), epoch, step); if (comm->am_trainer_master()) { p.open_checkpoint(epochdir.c_str()); + }else { + // Need to give other ranks knowledge of checkpoint dir for writing of rank specific rng state + p.m_checkpoint_dir = epochdir; + } + // Make sure that the master has had a chance to create the directories + comm->trainer_barrier(); + if(p.get_cb_type() == callback_type::model_only || p.get_cb_type() == callback_type::full_checkpoint) { + m->save_to_checkpoint_shared(p); + } + if(p.get_cb_type() == callback_type::execution_context_only + || p.get_cb_type() == callback_type::full_checkpoint) { + auto save_checkpoint = [this](observer_ptr ctx) + ->void { ctx->save_to_checkpoint_shared(this->p); }; + c.get_trainer().for_each_execution_context(save_checkpoint); } - // Need to give other ranks knowledge of checkpoint dir for writing of rank specific rng state - comm->trainer_broadcast(0, &(p.m_checkpoint_dir[0]), sizeof(p.m_checkpoint_dir)); - m->save_to_checkpoint_shared(p); // close our checkpoint p.close_checkpoint(); if (comm->am_trainer_master()) { latest_file = get_last_shared_checkpoint_filename(m, dir); - write_latest(latest_file, epoch, step); + write_latest(latest_file, c.get_execution_mode(), epoch, step); } } @@ -201,8 +236,14 @@ bool checkpoint::do_checkpoint(model *m) { if (secs > 0.0) { bw = EvalType(bytes_count) / (secs * 1024.0 * 1024.0); } - printf("[%s.%d] Checkpoint complete: Epoch=%d Step=%d (%f secs, %llu bytes, %f MB/sec)\n", - m->get_name().c_str(), comm->get_trainer_rank(), epoch, step, secs, (unsigned long long) bytes_count, bw); + std::cout << "[" << m->get_name() + << "." << comm->get_trainer_rank() + << "] Checkpoint [" << to_string(c.get_execution_mode()) + << "] to " << m_checkpoint_dir + << " complete: Epoch=" << epoch + << " Step=" << step + << " (" << secs << " secs, " << bytes_count << " bytes, " + << bw << " MB/sec)" << std::endl; fflush(stdout); } // record last checkpoint time in case checkpoint_secs interval defined. @@ -211,33 +252,23 @@ bool checkpoint::do_checkpoint(model *m) { return true; } -// Restart Shared/Distributed -bool checkpoint::restart(model *m) { - // if the checkpoint directory is not defined, bail - if (m_checkpoint_dir.length() == 0 && m_per_rank_dir.length() == 0) { - return false; - } +std::string checkpoint::find_latest_checkpoint(model *m, std::string& latest_file, execution_mode& mode, size_t &epoch, size_t& step, int& shared) { constexpr unsigned int max_len_dirname = 1024; - // get top level directory char dir[max_len_dirname]; - std::string latest_file; - int epoch = -1; - int step = -1; - int epoch_dist = -1; - int step_dist = -1; + size_t epoch_dist = 0; + size_t step_dist = 0; lbann_comm *comm = m->get_comm(); - int shared = 1; // Grab latest checkpoint information, checks for latest in dist and shared, restarts from most recent between the two. 
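   // Roughly, the selection implemented below (only the trainer master
   // reads the files; the outcome is then broadcast in a header_t):
   //
   //   read_latest(dist_latest,   &mode, &epoch_dist, &step_dist); // per-rank dir
   //   read_latest(shared_latest, &mode, &epoch,      &step);      // shared dir
   //   // whichever record is newer wins; `shared` is cleared when the
   //   // distributed checkpoint is the more recent of the two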
if (comm->am_trainer_master()) { if(m_per_rank_dir.length()){ snprintf(dir, sizeof(dir), "%s/%s", m_per_rank_dir.c_str(), m_checkpoint_dir.c_str()); latest_file = get_last_distributed_checkpoint_filename(m, dir); - read_latest(latest_file, &epoch, &step); + read_latest(latest_file, &mode, &epoch_dist, &step_dist); } if(m_checkpoint_dir.length()){ strcpy(dir, m_checkpoint_dir.c_str()); latest_file = get_last_shared_checkpoint_filename(m, dir); - read_latest(latest_file, &epoch, &step); + read_latest(latest_file, &mode, &epoch, &step); } if(epoch > epoch_dist){ @@ -261,26 +292,57 @@ bool checkpoint::restart(model *m) { #if 1 header_t header; - header.epoch = epoch; - header.step = step; - header.shared = shared; - memcpy(header.dirname, dir, sizeof(dir)); + if (comm->am_trainer_master()) { + header.mode = mode; + header.epoch = epoch; + header.step = step; + header.shared = shared; + memcpy(header.dirname, dir, sizeof(dir)); + } comm->trainer_broadcast(0, header); - epoch = header.epoch; - step = header.step; - shared = header.shared; - memcpy(dir, header.dirname, sizeof(dir)); + if (!comm->am_trainer_master()) { + mode = header.mode; + epoch = header.epoch; + step = header.step; + shared = header.shared; + memcpy(dir, header.dirname, sizeof(dir)); + } #else comm->trainer_broadcast(0, epoch); comm->trainer_broadcast(0, step); comm->trainer_broadcast(0, shared); comm->trainer_broadcast(0, &(dir[0]), sizeof(dir)); #endif + return dir; +} + +// Open latest Shared/Distributed checkpoint +bool checkpoint::open_latest_checkpoint( + model *m, + const std::string& task_label, + std::function reload_shared_ckpt, + std::function reload_distributed_ckpt) { + // if the checkpoint directory is not defined, bail + if (m_checkpoint_dir.length() == 0 && m_per_rank_dir.length() == 0) { + return false; + } + + // constexpr unsigned int max_len_dirname = 1024; + // get top level directory + // char dir[max_len_dirname]; + std::string latest_file; + size_t epoch = std::numeric_limits::max(); + size_t step = std::numeric_limits::max(); + int shared = 1; + execution_mode mode; + lbann_comm *comm = m->get_comm(); + + std::string dir = find_latest_checkpoint(m, latest_file, mode, epoch, step, shared); // if we couldn't find the latest epoch, just return - if (epoch < 0) { + if (epoch == std::numeric_limits::max()) { return false; } // time how long this takes @@ -288,28 +350,31 @@ bool checkpoint::restart(model *m) { // let user know we're restarting from a checkpoint if (comm->am_trainer_master()) { timer.Start(); - printf("Restart: epoch %d ...\n", epoch); - fflush(stdout); + std::cout << task_label << "ing from " << m_checkpoint_dir << " : epoch " << epoch << " ..." 
<< std::endl; } std::string epochdir; // Create dir to restart from based off last recorded checkpoint (or overriden values in last.shared[distributed].checkpoint if(!shared){ - epochdir = get_distributed_checkpoint_dirname(m, dir, epoch, step); + epochdir = get_distributed_checkpoint_dirname(m, dir, mode, epoch, step); p.open_restart(epochdir.c_str()); - m->load_from_checkpoint_distributed(p); + reload_distributed_ckpt(p); p.close_restart(); } else { - epochdir = get_shared_checkpoint_dirname(m, dir, epoch, step); - if (comm->am_trainer_master()) { - p.open_restart(epochdir.c_str()); - } - // Ensure all ranks have access to checkpoint dir, needed for loading rank specific rng state - comm->trainer_broadcast(0, &(p.m_checkpoint_dir[0]), sizeof(p.m_checkpoint_dir)); - m->load_from_checkpoint_shared(p); - if(comm->am_trainer_master()) - p.close_restart(); + epochdir = get_shared_checkpoint_dirname(m, dir, mode, epoch, step); + // if (comm->am_trainer_master()) { + /// @todo For the moment let all ranks open the checkpoint files + p.open_restart(epochdir.c_str()); + // } else { + // // Ensure all ranks have access to checkpoint dir, needed for loading rank specific rng state + // p.m_checkpoint_dir = epochdir; + // } + reload_shared_ckpt(p); + // if(comm->am_trainer_master()) { + /// @todo For the moment let all ranks open the checkpoint files + p.close_restart(); + // } } // close our checkpoint @@ -321,15 +386,116 @@ bool checkpoint::restart(model *m) { if (secs > 0.0) { bw = EvalType(bytes_count) / (secs * 1024.0 * 1024.0); } - printf("[%s.%d] Restart complete: Epoch=%d Step=%d (%f secs, %llu bytes, %f MB/sec)\n", - m->get_name().c_str(), comm->get_trainer_rank(), epoch, step, secs, (unsigned long long) bytes_count, bw - ); + std::cout << "[" << m->get_name() + << "." << comm->get_trainer_rank() + << "] " << task_label + << " from " << m_checkpoint_dir + << " complete: Epoch=" << epoch + << " Step=" << step + << " (" << secs << " secs, " << bytes_count << " bytes, " + << bw << " MB/sec)" << std::endl; fflush(stdout); } p.reset_bytes(); return true; } +// Reload a model from a Shared/Distributed checkpoint +bool checkpoint::reload_model(model *m) { + auto reload_shared_model = std::function + ([m](/*const */persist& p_ref) + ->void { + m->load_from_checkpoint_shared(p_ref); + return; + }); + + auto reload_distributed_model = std::function + ([m](/*const */persist& p_ref) + ->void { + m->load_from_checkpoint_distributed(p_ref); + return; + }); + + + open_latest_checkpoint(m, "Reload", reload_shared_model, reload_distributed_model); + + return true; +} + + +// Restart previously saved Shared/Distributed execution contexts +bool checkpoint::restart(model *m) { + // This function needs to read the checkpoint to see what execution + // contexts exists and create a valid execution context for each + // one. 
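  // Concretely, in the lambdas below: the mode that is currently running is
  // loaded in place through the execution context already attached to the
  // model; every other non-invalid mode gets a context built speculatively
  // with check_and_build_execution_context() and loaded from the same
  // checkpoint. A NonexistentArchiveFile for a non-current mode simply means
  // that mode was never checkpointed, so the speculative context is deleted
  // again; for the current mode it is a fatal error.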
+ // Then setup the model with the proper one + auto& c = static_cast(m->get_execution_context()); + + auto restart_shared_model = [&m, &c](/*const */persist& p_ref) + ->void { + execution_mode current_mode = c.get_execution_mode(); + + for(execution_mode mode : execution_mode_iterator()) { + /// Restart should optionally load any other valid contexts + if(mode == execution_mode::invalid) { continue; } + trainer::execution_context_key_pair_t key; + try { + if(current_mode == mode) { + /// Restart has to be able to load the currently running execution context + c.load_from_checkpoint_shared(p_ref); + }else { + key = c.get_trainer().check_and_build_execution_context(c, *m, mode); + auto& evaluation_context = static_cast(c.get_trainer().get_execution_context(key)); + evaluation_context.load_from_checkpoint_shared(p_ref); + } + }catch (NonexistentArchiveFile const&) { + // Ignore the exception if the file is not for the current execution mode + if(current_mode == mode) { + LBANN_ERROR("Failed to restart model, invalid execution mode: " + to_string(current_mode)); + }else { + c.get_trainer().delete_execution_context(key); + } + } + } + return; + }; + + auto restart_distributed_model = [&m, &c](/*const */persist& p_ref) + ->void { + execution_mode current_mode = c.get_execution_mode(); + + for(execution_mode mode : execution_mode_iterator()) { + /// Restart should optionally load any other valid contexts + if(mode == execution_mode::invalid) { continue; } + trainer::execution_context_key_pair_t key; + try { + if(current_mode == mode) { + /// Restart has to be able to load the currently running execution context + c.load_from_checkpoint_distributed(p_ref); + }else { + key = c.get_trainer().check_and_build_execution_context(c, *m, mode); + auto& evaluation_context = static_cast(c.get_trainer().get_execution_context(key)); + evaluation_context.load_from_checkpoint_distributed(p_ref); + } + }catch (NonexistentArchiveFile const&) { + // Ignore the exception if the file is not for the current execution mode + if(current_mode == mode) { + LBANN_ERROR("Failed to restart model, invalid execution mode: " + to_string(current_mode)); + }else { + c.get_trainer().delete_execution_context(key); + } + } + + } + return; + }; + + + open_latest_checkpoint(m, "Restart", restart_shared_model, restart_distributed_model); + + return true; +} + std::unique_ptr build_checkpoint_callback_from_pbuf( const google::protobuf::Message& proto_msg, const std::shared_ptr&) { diff --git a/src/callbacks/confusion_matrix.cpp b/src/callbacks/confusion_matrix.cpp index 3406860e4d4..c43e9ab7b0d 100644 --- a/src/callbacks/confusion_matrix.cpp +++ b/src/callbacks/confusion_matrix.cpp @@ -133,7 +133,8 @@ const AbsDistMat& confusion_matrix::get_labels(const model& m) const { // --------------------------------------------------------- void confusion_matrix::reset_counts(const model& m) { - auto& counts = m_counts[m.get_execution_mode()]; + const auto& c = m.get_execution_context(); + auto& counts = m_counts[c.get_execution_mode()]; const auto& num_classes = get_predictions(m).Height(); counts.assign(num_classes * num_classes, 0); } @@ -165,7 +166,8 @@ void confusion_matrix::update_counts(const model& m) { const auto& local_labels = m_labels_v->LockedMatrix(); // Update counts - auto& counts = m_counts[m.get_execution_mode()]; + const auto& c = m.get_execution_context(); + auto& counts = m_counts[c.get_execution_mode()]; const auto& local_height = local_predictions.Height(); const auto& local_width = local_predictions.Width(); for (El::Int 
local_col = 0; local_col < local_width; ++local_col) { @@ -187,9 +189,10 @@ void confusion_matrix::update_counts(const model& m) { } void confusion_matrix::save_confusion_matrix(const model& m) { + const auto& c = static_cast(m.get_execution_context()); // Get counts - const auto& mode = m.get_execution_mode(); + const auto& mode = c.get_execution_mode(); auto& counts = m_counts[mode]; // Accumulate counts in master process @@ -216,10 +219,10 @@ void confusion_matrix::save_confusion_matrix(const model& m) { std::string mode_string; switch (mode) { case execution_mode::training: - mode_string = "train-epoch" + std::to_string(m.get_epoch()); + mode_string = "train-epoch" + std::to_string(c.get_epoch()); break; case execution_mode::validation: - mode_string = "validation-epoch" + std::to_string(m.get_epoch()); + mode_string = "validation-epoch" + std::to_string(c.get_epoch()); break; case execution_mode::testing: mode_string = "test"; diff --git a/src/callbacks/debug.cpp b/src/callbacks/debug.cpp index 117a0e97a90..992efc8e943 100644 --- a/src/callbacks/debug.cpp +++ b/src/callbacks/debug.cpp @@ -65,10 +65,11 @@ std::string weights_string(const weights& w) { /** Get human-readable string describing current batch step. */ std::string batch_step_string(const model& m) { + const auto& c = static_cast(m.get_execution_context()); std::stringstream msg; - const auto& mode = m.get_execution_mode(); - msg << to_string(mode) << " batch " << m.get_step(); - msg << " (epoch " << m.get_epoch() << ")"; + const auto& mode = c.get_execution_mode(); + msg << to_string(mode) << " batch " << c.get_step(); + msg << " (epoch " << c.get_epoch() << ")"; return msg.str(); } @@ -76,7 +77,8 @@ std::string batch_step_string(const model& m) { // Status updates for batch beginnings/endings void debug::on_batch_begin(model *m) { - if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { + const auto& c = m->get_execution_context(); + if(m_modes.empty() || m_modes.count(c.get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << "starting " << batch_step_string(*m) << std::endl; @@ -84,7 +86,8 @@ void debug::on_batch_begin(model *m) { } } void debug::on_batch_end(model *m) { - if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { + const auto& c = m->get_execution_context(); + if(m_modes.empty() || m_modes.count(c.get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << "ending " << batch_step_string(*m) << std::endl; @@ -100,7 +103,8 @@ void debug::on_batch_evaluate_end(model *m) { // Status updates for beginning/ending of layer forward/backward prop void debug::on_forward_prop_begin(model *m, Layer *l) { - if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { + const auto& c = m->get_execution_context(); + if(m_modes.empty() || m_modes.count(c.get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << layer_string(*l) << " is starting forward prop for " << batch_step_string(*m) @@ -109,7 +113,8 @@ void debug::on_forward_prop_begin(model *m, Layer *l) { } } void debug::on_forward_prop_end(model *m, Layer *l) { - if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { + const auto& c = m->get_execution_context(); + if(m_modes.empty() || m_modes.count(c.get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << layer_string(*l) << " is ending forward prop for " << batch_step_string(*m) @@ -118,7 +123,8 @@ void 
debug::on_forward_prop_end(model *m, Layer *l) { } } void debug::on_backward_prop_begin(model *m, Layer *l) { - if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { + const auto& c = m->get_execution_context(); + if(m_modes.empty() || m_modes.count(c.get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << layer_string(*l) << " is starting backward prop for " << batch_step_string(*m) @@ -127,7 +133,8 @@ void debug::on_backward_prop_begin(model *m, Layer *l) { } } void debug::on_backward_prop_end(model *m, Layer *l) { - if(m_modes.empty() || m_modes.count(m->get_execution_mode()) > 0) { + const auto& c = m->get_execution_context(); + if(m_modes.empty() || m_modes.count(c.get_execution_mode()) > 0) { std::stringstream msg; msg << rank_string(*m->get_comm()) << ": " << layer_string(*l) << " is ending backward prop for " << batch_step_string(*m) diff --git a/src/callbacks/debug_io.cpp b/src/callbacks/debug_io.cpp index e55f7a884d5..c5b5712f828 100644 --- a/src/callbacks/debug_io.cpp +++ b/src/callbacks/debug_io.cpp @@ -51,8 +51,9 @@ void debug_io::on_forward_prop_begin(model *m, Layer *l) { return; } + const auto& c = m->get_execution_context(); if(m->get_comm()->get_rank_in_trainer() < input->get_data_reader()->get_num_parallel_readers()) { - if(m_debug_phase == execution_mode::invalid || m_debug_phase == m->get_execution_mode()) { + if(m_debug_phase == execution_mode::invalid || m_debug_phase == c.get_execution_mode()) { print_fp_start(m, input); } } @@ -62,11 +63,12 @@ void debug_io::on_forward_prop_begin(model *m, Layer *l) { } void debug_io::print_fp_start(model *m, generic_input_layer *input) { - const auto& step = m->get_step(); + const auto& c = static_cast(m->get_execution_context()); + const auto& step = c.get_step(); std::cout << "[" << m->get_comm()->get_trainer_rank() << "." << m->get_comm()->get_rank_in_trainer() - << "] @" << m->get_epoch() << "." << step - << " Phase: " << to_string(m->get_execution_mode()) + << "] @" << c.get_epoch() << "." 
<< step + << " Phase: " << to_string(c.get_execution_mode()) << " starting forward propagation for layer " << input->get_name() << " type: " << input->get_type() << " iteration: " << input->get_data_reader()->get_current_mini_batch_index() @@ -82,7 +84,7 @@ void debug_io::print_fp_start(model *m, generic_input_layer *input) { // 179i @ 300s (=5m*60s) + 1i @ 100s (=5m*45s):offset <- num models void debug_io::print_phase_start(model *m, execution_mode mode) { - + const auto& c = m->get_execution_context(); // Get data reader from first input layer in model generic_data_reader* data_reader = nullptr; for (auto&& l : m->get_layers()) { @@ -94,7 +96,7 @@ void debug_io::print_phase_start(model *m, execution_mode mode) { } if (data_reader == nullptr) { return; } - const auto& step = m->get_step(); + const auto& step = c.get_step(); if(data_reader->get_rank() < data_reader->get_num_parallel_readers()) { std::cout << "[" << m->get_comm()->get_trainer_rank() @@ -139,13 +141,14 @@ void debug_io::on_validation_begin(model *m) { } void debug_io::on_evaluate_forward_prop_begin(model *m, Layer *l) { + const auto& c = m->get_execution_context(); auto *input = dynamic_cast(l); if (input == nullptr || m_debug_lvl < 1) { return; } if(m->get_comm()->get_rank_in_trainer() < input->get_data_reader()->get_num_parallel_readers()) { - if(m_debug_phase == execution_mode::invalid || m_debug_phase == m->get_execution_mode()) { + if(m_debug_phase == execution_mode::invalid || m_debug_phase == c.get_execution_mode()) { print_fp_start(m, input); } } @@ -165,7 +168,7 @@ build_debug_io_callback_from_pbuf( const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); - const auto& phase = exe_mode_from_string(params.phase()); + const auto& phase = exec_mode_from_string(params.phase()); const auto& lvl = params.lvl(); switch (phase) { case execution_mode::training: diff --git a/src/callbacks/dump_error_signals.cpp b/src/callbacks/dump_error_signals.cpp index eee0be5bfcf..647b72e0653 100644 --- a/src/callbacks/dump_error_signals.cpp +++ b/src/callbacks/dump_error_signals.cpp @@ -32,6 +32,7 @@ namespace lbann { namespace callback { void dump_error_signals::on_backward_prop_end(model *m, Layer *l) { + const auto& c = static_cast(m->get_execution_context()); // Write each activation matrix to file for (int i = 0; i < l->get_num_parents(); ++i) { @@ -40,8 +41,8 @@ void dump_error_signals::on_backward_prop_end(model *m, Layer *l) { std::stringstream file; file << m_basename << "model" << m->get_comm()->get_trainer_rank() << "-" - << "epoch" << m->get_epoch() << "-" - << "step" << m->get_step() << "-" + << "epoch" << c.get_epoch() << "-" + << "step" << c.get_step() << "-" << l->get_name() << "-" << "ErrorSignals"; if (l->get_num_parents() > 1) { file << i; } diff --git a/src/callbacks/dump_gradients.cpp b/src/callbacks/dump_gradients.cpp index f8ed1d23679..4268fb7928e 100644 --- a/src/callbacks/dump_gradients.cpp +++ b/src/callbacks/dump_gradients.cpp @@ -36,14 +36,15 @@ namespace lbann { namespace callback { void dump_gradients::on_backward_prop_end(model *m) { + const auto& c = static_cast(m->get_execution_context()); for (weights *w : m->get_weights()) { optimizer *opt = w->get_optimizer(); if (opt != nullptr) { const std::string file = (m_basename + "model" + std::to_string(m->get_comm()->get_trainer_rank()) - + "-epoch" + std::to_string(m->get_epoch()) - + "-step" + std::to_string(m->get_step()) + + "-epoch" + std::to_string(c.get_epoch()) + + "-step" + 
std::to_string(c.get_step()) + "-" + w->get_name() + "-Gradient"); El::Write(opt->get_gradient(), file, El::ASCII); diff --git a/src/callbacks/dump_minibatch_sample_indices.cpp b/src/callbacks/dump_minibatch_sample_indices.cpp index d4b29d06b1c..2ae01725a36 100644 --- a/src/callbacks/dump_minibatch_sample_indices.cpp +++ b/src/callbacks/dump_minibatch_sample_indices.cpp @@ -40,6 +40,7 @@ namespace lbann { namespace callback { void dump_minibatch_sample_indices::dump_to_file(model *m, Layer *l, int64_t step) { + const auto& c = static_cast(m->get_execution_context()); // Print minibatch sample indices of input layers auto *input = dynamic_cast(l); if (input != nullptr) { @@ -59,11 +60,11 @@ void dump_minibatch_sample_indices::dump_to_file(model *m, Layer *l, int64_t ste const std::string file = (m_basename - + to_string(m->get_execution_mode()) + + to_string(c.get_execution_mode()) + "-model" + std::to_string(m->get_comm()->get_trainer_rank()) + "-rank" + std::to_string(m->get_comm()->get_rank_in_trainer()) - + "-epoch" + std::to_string(m->get_epoch()) - + "-step" + std::to_string(m->get_step(execution_mode::training)) + + "-epoch" + std::to_string(c.get_epoch()) + + "-step" + std::to_string(c.get_step()) + "-" + l->get_name() + "-MB_Sample_Indices"); El::Write(*indices, file, El::ASCII); @@ -71,11 +72,13 @@ void dump_minibatch_sample_indices::dump_to_file(model *m, Layer *l, int64_t ste } void dump_minibatch_sample_indices::on_forward_prop_end(model *m, Layer *l) { - dump_to_file(m, l, m->get_step()); + const auto& c = m->get_execution_context(); + dump_to_file(m, l, c.get_step()); } void dump_minibatch_sample_indices::on_evaluate_forward_prop_end(model *m, Layer *l) { - dump_to_file(m, l, m->get_step()); + const auto& c = m->get_execution_context(); + dump_to_file(m, l, c.get_step()); } std::unique_ptr diff --git a/src/callbacks/dump_outputs.cpp b/src/callbacks/dump_outputs.cpp index 38ffd3f7149..a7b233389d7 100644 --- a/src/callbacks/dump_outputs.cpp +++ b/src/callbacks/dump_outputs.cpp @@ -135,11 +135,12 @@ dump_outputs::dump_outputs(std::set layer_names, } void dump_outputs::do_dump_outputs(const model& m, const Layer& l) { + const auto& c = static_cast(m.get_execution_context()); // Get mini-batch step information - const auto& mode = m.get_execution_mode(); - const auto& epoch = m.get_epoch(); - const auto& step = m.get_step(); + const auto& mode = c.get_execution_mode(); + const auto& epoch = c.get_epoch(); + const auto& step = c.get_step(); // Quit if output dump isn't needed if (!m_modes.empty() && m_modes.count(mode) == 0) { return; } diff --git a/src/callbacks/dump_weights.cpp b/src/callbacks/dump_weights.cpp index 8ba2644fc3a..f1f20dd1468 100644 --- a/src/callbacks/dump_weights.cpp +++ b/src/callbacks/dump_weights.cpp @@ -45,14 +45,16 @@ void dump_weights::on_epoch_end(model *m) { } void dump_weights::do_dump_weights(model *m, std::string s) { + const auto& c = static_cast(m->get_execution_context()); + makedir(m_basename.c_str()); for (weights *w : m->get_weights()) { - std::string epoch = "-epoch" + std::to_string(m->get_epoch()-1); + std::string epoch = "-epoch" + std::to_string(c.get_epoch()-1); if(s != "") { epoch = "-" + s; } const std::string file = (m_basename - + "model" + std::to_string(m->get_comm()->get_trainer_rank()) + + "/model" + std::to_string(m->get_comm()->get_trainer_rank()) + epoch + "-" + w->get_name() + "-Weights"); diff --git a/src/callbacks/early_stopping.cpp b/src/callbacks/early_stopping.cpp index 54f7b2495fa..218845cc132 100644 --- 
a/src/callbacks/early_stopping.cpp +++ b/src/callbacks/early_stopping.cpp @@ -41,7 +41,8 @@ early_stopping::early_stopping(int64_t patience) : /// Monitor the objective function to see if the validation score /// continues to improve void early_stopping::on_validation_end(model *m) { - execution_mode mode = m->get_execution_mode(); + auto& c = m->get_execution_context(); + execution_mode mode = c.get_execution_mode(); EvalType score = m->get_objective_function()->get_mean_value(mode); if (score < m_last_score) { if (m->get_comm()->am_trainer_master()) { @@ -53,7 +54,7 @@ void early_stopping::on_validation_end(model *m) { m_wait = 0; } else { if (m_wait >= m_patience) { - m->set_terminate_training(true); + c.set_terminate_training(true); if (m->get_comm()->am_trainer_master()) { std::cout << "Model " << m->get_comm()->get_trainer_rank() << " terminating training due to early stopping: " << score << diff --git a/src/callbacks/imcomm.cpp b/src/callbacks/imcomm.cpp index 779fcc08842..a7726a24f8e 100644 --- a/src/callbacks/imcomm.cpp +++ b/src/callbacks/imcomm.cpp @@ -98,9 +98,10 @@ void imcomm::on_train_begin(model *m) { } void imcomm::on_backward_prop_end(model *m) { + const auto& c = m->get_execution_context(); lbann_comm *comm = m->get_comm(); if (comm->get_num_trainers() == 1 || - m->get_execution_mode() != execution_mode::training) { + c.get_execution_mode() != execution_mode::training) { return; // No point with only one model. } for (weights *w : m->get_weights()) { @@ -131,9 +132,10 @@ void imcomm::do_summary(model *m, weights *w, if (m_summarizer == nullptr) { return; } + const auto& c = m->get_execution_context(); std::string prefix = w->get_name() + "/imcomm_"; m_summarizer->reduce_scalar(prefix + "time", - im_time, m->get_step(execution_mode::training)); + im_time, c.get_step()); // Use the same approximation the comm layer does. 
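  // That is, traffic is estimated from the local gradient matrix rather
  // than measured on the wire:
  //
  //   bytes ~ sizeof(DataType) * local_gradients.Height() * local_gradients.Width()
  //
  // and the same estimate is reported for bytes_sent and bytes_received.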
const CPUMat& local_gradients = static_cast(w->get_optimizer()->get_gradient().LockedMatrix()); @@ -142,9 +144,9 @@ void imcomm::do_summary(model *m, weights *w, size_t bytes_received = sizeof(DataType) * local_gradients.Height() * local_gradients.Width(); m_summarizer->reduce_scalar(prefix + "bytes_sent", - bytes_sent, m->get_step(execution_mode::training)); + bytes_sent, c.get_step()); m_summarizer->reduce_scalar(prefix + "bytes_received", - bytes_received, m->get_step(execution_mode::training)); + bytes_received, c.get_step()); } static std::vector comm_type_names = { "none", "normal" }; diff --git a/src/callbacks/learning_rate.cpp b/src/callbacks/learning_rate.cpp index 9d3df1468ee..49e5c069a4d 100644 --- a/src/callbacks/learning_rate.cpp +++ b/src/callbacks/learning_rate.cpp @@ -78,6 +78,7 @@ void learning_rate::setup(model *m) { } void learning_rate::on_epoch_end(model *m) { + const auto& c = static_cast(m->get_execution_context()); const float new_lr = global_schedule(m); const float old_global_lr = m_cur_global_lr; m_cur_global_lr = new_lr; @@ -85,7 +86,7 @@ void learning_rate::on_epoch_end(model *m) { if (comm->am_trainer_master() && new_lr != old_global_lr) { std::cout << "Model " << comm->get_trainer_rank() << ": " << "changing global learning rate to " << new_lr - << " at epoch " << m->get_epoch() << std::endl; + << " at epoch " << c.get_epoch() << std::endl; } for (weights *w : this->get_weights()) { optimizer *opt = w->get_optimizer(); @@ -108,16 +109,17 @@ void learning_rate::on_backward_prop_end(model *m) { } step_learning_rate::step_learning_rate( - int step, float amt) : + size_t step, float amt) : learning_rate(), m_step(step), m_amt(amt) {} step_learning_rate::step_learning_rate( - int step, float amt, std::vector weights_names) : + size_t step, float amt, std::vector weights_names) : learning_rate(std::move(weights_names)), m_step(step), m_amt(amt) {} float step_learning_rate::global_schedule(model *m) { - if (m->get_epoch() % m_step == 0) { + const auto& c = static_cast(m->get_execution_context()); + if (c.get_epoch() % m_step == 0) { return get_current_global_learning_rate() * m_amt; } else { return get_current_global_learning_rate(); @@ -125,20 +127,21 @@ float step_learning_rate::global_schedule(model *m) { } adaptive_learning_rate::adaptive_learning_rate( - int64_t patience, float amt) : + size_t patience, float amt) : adaptive_learning_rate(patience, amt, std::vector()) {} adaptive_learning_rate::adaptive_learning_rate( - int64_t patience, float amt, std::vector weights_list) : + size_t patience, float amt, std::vector weights_list) : learning_rate(std::move(weights_list)), m_patience(patience), m_amt(amt) {} float adaptive_learning_rate::global_schedule(model *m) { + const auto& c = static_cast(m->get_execution_context()); // Determine behavior the first time this is called in an epoch - if (m_cur_epoch != m->get_epoch()) { - m_cur_epoch = m->get_epoch(); - const execution_mode mode = m->get_execution_mode(); + if (m_cur_epoch != c.get_epoch()) { + m_cur_epoch = c.get_epoch(); + const auto mode = c.get_execution_mode(); const EvalType score = m->get_objective_function()->get_mean_value(mode); if (score < m_last_score) { // Reset wait counter if score has decreased @@ -166,12 +169,12 @@ float adaptive_learning_rate::global_schedule(model *m) { } drop_fixed_learning_rate::drop_fixed_learning_rate( - std::vector drop_epochs, float amt) : + std::vector drop_epochs, float amt) : drop_fixed_learning_rate(std::move(drop_epochs), amt, std::vector()) {} 
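// Worked example of the drop-fixed schedule defined below: with a base rate
// of 0.1, m_amt = 0.1, and drop_epochs = {30, 60}, the rate stays at 0.1
// through epoch 29, drops to 0.01 at epoch 30, and to 0.001 at epoch 60.
// Keeping m_drop_epochs sorted in reverse means the next pending drop is
// always at the back of the vector.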
drop_fixed_learning_rate::drop_fixed_learning_rate( - std::vector drop_epochs, float amt, std::vector weights_names) : + std::vector drop_epochs, float amt, std::vector weights_names) : learning_rate(std::move(weights_names)), m_amt(amt), m_drop_epochs(std::move(drop_epochs)) { // Sort in reverse order. @@ -179,14 +182,15 @@ drop_fixed_learning_rate::drop_fixed_learning_rate( } float drop_fixed_learning_rate::global_schedule(model* m) { + const auto& c = static_cast(m->get_execution_context()); // Delete last drop epoch if we have already passed it while (!m_drop_epochs.empty() - && m->get_epoch() > m_drop_epochs.back()) { + && c.get_epoch() > m_drop_epochs.back()) { m_drop_epochs.pop_back(); } // Adjust learning rate if at a drop epoch - if (!m_drop_epochs.empty() && m->get_epoch() == m_drop_epochs.back()) { + if (!m_drop_epochs.empty() && c.get_epoch() == m_drop_epochs.back()) { return get_current_global_learning_rate() * m_amt; } else { return get_current_global_learning_rate(); @@ -194,17 +198,17 @@ float drop_fixed_learning_rate::global_schedule(model* m) { } linear_growth_learning_rate::linear_growth_learning_rate( - float target, int64_t num_epochs) : + float target, size_t num_epochs) : linear_growth_learning_rate(target, num_epochs, 0, std::vector()) {} linear_growth_learning_rate::linear_growth_learning_rate( - float target, int64_t num_epochs, int64_t delay) : + float target, size_t num_epochs, size_t delay) : linear_growth_learning_rate(target, num_epochs, delay, std::vector()) {} linear_growth_learning_rate::linear_growth_learning_rate( - float target, int64_t num_epochs, int64_t delay, + float target, size_t num_epochs, size_t delay, std::vector weights_names) : learning_rate(std::move(weights_names)), m_target(target), m_inc(0), @@ -221,10 +225,11 @@ void linear_growth_learning_rate::setup(model *m) { } float linear_growth_learning_rate::global_schedule(model *m) { - if (m->get_epoch() < m_delay) { + const auto& c = static_cast(m->get_execution_context()); + if (c.get_epoch() < m_delay) { return get_current_global_learning_rate(); - } else if (m->get_epoch() <= m_num_epochs + m_delay) { - int num_left = m_num_epochs + m_delay - m->get_epoch(); + } else if (c.get_epoch() <= m_num_epochs + m_delay) { + int num_left = m_num_epochs + m_delay - c.get_epoch(); return m_base_lr + m_inc*(m_num_epochs - num_left); } else { return get_current_global_learning_rate(); @@ -238,14 +243,14 @@ float linear_growth_learning_rate::global_schedule(model *m) { * epochs (n_epochs). n_epochs is not used otherwise. */ poly_learning_rate::poly_learning_rate( - double p, uint64_t n_epochs, uint64_t max_iter) + double p, size_t n_epochs, size_t max_iter) : learning_rate(std::vector()), m_p(p), m_num_epochs(n_epochs), m_max_iter(max_iter), m_end_lr(0.0f), m_lr(1.0f), m_last_epoch_lr(1.0f) {} poly_learning_rate::poly_learning_rate( - double p, uint64_t n_epochs, uint64_t max_iter, double end_lr, std::vector weights_names) + double p, size_t n_epochs, size_t max_iter, double end_lr, std::vector weights_names) : learning_rate(std::move(weights_names)), m_p(p), m_num_epochs(n_epochs), m_max_iter(max_iter), m_end_lr(end_lr), @@ -275,7 +280,8 @@ float poly_learning_rate::global_schedule(model *m) { * Compute the learning rate for the next iteration. 
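 * As used in optimizer_schedule() below, the decay factor at iteration t is
 * ((m_max_iter - t) / m_max_iter)^m_p for t < m_max_iter, i.e. (1 - t/T)^p:
 * p = 1 gives a linear decay, larger p front-loads it. (How the factor is
 * combined with the base and end learning rates is not shown in this hunk;
 * only the factor itself is computed here.)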
*/ float poly_learning_rate::optimizer_schedule(model *m, optimizer &opt) { - const uint64_t cur_iter = static_cast(m->get_step(execution_mode::training)); + const auto& c = static_cast(m->get_execution_context()); + const size_t cur_iter = c.get_step(); if (m_max_iter > cur_iter) { m_lr = static_cast(std::pow(static_cast(m_max_iter - cur_iter)/m_max_iter, m_p)); } @@ -334,7 +340,7 @@ build_drop_fixed_learning_rate_callback_from_pbuf( const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); - std::vector drop_epochs; + std::vector drop_epochs; for (int i = 0; i < params.drop_epoch_size(); ++i) { drop_epochs.push_back(params.drop_epoch(i)); } diff --git a/src/callbacks/ltfb.cpp b/src/callbacks/ltfb.cpp index 45a793700dd..96a3386de48 100644 --- a/src/callbacks/ltfb.cpp +++ b/src/callbacks/ltfb.cpp @@ -204,8 +204,9 @@ void exchange_models__checkpoint_file(lbann_comm& comm, const std::vector& local_weights) { // Checkpoint directories + const auto& c = m.get_execution_context(); const auto local_trainer = comm.get_trainer_rank(); - const auto step = m.get_step(); + const auto step = c.get_step(); const std::string send_dir = (m.get_name() + "_trainer" + std::to_string(local_trainer) + "_step" + std::to_string(step)); @@ -215,11 +216,11 @@ void exchange_models__checkpoint_file(lbann_comm& comm, // Save model checkpoint persist p; - p.set_cb_type(callback_type::batch); + p.set_cb_type(callback_type::model_only); if (comm.am_trainer_master()) { - p.open_checkpoint(send_dir.c_str()); + p.open_checkpoint(send_dir); } else { - std::strcpy(p.m_checkpoint_dir, send_dir.c_str()); + p.m_checkpoint_dir = send_dir; } m.save_to_checkpoint_shared(p); p.close_checkpoint(); @@ -234,11 +235,11 @@ void exchange_models__checkpoint_file(lbann_comm& comm, } // Load model checkpoint from partner trainer - p.set_cb_type(callback_type::batch); + p.set_cb_type(callback_type::model_only); if (comm.am_trainer_master()) { - p.open_restart(recv_dir.c_str()); + p.open_restart(recv_dir); } else { - std::strcpy(p.m_checkpoint_dir, recv_dir.c_str()); + p.m_checkpoint_dir = recv_dir; } m.load_from_checkpoint_shared(p); if (comm.am_trainer_master()) { @@ -263,19 +264,20 @@ void exchange_models__checkpoint_file(lbann_comm& comm, void restore_local_model__checkpoint_file(lbann_comm& comm, model& m) { // Checkpoint directories + const auto& c = m.get_execution_context(); const auto local_trainer = comm.get_trainer_rank(); - const auto step = m.get_step(); + const auto step = c.get_step(); const std::string checkpoint_dir = (m.get_name() + "_trainer" + std::to_string(local_trainer) + "_step" + std::to_string(step)); // Load local model checkpoint persist p; - p.set_cb_type(callback_type::batch); + p.set_cb_type(callback_type::model_only); if (comm.am_trainer_master()) { - p.open_restart(checkpoint_dir.c_str()); + p.open_restart(checkpoint_dir); } else { - std::strcpy(p.m_checkpoint_dir, checkpoint_dir.c_str()); + p.m_checkpoint_dir = checkpoint_dir; } m.load_from_checkpoint_shared(p); if (comm.am_trainer_master()) { @@ -286,9 +288,9 @@ void restore_local_model__checkpoint_file(lbann_comm& comm, model& m) { /** Get mean metric value with validation set. 
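 * The flow below: finish any background data fetch for the current mode,
 * mark the validation data store as loading, evaluate through the trainer
 * (c.get_trainer().evaluate(&m, execution_mode::validation)), read the named
 * metric's mean value, then restore the original execution mode. A
 * found_metric flag guards against a metric name that does not exist on the
 * model.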
*/ EvalType evaluate(model& m, const std::string& metric_name) { - + auto& c = m.get_execution_context(); // Make sure data readers finish asynchronous work - const auto original_mode = m.get_execution_mode(); + const auto original_mode = c.get_execution_mode(); m.collect_background_data_fetch(original_mode); // Mark the data store as loading - Note that this is a temporary fix @@ -296,7 +298,7 @@ EvalType evaluate(model& m, const std::string& metric_name) { m.mark_data_store_explicitly_loading(execution_mode::validation); // Evaluate model on validation set - m.evaluate(execution_mode::validation); + c.get_trainer().evaluate(&m, execution_mode::validation); // Get metric value bool found_metric = false; @@ -320,7 +322,7 @@ EvalType evaluate(model& m, const std::string& metric_name) { m.make_data_store_preloaded(execution_mode::validation); // Clean up and return metric value - m.set_execution_mode(original_mode); + c.set_execution_mode(original_mode); return metric_value; } @@ -412,11 +414,12 @@ void ltfb::on_train_begin(model *m) { } void ltfb::on_batch_begin(model *m) { + const auto& c = m->get_execution_context(); auto&& comm = *m->get_comm(); // Check whether to start LTFB round - const auto mode = m->get_execution_mode(); - const auto step = m->get_step(); + const auto mode = c.get_execution_mode(); + const auto step = c.get_step(); if (mode != execution_mode::training || step == 0) { return; } // Print message diff --git a/src/callbacks/mixup.cpp b/src/callbacks/mixup.cpp index 1e83cd7c709..2e759202121 100644 --- a/src/callbacks/mixup.cpp +++ b/src/callbacks/mixup.cpp @@ -42,7 +42,8 @@ void mixup::on_forward_prop_end(model *m, Layer *l) { if (!m_layers.count(l->get_name())) { return; } - if (m->get_execution_mode() != execution_mode::training) { + const auto& c = static_cast(m->get_execution_context()); + if (c.get_execution_mode() != execution_mode::training) { return; // No mixup outside of training. 
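    // (For reference, mixup in its usual formulation trains on convex
    //  combinations of sample pairs: x~ = lambda*x_i + (1-lambda)*x_j, with
    //  the labels mixed the same way and lambda drawn from Beta(alpha, alpha);
    //  that only makes sense while optimizing, hence the early return above.)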
} diff --git a/src/callbacks/monitor_io.cpp b/src/callbacks/monitor_io.cpp index d67f5aaff34..8adc43b07ba 100644 --- a/src/callbacks/monitor_io.cpp +++ b/src/callbacks/monitor_io.cpp @@ -38,6 +38,7 @@ namespace lbann { namespace callback { void monitor_io::on_epoch_end(model *m) { + const auto& c = static_cast(m->get_execution_context()); lbann_comm *comm = m->get_comm(); for (Layer *layer : m->get_layers()) { if(m_layers.size() == 0 @@ -48,13 +49,14 @@ void monitor_io::on_epoch_end(model *m) { << comm->get_rank_in_trainer() << " processed " << input->get_num_samples_trained() << " training samples of " << input->get_total_num_training_samples() << " (" - << input->get_num_samples_trained() / m->get_epoch() << " per epoch)" << std::endl; + << input->get_num_samples_trained() / c.get_epoch() << " per epoch)" << std::endl; } } } } void monitor_io::on_test_end(model *m) { + const auto& c = static_cast(m->get_execution_context()); lbann_comm *comm = m->get_comm(); for (Layer *layer : m->get_layers()) { if(m_layers.size() == 0 @@ -65,7 +67,7 @@ void monitor_io::on_test_end(model *m) { << comm->get_rank_in_trainer() << " processed " << input->get_num_samples_tested() << " test samples of " << input->get_total_num_testing_samples() << " (" - << input->get_num_samples_tested() / m->get_epoch() + << input->get_num_samples_tested() / c.get_epoch() << " per epoch)" << std::endl; } } diff --git a/src/callbacks/perturb_adam.cpp b/src/callbacks/perturb_adam.cpp index 92f76888e23..09e88a7678b 100644 --- a/src/callbacks/perturb_adam.cpp +++ b/src/callbacks/perturb_adam.cpp @@ -61,7 +61,8 @@ void perturb_adam::setup(model* m) { } void perturb_adam::on_batch_begin(model* m) { - if (m_perturb_during_training && m->get_step() > 0) { + const auto& c = m->get_execution_context(); + if (m_perturb_during_training && c.get_step() > 0) { perturb(*m); } } diff --git a/src/callbacks/print_statistics.cpp b/src/callbacks/print_statistics.cpp index 5eab1fb6d20..0622f80a014 100644 --- a/src/callbacks/print_statistics.cpp +++ b/src/callbacks/print_statistics.cpp @@ -53,6 +53,7 @@ void print_statistics::setup(model *m) { } void print_statistics::on_epoch_begin(model *m) { + const auto& c = static_cast(m->get_execution_context()); lbann_comm *comm = m->get_comm(); if (comm->am_world_master()) { @@ -67,7 +68,7 @@ void print_statistics::on_epoch_begin(model *m) { // Print message std::cout << "--------------------------------------------------------------------------------" << std::endl; - std::cout << "[" << m->get_epoch() << "] Epoch : stats formated [tr/v/te]" + std::cout << "[" << c.get_epoch() << "] Epoch : stats formated [tr/v/te]" << " iter/epoch =" << " [" << input->get_num_iterations_per_epoch(execution_mode::training) @@ -137,14 +138,15 @@ void print_statistics::on_test_end(model *m) { } void print_statistics::report_results(model *m) { + const auto& c = static_cast(m->get_execution_context()); lbann_comm *comm = m->get_comm(); // Get string for execution mode - const execution_mode mode = m->get_execution_mode(); + const execution_mode mode = c.get_execution_mode(); std::string mode_string; switch (mode) { case execution_mode::training: - mode_string = "training epoch " + std::to_string(m->get_epoch()-1); + mode_string = "training epoch " + std::to_string(c.get_epoch()-1); break; case execution_mode::validation: mode_string = "validation"; diff --git a/src/callbacks/profiler.cpp b/src/callbacks/profiler.cpp index aef9cab3205..d95cb7e05ba 100644 --- a/src/callbacks/profiler.cpp +++ b/src/callbacks/profiler.cpp @@ 
-55,56 +55,66 @@ profiler::profiler(bool sync, bool skip_init) : } void profiler::on_epoch_begin(model *m) { + const auto& c = static_cast(m->get_execution_context()); // Skip the first epoch - if (m_skip_init && m->get_epoch() == 1) { + if (m_skip_init && c.get_epoch() == 1) { prof_start(); } - prof_region_begin(("epoch " + std::to_string(m->get_epoch())).c_str(), + prof_region_begin(("epoch " + std::to_string(c.get_epoch())).c_str(), prof_colors[0], m_sync); } void profiler::on_epoch_end(model *m) { - prof_region_end(("epoch " + std::to_string(m->get_epoch())).c_str(), + const auto& c = static_cast(m->get_execution_context()); + prof_region_end(("epoch " + std::to_string(c.get_epoch())).c_str(), m_sync); } void profiler::on_validation_begin(model *m) { - prof_region_begin(("val " + std::to_string(m->get_epoch())).c_str(), + const auto& c = static_cast(m->get_execution_context()); + prof_region_begin(("val " + std::to_string(c.get_epoch())).c_str(), prof_colors[0], m_sync); } void profiler::on_validation_end(model *m) { - prof_region_end(("val " + std::to_string(m->get_epoch())).c_str(), + const auto& c = static_cast(m->get_execution_context()); + prof_region_end(("val " + std::to_string(c.get_epoch())).c_str(), m_sync); } void profiler::on_test_begin(model *m) { - prof_region_begin(("test " + std::to_string(m->get_epoch())).c_str(), + const auto& c = static_cast(m->get_execution_context()); + prof_region_begin(("test " + std::to_string(c.get_epoch())).c_str(), prof_colors[0], m_sync); } void profiler::on_test_end(model *m) { - prof_region_end(("test " + std::to_string(m->get_epoch())).c_str(), + const auto& c = static_cast(m->get_execution_context()); + prof_region_end(("test " + std::to_string(c.get_epoch())).c_str(), m_sync); } void profiler::on_batch_begin(model *m) { - prof_region_begin(("batch " + std::to_string(m->get_step(execution_mode::training))).c_str(), + const auto& c = m->get_execution_context(); + prof_region_begin(("batch " + std::to_string(c.get_step())).c_str(), prof_colors[1], m_sync); } void profiler::on_batch_end(model *m) { - prof_region_end(("batch " + std::to_string(m->get_step(execution_mode::training))).c_str(), + const auto& c = m->get_execution_context(); + prof_region_end(("batch " + std::to_string(c.get_step())).c_str(), m_sync); } void profiler::on_batch_evaluate_begin(model *m) { - prof_region_begin(("batch eval " + std::to_string(m->get_step(execution_mode::training))).c_str(), + const auto& c = m->get_execution_context(); + prof_region_begin(("batch eval " + std::to_string(c.get_step())).c_str(), prof_colors[1], m_sync); } void profiler::on_batch_evaluate_end(model *m) { - prof_region_end(("batch eval " + std::to_string(m->get_step(execution_mode::training))).c_str(), + const auto& c = m->get_execution_context(); + prof_region_end(("batch eval " + std::to_string(c.get_step())).c_str(), m_sync); } diff --git a/src/callbacks/replace_weights.cpp b/src/callbacks/replace_weights.cpp index 3137bc3d243..9d2caa51a11 100644 --- a/src/callbacks/replace_weights.cpp +++ b/src/callbacks/replace_weights.cpp @@ -48,7 +48,8 @@ void replace_weights::setup(model *m) { } void replace_weights::on_batch_end(model *m) { - const auto& step = m->get_step(execution_mode::training); + const auto& c = m->get_execution_context(); + const auto& step = c.get_step(); if(step % m_batch_interval == 0) { for(size_t i = 0; i < m_src_layers.size(); i++) { m_dst_layers[i]->replace_weights(m_src_layers[i]); diff --git a/src/callbacks/save_images.cpp b/src/callbacks/save_images.cpp index 
62e72f32f52..c6b73033f3b 100644 --- a/src/callbacks/save_images.cpp +++ b/src/callbacks/save_images.cpp @@ -144,7 +144,8 @@ save_images::save_images(std::vector layer_names, } void save_images::on_epoch_end(model *m) { - save_image(m_image_prefix + "epoch" + std::to_string(m->get_epoch()), + const auto& c = static_cast(m->get_execution_context()); + save_image(m_image_prefix + "epoch" + std::to_string(c.get_epoch()), m_image_format, m->get_layers(), m_layer_names); diff --git a/src/callbacks/save_model.cpp b/src/callbacks/save_model.cpp index d313b0fc5d1..ea2b73bf7f7 100644 --- a/src/callbacks/save_model.cpp +++ b/src/callbacks/save_model.cpp @@ -72,7 +72,7 @@ void save_model::write_proto_text(const lbann_data::Model& proto, bool save_model::do_save_model(model *m) { lbann_data::Model model_param; - p.set_cb_type(callback_type::inference); + p.set_cb_type(callback_type::model_only); do_save_model_weights(m); p.set_cb_type(callback_type::invalid); @@ -90,6 +90,7 @@ bool save_model::do_save_model(model *m) { // Save model weights bool save_model::do_save_model_weights(model *m) { + const auto& c = static_cast(m->get_execution_context()); // if the checkpoint directory is not defined, bail if (m_dir.length() == 0) { return false; @@ -100,8 +101,8 @@ bool save_model::do_save_model_weights(model *m) { lbann_comm *comm = m->get_comm(); comm->trainer_barrier(); // let user know we're saving the weights - int epoch = m->get_epoch(); - int step = m->get_step(execution_mode::training); + int epoch = c.get_epoch(); + int step = c.get_step(); if (comm->am_trainer_master()) { timer.Start(); printf("[%s.%d] Saving model weights: epoch %d step %d ...\n", m->get_name().c_str(), comm->get_trainer_rank(), epoch, step); @@ -110,7 +111,7 @@ bool save_model::do_save_model_weights(model *m) { // Shared checkpoint, logic identical to Distributed.i makedir(m_dir.c_str()); - std::string epochdir = get_shared_checkpoint_dirname(m, m_dir.c_str(), epoch, step); + std::string epochdir = get_shared_checkpoint_dirname(m, m_dir.c_str(), c.get_execution_mode(), epoch, step); if (comm->am_trainer_master()) { p.open_checkpoint(epochdir.c_str()); } @@ -121,7 +122,7 @@ bool save_model::do_save_model_weights(model *m) { p.close_checkpoint(); if (comm->am_trainer_master()) { std::string latest_file = get_last_shared_checkpoint_filename(m, m_dir.c_str()); - write_latest(latest_file, epoch, step); + write_latest(latest_file, c.get_execution_mode(), epoch, step); } uint64_t bytes_count = p.get_bytes(); @@ -146,16 +147,17 @@ bool save_model::load_model_weights(std::string ckpt_dir, model * m, bool ckptdi if(ckptdir_is_fullpath) { active_ckpt_dir = ckpt_dir; }else { - int epochLast = -1; - int stepLast = -1; + size_t epochLast = std::numeric_limits::max();; + size_t stepLast = std::numeric_limits::max();; + execution_mode mode = execution_mode::invalid; active_ckpt_dir = get_last_shared_checkpoint_filename(m, ckpt_dir); // get last epoch and step saved. 
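    // read_latest() now also recovers the execution mode saved by
    // write_latest(); without it, the mode-qualified checkpoint directory
    // name below could not be reconstructed.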
- int success = read_latest(active_ckpt_dir, &epochLast, &stepLast); + int success = read_latest(active_ckpt_dir, &mode, &epochLast, &stepLast); if(!success) { return false; } - active_ckpt_dir = get_shared_checkpoint_dirname(m, ckpt_dir, epochLast, stepLast); + active_ckpt_dir = get_shared_checkpoint_dirname(m, ckpt_dir, mode, epochLast, stepLast); } lbann_comm *comm = m->get_comm(); if(comm->am_trainer_master()) { diff --git a/src/callbacks/save_topk_models.cpp b/src/callbacks/save_topk_models.cpp index 66591080fc9..6523910cbdb 100644 --- a/src/callbacks/save_topk_models.cpp +++ b/src/callbacks/save_topk_models.cpp @@ -48,6 +48,7 @@ void save_topk_models::on_test_end(model *m) { } bool save_topk_models::am_in_topk(model *m) { + const auto& c = static_cast(m->get_execution_context()); lbann_comm *comm = m->get_comm(); const int num_trainers = comm->get_num_trainers(); std::string mode_string = "test"; @@ -56,7 +57,7 @@ bool save_topk_models::am_in_topk(model *m) { for (const auto& met : m->get_metrics()) { if (met->name() == m_metric_name) { found_metric = true; - score = met->get_mean_value(m->get_execution_mode()); + score = met->get_mean_value(c.get_execution_mode()); break; } } diff --git a/src/callbacks/summary.cpp b/src/callbacks/summary.cpp index f284a7f2f07..bf8e118cd55 100644 --- a/src/callbacks/summary.cpp +++ b/src/callbacks/summary.cpp @@ -51,14 +51,15 @@ void summary::on_train_begin(model *m) { } void summary::on_batch_end(model *m) { - if(!m_summarizer){ LBANN_ERROR("Summary callback failed: m_summarizer does not exist."); } + const auto& c = m->get_execution_context(); + prof_region_begin("summary-batch", prof_colors[0], false); m->summarize_stats(*m_summarizer); - if (m_mat_interval > 0 && m->get_step(execution_mode::training) % m_mat_interval == 0) { + if (m_mat_interval > 0 && c.get_step() % m_mat_interval == 0) { m->summarize_matrices(*m_summarizer); } lbann_comm *comm = m->get_comm(); @@ -68,15 +69,15 @@ void summary::on_batch_end(model *m) { size_t intertrainer_barriers = comm->get_num_intertrainer_barriers(); size_t global_barriers = comm->get_num_global_barriers(); comm->reset_stats_counters(); - m_summarizer->sum_reduce_scalar("bytes_sent", bytes_sent, m->get_step(execution_mode::training)); + m_summarizer->sum_reduce_scalar("bytes_sent", bytes_sent, c.get_step()); m_summarizer->sum_reduce_scalar("bytes_received", bytes_received, - m->get_step(execution_mode::training)); + c.get_step()); m_summarizer->reduce_scalar("trainer_barriers", trainer_barriers, - m->get_step(execution_mode::training)); + c.get_step()); m_summarizer->reduce_scalar("intertrainer_barriers", intertrainer_barriers, - m->get_step(execution_mode::training)); + c.get_step()); m_summarizer->reduce_scalar("global_barriers", global_barriers, - m->get_step(execution_mode::training)); + c.get_step()); prof_region_end("summary-batch", false); } @@ -85,15 +86,16 @@ void summary::on_epoch_end(model *m) { LBANN_ERROR("Summary callback failed: m_summarizer does not exist."); } + const auto& c = m->get_execution_context(); prof_region_begin("summary-epoch", prof_colors[0], false); for (const auto& met : m->get_metrics()) { - EvalType train_score = met->get_mean_value(m->get_execution_mode()); + EvalType train_score = met->get_mean_value(c.get_execution_mode()); // Replace spaces with _ for consistency. std::string metric_name = met->name(); std::transform(metric_name.begin(), metric_name.end(), metric_name.begin(), - [] (char c) { return c == ' ' ? '_' : c; }); + [] (char c_) { return c_ == ' ' ? 
'_' : c_; }); std::string phase = "train_" + metric_name; - m_summarizer->reduce_scalar(phase, train_score, m->get_step(execution_mode::training)); + m_summarizer->reduce_scalar(phase, train_score, c.get_step()); } save_histograms(m); m_summarizer->flush(); @@ -105,16 +107,17 @@ void summary::on_test_end(model *m) { if(!m_summarizer){ LBANN_ERROR("Summary callback failed: m_summarizer does not exist."); } + const auto& c = m->get_execution_context(); prof_region_begin("summary-test", prof_colors[0], false); lbann_comm *comm = m->get_comm(); for (auto&& met : m->get_metrics()) { - EvalType test_score = met->get_mean_value(m->get_execution_mode()); + EvalType test_score = met->get_mean_value(c.get_execution_mode()); // Replace spaces with _ for consistency. std::string metric_name = met->name(); std::transform(metric_name.begin(), metric_name.end(), metric_name.begin(), - [] (char c) { return c == ' ' ? '_' : c; }); + [] (char c_) { return c_ == ' ' ? '_' : c_; }); std::string phase = "test_" + metric_name; - m_summarizer->reduce_scalar(phase, test_score, m->get_step(execution_mode::training)); + m_summarizer->reduce_scalar(phase, test_score, c.get_step()); } // Reset counters incremented during test phase. comm->reset_stats_counters(); @@ -128,13 +131,14 @@ void summary::save_histograms(model *m) { if(!m_summarizer){ LBANN_ERROR("Summary callback failed: m_summarizer does not exist."); } + const auto& c = m->get_execution_context(); for (const auto& layer : m->get_layers()) { const std::string prefix = layer->get_name() + "/"; for (int i = 0; i < layer->get_num_children(); ++i) { AbsDistMatReadProxy acts(layer->get_activations(i)); m_summarizer->reduce_histogram(prefix + "activations" + std::to_string(i), acts.GetLocked(), - m->get_step(execution_mode::training)); + c.get_step()); } } for (const auto& w : m->get_weights()) { @@ -142,13 +146,13 @@ void summary::save_histograms(model *m) { AbsDistMatReadProxy weights(w->get_values()); m_summarizer->reduce_histogram(prefix + "weights", weights.GetLocked(), - m->get_step(execution_mode::training)); + c.get_step()); optimizer *opt = w->get_optimizer(); if (opt != nullptr) { AbsDistMatReadProxy gradients(opt->get_gradient()); m_summarizer->reduce_histogram(prefix + "weights_gradient", gradients.GetLocked(), - m->get_step(execution_mode::training)); + c.get_step()); } } } diff --git a/src/callbacks/timer.cpp b/src/callbacks/timer.cpp index e98b37aa6f0..755c980553d 100644 --- a/src/callbacks/timer.cpp +++ b/src/callbacks/timer.cpp @@ -32,31 +32,35 @@ namespace lbann { namespace callback { void timer::batch_timing_begin(const model& m) { - const auto& mode = m.get_execution_mode(); + const auto& c = m.get_execution_context(); + const auto& mode = c.get_execution_mode(); m_batch_start_times[mode] = get_time(); } void timer::batch_timing_end(const model& m) { - const auto& mode = m.get_execution_mode(); + const auto& c = m.get_execution_context(); + const auto& mode = c.get_execution_mode(); const auto& batch_time = get_time() - m_batch_start_times[mode]; m_batch_times[mode].push_back(batch_time); - if (m_summarizer) { - m_summarizer->reduce_scalar("minibatch_time", batch_time, m.get_step(execution_mode::training)-1); - m_summarizer->reduce_scalar_all("minibatch_time", batch_time, m.get_step(execution_mode::training)-1); + if (m_summarizer != nullptr) { + m_summarizer->reduce_scalar("minibatch_time", batch_time, c.get_step()-1); + m_summarizer->reduce_scalar_all("minibatch_time", batch_time, c.get_step()-1); } } void timer::timing_begin(const model& 
m) { - const auto& mode = m.get_execution_mode(); + const auto& c = m.get_execution_context(); + const auto& mode = c.get_execution_mode(); m_start_times[mode] = get_time(); m_batch_times[mode].clear(); } void timer::timing_end(model& m) { + const auto& c = static_cast(m.get_execution_context()); constexpr EvalType zero = 0; // Get run time - const auto& mode = m.get_execution_mode(); + const auto& mode = c.get_execution_mode(); const auto& run_time = get_time() - m_start_times[mode]; // Compute minibatch statistics @@ -77,8 +81,8 @@ void timer::timing_end(model& m) { } if (num_batches > 1) { batch_time_stdev = zero; - for (const auto& t : batch_times) { - const auto& diff = t - batch_time_mean; + for (const auto& bt : batch_times) { + const auto& diff = bt - batch_time_mean; batch_time_stdev += diff * diff; } batch_time_stdev /= num_batches - 1; @@ -89,7 +93,7 @@ void timer::timing_end(model& m) { std::string mode_string; switch(mode) { case execution_mode::training: - mode_string = "training epoch " + std::to_string(m.get_epoch()-1); + mode_string = "training epoch " + std::to_string(c.get_epoch()-1); break; case execution_mode::validation: mode_string = "validation"; diff --git a/src/callbacks/variable_minibatch.cpp b/src/callbacks/variable_minibatch.cpp index 96d84669ee1..57d486079d8 100644 --- a/src/callbacks/variable_minibatch.cpp +++ b/src/callbacks/variable_minibatch.cpp @@ -40,12 +40,13 @@ namespace lbann { namespace callback { variable_minibatch::variable_minibatch( - int starting_mbsize) : m_starting_mbsize(starting_mbsize), + size_t starting_mbsize) : m_starting_mbsize(starting_mbsize), m_current_mini_batch_size(starting_mbsize) {} void variable_minibatch::on_train_begin(model *m) { // Avoid issues with the train method being called multiple times. - if (m->get_epoch() != 0) { return; } + const auto& c = static_cast(m->get_execution_context()); + if (c.get_epoch() != 0) { return; } // Get first input layer in model generic_input_layer* input = nullptr; @@ -70,6 +71,7 @@ void variable_minibatch::on_train_begin(model *m) { } void variable_minibatch::on_epoch_end(model *m) { + const auto& c = static_cast(m->get_execution_context()); // Get first input layer in model generic_input_layer* input = nullptr; @@ -80,9 +82,9 @@ void variable_minibatch::on_epoch_end(model *m) { if (input == nullptr) { LBANN_ERROR("could not get input layer"); } lbann_comm *comm = m->get_comm(); - int new_mbsize = 0; + size_t new_mbsize = 0; float new_lr = 0.0f; - int ramp_time = 0; + size_t ramp_time = 0; if (schedule(m, new_mbsize, new_lr, ramp_time)) { if (new_mbsize > m->get_max_mini_batch_size()) { if (comm->am_trainer_master()) { @@ -109,12 +111,12 @@ void variable_minibatch::on_epoch_end(model *m) { std::cout << "Model " << comm->get_trainer_rank() << ": Changing mini-batch size to " << new_mbsize << " and learning rate to " << new_lr << " at epoch " << - m->get_epoch() << std::endl; + c.get_epoch() << std::endl; } } else if (comm->am_trainer_master()) { std::cout << "Model " << comm->get_trainer_rank() << ": Changing mini-batch size to " << new_mbsize << - " at epoch " << m->get_epoch() << std::endl; + " at epoch " << c.get_epoch() << std::endl; } } // Ramp the learning rate, if needed. 
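The schedule() hook that on_epoch_end() consumes above is easiest to see in isolation: step_minibatch (next hunk) doubles the mini-batch size and learning rate every m_step epochs and asks for a ramp window for the new learning rate. A self-contained sketch of that policy follows; the struct and member names are illustrative, not LBANN's API.

    #include <cstddef>

    struct step_policy {
      std::size_t step;      // double every `step` epochs
      std::size_t ramp_time; // epochs over which to ramp in the new learning rate

      // Mirrors the schedule() contract: return true when a change fires at
      // `epoch` and fill in the requested mini-batch size, LR, and ramp.
      bool at_epoch(std::size_t epoch, std::size_t cur_mbsize, float cur_lr,
                    std::size_t& new_mbsize, float& new_lr,
                    std::size_t& ramp) const {
        if (step == 0 || epoch % step != 0) { return false; }
        new_mbsize = cur_mbsize * 2;
        new_lr = cur_lr * 2.0f;
        ramp = ramp_time;
        return true;
      }
    };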
@@ -152,13 +154,14 @@ float variable_minibatch::get_current_learning_rate(
 }
 
 step_minibatch::step_minibatch(
-  int starting_mbsize, int step, int ramp_time) :
+  size_t starting_mbsize, size_t step, size_t ramp_time) :
   variable_minibatch(starting_mbsize), m_step(step), m_ramp_time(ramp_time) {}
 
 bool step_minibatch::schedule(
-  model *m, int& new_mbsize, float& new_lr, int& ramp_time) {
-  if (m->get_epoch() % m_step == 0) {
+  model *m, size_t& new_mbsize, float& new_lr, size_t& ramp_time) {
+  const auto& c = static_cast<const sgd_execution_context&>(m->get_execution_context());
+  if (c.get_epoch() % m_step == 0) {
     new_mbsize = m_current_mini_batch_size * 2;
     new_lr = get_current_learning_rate(m) * 2;
     ramp_time = m_ramp_time;
@@ -169,7 +172,7 @@ bool step_minibatch::schedule(
 }
 
 minibatch_schedule::minibatch_schedule(
-  int starting_mbsize, std::vector<minibatch_step> steps) :
+  size_t starting_mbsize, std::vector<minibatch_step> steps) :
   variable_minibatch(starting_mbsize), m_steps(std::move(steps)) {
   std::sort(m_steps.rbegin(), m_steps.rend(),
             [] (const minibatch_step& a, const minibatch_step& b) {
@@ -178,8 +181,9 @@ minibatch_schedule::minibatch_schedule(
 }
 
 bool minibatch_schedule::schedule(
-  model *m, int& new_mbsize, float& new_lr, int& ramp_time) {
-  if (!m_steps.empty() && m->get_epoch() == m_steps.back().epoch) {
+  model *m, size_t& new_mbsize, float& new_lr, size_t& ramp_time) {
+  const auto& c = static_cast<const sgd_execution_context&>(m->get_execution_context());
+  if (!m_steps.empty() && c.get_epoch() == m_steps.back().epoch) {
     new_mbsize = m_steps.back().mbsize;
     new_lr = m_steps.back().lr;
     ramp_time = m_steps.back().ramp_time;
diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp
index 66294f21417..f2492ec2124 100644
--- a/src/data_readers/data_reader.cpp
+++ b/src/data_readers/data_reader.cpp
@@ -32,6 +32,10 @@
 #include "lbann/models/model.hpp"
 #include
 #include
+#include "lbann/io/persist.hpp"
+#include "lbann/execution_contexts/sgd_execution_context.hpp"
+#include
+#include
 
 namespace lbann {
 
@@ -50,7 +54,7 @@ void generic_data_reader::shuffle_indices(rng_gen& gen) {
   }
 }
 
-void generic_data_reader::setup(int num_io_threads, std::shared_ptr<thread_pool> io_thread_pool) {
+void generic_data_reader::setup(int num_io_threads, observer_ptr<thread_pool> io_thread_pool) {
   m_base_offset = 0;
   m_sample_stride = 1;
   m_stride_to_next_mini_batch = 0;
@@ -494,7 +498,7 @@ double generic_data_reader::get_percent_to_use() {
   size_t count = get_absolute_sample_count();
   double use_percent = get_use_percent();
   double r = 0.;
-
+
   if (count != 0) {
     r = count / get_num_data();
   }
@@ -525,7 +529,7 @@ void generic_data_reader::select_subset_of_data() {
     return ;
   }
 
-  long unused = get_validation_percent()*get_num_data();
+  long unused = get_validation_percent()*get_num_data();
   long use_me = get_num_data() - unused;
   if (unused > 0) {
     m_unused_indices=std::vector<int>(m_shuffled_indices.begin() + use_me, m_shuffled_indices.end());
@@ -550,39 +554,28 @@ void generic_data_reader::use_unused_index_set() {
 }
 
 /** \brief Given directory to store checkpoint files, write state to file and add to number of bytes written */
-bool generic_data_reader::save_to_checkpoint_shared(persist& p, const char *name) {
-  // rank 0 writes the training state file
-  if (m_comm->am_trainer_master()) {
-    pack_scalars(p,name);
+bool generic_data_reader::save_to_checkpoint_shared(persist& p, execution_mode mode) {
+  if (get_comm()->am_trainer_master()) {
+    write_cereal_archive(*this, p, mode, "_dr.xml");
   }
   return true;
 }
 
 /** \brief Given directory to store checkpoint files, read state from file and add to number of bytes read */
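// [editor's note] The checkpoint plumbing below swaps the hand-rolled
// pack_scalars/unpack_scalars pairs for Cereal XML archives. The shape of
// that pattern, sketched with plain Cereal; the state struct, field names,
// and file name are illustrative assumptions, not the reader's real state:
//
//   #include <cereal/archives/xml.hpp>
//   #include <cereal/types/vector.hpp>
//   #include <fstream>
//   #include <string>
//   #include <vector>
//
//   struct reader_state {
//     int current_pos = 0;
//     std::vector<int> shuffled_indices;
//     template <class Archive>
//     void serialize(Archive& ar) {
//       ar(CEREAL_NVP(current_pos), CEREAL_NVP(shuffled_indices));
//     }
//   };
//
//   void save_state(const reader_state& s, const std::string& path) {
//     std::ofstream os(path);           // e.g. "<ckpt_dir>/train_dr.xml"
//     cereal::XMLOutputArchive ar(os);  // symmetric XMLInputArchive on load
//     ar(s);                            // one NVP-tagged record per field
//   }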
-bool lbann::generic_data_reader::load_from_checkpoint_shared(persist& p, const char *name) {
-  // rank 0 reads the training state file
-  struct packing_header header;
-  if (m_comm->am_trainer_master()) {
-    unpack_scalars(p,&header,name);
-  }
-  m_comm->trainer_broadcast(0, header);
-  unpack_header(header);
-
-  m_comm->trainer_broadcast(0, m_shuffled_indices);
-
+bool lbann::generic_data_reader::load_from_checkpoint_shared(persist& p, execution_mode mode) {
+  load_from_shared_cereal_archive(*this, p, mode, *get_comm(), "_dr.xml");
   // Adjust current position to deal with fact that it was just loaded to all ranks from rank 0 (differs by rank #)
   m_current_pos += m_comm->get_rank_in_trainer();
   return true;
 }
 
-bool generic_data_reader::save_to_checkpoint_distributed(persist& p, const char *name) {
-  pack_scalars(p,name);
+bool generic_data_reader::save_to_checkpoint_distributed(persist& p, execution_mode mode) {
+  write_cereal_archive(*this, p, mode, "_dr.xml");
   return true;
 }
 
-bool lbann::generic_data_reader::load_from_checkpoint_distributed(persist& p, const char *name) {
-  struct packing_header header;
-  unpack_scalars(p,&header,name);
+bool lbann::generic_data_reader::load_from_checkpoint_distributed(persist& p, execution_mode mode) {
+  read_cereal_archive(*this, p, mode, "_dr.xml");
   return true;
 }
 
@@ -752,29 +745,31 @@ void generic_data_reader::setup_data_store(int mini_batch_size) {
 }
 
 bool generic_data_reader::data_store_active() const {
+  const auto& c = static_cast<const sgd_execution_context&>(m_model->get_execution_context());
   if (m_data_store != nullptr && m_data_store->is_preloaded()) {
     return true;
   }
   /// Use the data store for all modes except testing
   /// i.e. training, validation, tournament
   return (m_data_store != nullptr
-          && (((m_model->get_execution_mode() == execution_mode::training)
-               && m_model->get_epoch() > 0)
-              || ((m_model->get_execution_mode() == execution_mode::validation)
-                  && m_model->get_epoch() > 1)));
+          && (((c.get_execution_mode() == execution_mode::training)
+               && c.get_epoch() > 0)
+              || ((c.get_execution_mode() == execution_mode::validation)
+                  && c.get_epoch() > 1)));
 }
 
 bool generic_data_reader::priming_data_store() const {
+  const auto& c = static_cast<const sgd_execution_context&>(m_model->get_execution_context());
   if (m_data_store != nullptr && m_data_store->is_preloaded()) {
     return false;
   }
   /// Use the data store for all modes except testing
   /// i.e.
training, validation, tournament
   return (m_data_store != nullptr
-          && (((m_model->get_execution_mode() == execution_mode::training)
-               && m_model->get_epoch() == 0)
-              || ((m_model->get_execution_mode() == execution_mode::validation)
-                  && m_model->get_epoch() == 1)
+          && (((c.get_execution_mode() == execution_mode::training)
+               && c.get_epoch() == 0)
+              || ((c.get_execution_mode() == execution_mode::validation)
+                  && c.get_epoch() == 1)
               || m_data_store->is_explicitly_loading()));
 }
 
diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp
index 6f3d69af019..940cd46792d 100644
--- a/src/data_readers/data_reader_image.cpp
+++ b/src/data_readers/data_reader_image.cpp
@@ -159,6 +159,8 @@ void image_data_reader::load() {
   }
   fclose(fplist);
 
+  // TODO: this will probably need to change after sample_list class
+  // is modified
   // reset indices
   m_shuffled_indices.clear();
   m_shuffled_indices.resize(m_image_list.size());
@@ -219,10 +221,10 @@ void image_data_reader::preload_data_store() {
 
   if (is_master()) {
     std::cout << "image_data_reader::preload_data_store time: " << (get_time() - tm1) << "\n";
-  }
+  }
 }
 
-void image_data_reader::setup(int num_io_threads, std::shared_ptr<thread_pool> io_thread_pool) {
+void image_data_reader::setup(int num_io_threads, observer_ptr<thread_pool> io_thread_pool) {
   generic_data_reader::setup(num_io_threads, io_thread_pool);
   m_transform_pipeline.set_expected_out_dims(
     {static_cast<size_t>(m_image_num_channels),
diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp
index 687d8f27cf8..866ba62ed28 100644
--- a/src/data_readers/data_reader_jag_conduit.cpp
+++ b/src/data_readers/data_reader_jag_conduit.cpp
@@ -29,6 +29,7 @@
 #include "lbann/io/data_buffers/partitioned_io_buffer.hpp"
 #include "lbann/data_store/data_store_conduit.hpp"
 #include "lbann/models/model.hpp"
+#include "lbann/execution_contexts/sgd_execution_context.hpp"
 #include "lbann/utils/lbann_library.hpp"
 #include "lbann/utils/image.hpp"
 #include "lbann/utils/opencv.hpp"
@@ -263,7 +264,7 @@ void data_reader_jag_conduit::set_defaults() {
   m_list_per_model = false;
 }
 
-void data_reader_jag_conduit::setup(int num_io_threads, std::shared_ptr<thread_pool> io_thread_pool) {
+void data_reader_jag_conduit::setup(int num_io_threads, observer_ptr<thread_pool> io_thread_pool) {
   generic_data_reader::setup(num_io_threads, io_thread_pool);
 }
 
@@ -1465,20 +1466,21 @@ bool data_reader_jag_conduit::fetch_datum(CPUMat& X, int data_id, int mb_idx) {
 }
 
 bool data_reader_jag_conduit::fetch_response(CPUMat& X, int data_id, int mb_idx) {
+  const auto& c = static_cast<const sgd_execution_context&>(m_model->get_execution_context());
   int tid = m_io_thread_pool->get_local_thread_id();
   std::vector<size_t> sizes = get_linearized_response_sizes();
   std::vector<CPUMat> X_v = create_datum_views(X, sizes, mb_idx);
   bool ok = true;
   // Create a node to hold all of the data
   conduit::Node node;
-  if (m_data_store != nullptr && m_model->get_epoch() > 0) {
+  if (m_data_store != nullptr && c.get_epoch() > 0) {
     const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id);
     node.set_external(ds_node);
   }
   for(size_t i = 0u; ok && (i < X_v.size()); ++i) {
     ok = fetch(X_v[i], data_id, node, 0, tid, m_dependent[i], "response");
   }
-  if (m_data_store != nullptr && m_model->get_epoch() == 0) {
+  if (m_data_store != nullptr && c.get_epoch() == 0) {
     // Once the node has been populated save it in the data store
     if (m_data_store != nullptr) {
       m_data_store->set_conduit_node(data_id, node);
diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp
b/src/data_readers/data_reader_numpy_npz_conduit.cpp index 013467bfd8a..8eb72d4849b 100644 --- a/src/data_readers/data_reader_numpy_npz_conduit.cpp +++ b/src/data_readers/data_reader_numpy_npz_conduit.cpp @@ -214,7 +214,8 @@ bool numpy_npz_conduit_reader::fetch_datum(Mat& X, int data_id, int mb_idx) { numpy_conduit_converter::load_conduit_node(m_filenames[data_id], data_id, node); //note: if testing, and test set is touched more than once, the following // will through an exception TODO: relook later - if (priming_data_store() || m_model->get_execution_mode() == execution_mode::testing) { + const auto& c = static_cast(m_model->get_execution_context()); + if (priming_data_store() || c.get_execution_mode() == execution_mode::testing) { m_data_store->set_conduit_node(data_id, node); } } diff --git a/src/data_readers/data_reader_python.cpp b/src/data_readers/data_reader_python.cpp index 2086cbc5757..0ef83e41f57 100644 --- a/src/data_readers/data_reader_python.cpp +++ b/src/data_readers/data_reader_python.cpp @@ -159,7 +159,7 @@ bool python_reader::fetch_label(CPUMat& Y, int data_id, int col) { } void python_reader::setup(int num_io_threads, - std::shared_ptr io_thread_pool) { + observer_ptr io_thread_pool) { generic_data_reader::setup(num_io_threads, io_thread_pool); // Acquire Python GIL diff --git a/src/execution_contexts/CMakeLists.txt b/src/execution_contexts/CMakeLists.txt new file mode 100644 index 00000000000..14b452d3b93 --- /dev/null +++ b/src/execution_contexts/CMakeLists.txt @@ -0,0 +1,8 @@ +# Add the source files for this directory +set_full_path(THIS_DIR_SOURCES + execution_context.cpp + sgd_execution_context.cpp + ) + +# Propagate the files up the tree +set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) diff --git a/src/execution_contexts/execution_context.cpp b/src/execution_contexts/execution_context.cpp new file mode 100644 index 00000000000..d081f13b973 --- /dev/null +++ b/src/execution_contexts/execution_context.cpp @@ -0,0 +1,100 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
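// [editor's note] execution_context.cpp is the keystone of this patch series:
// mode, step, and termination state move off the model and into a context
// object owned by the training algorithm, which is why every callback hunk
// above now asks the context rather than the model. A reduced sketch of the
// state being centralized; member names are illustrative, not the full class:
//
//   class execution_context_sketch {
//   public:
//     execution_mode get_execution_mode() const noexcept { return m_mode; }
//     size_t get_step() const noexcept { return m_step; }
//     void inc_step() noexcept { ++m_step; }
//     bool get_terminate_training() const noexcept { return m_terminate; }
//     void set_terminate_training(bool f) noexcept { m_terminate = f; }
//   private:
//     execution_mode m_mode = execution_mode::invalid;
//     size_t m_step = 0;        // optimizer steps taken under this mode
//     bool m_terminate = false; // cooperative early-exit flag
//   };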
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/training_algorithms/training_algorithm.hpp" +#include "lbann/trainers/trainer.hpp" +#include "lbann/callbacks/callback.hpp" +#include "lbann/io/persist.hpp" +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +//#include + +namespace lbann { + +//****************************************************************************** +// Execution context +//****************************************************************************** + +execution_context::execution_context(observer_ptr trainer, lbann_comm *comm, execution_mode mode) + : m_trainer(trainer), + m_comm(comm), + m_execution_mode(mode), + m_terminate_training(false) {} + +//////////////////////////////////////////////////////////// +// Training_Algorithm state +//////////////////////////////////////////////////////////// + +// observer_ptr training_algorithm::get_io_thread_pool() { +// return m_trainer->get_io_thread_pool(); +// } + +thread_pool& execution_context::get_io_thread_pool() const { + return m_trainer->get_io_thread_pool(); +} + +/** Are background I/O activities enabled by the input layers */ +bool execution_context::background_io_activity_allowed() { + return m_trainer->background_io_activity_allowed(); +} + +//////////////////////////////////////////////////////////// +// Checkpointing +//////////////////////////////////////////////////////////// + +void execution_context::save_to_checkpoint_shared(persist& p) { + if (get_comm().am_trainer_master()) { + write_cereal_archive(*this, p, get_execution_mode(), "_ctx.xml"); + } + return; +} + +void execution_context::load_from_checkpoint_shared(persist& p) { + load_from_shared_cereal_archive(*this, p, get_execution_mode(), get_comm(), "_ctx.xml"); + return; +} + +void execution_context::save_to_checkpoint_distributed(persist& p){ + write_cereal_archive(*this, p, get_execution_mode(), "_ctx.xml"); + return; +} + +void execution_context::load_from_checkpoint_distributed(persist& p){ + read_cereal_archive(*this, p, get_execution_mode(), "_ctx.xml"); + return; +} + +} // namespace lbann diff --git a/src/execution_contexts/sgd_execution_context.cpp b/src/execution_contexts/sgd_execution_context.cpp new file mode 100644 index 00000000000..049133d0908 --- /dev/null +++ b/src/execution_contexts/sgd_execution_context.cpp @@ -0,0 +1,68 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
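// [editor's note] sgd_execution_context carries both a current and an
// effective mini-batch size, and the distinction drives the layer hunks
// further below: bp_compute() scales accumulated gradients by
// 1/effective_mini_batch_size, where the effective size can exceed the
// per-step size, for example when several trainers average gradients.
// A toy numeric illustration; all names and numbers here are assumed:
//
//   const size_t current_mbsize   = 64;  // samples this rank sees per step
//   const size_t num_trainers     = 4;
//   const size_t effective_mbsize = current_mbsize * num_trainers;  // 256
//   // opt->add_to_gradient(dW, DataType{1} / effective_mbsize, true);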
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/execution_contexts/sgd_execution_context.hpp" +#include +#include +#include +#include + +namespace lbann { + +sgd_execution_context::sgd_execution_context(observer_ptr trainer, lbann_comm *comm, + execution_mode mode, + size_t mini_batch_size) + : execution_context(trainer, comm, mode), + m_current_mini_batch_size(mini_batch_size), + m_effective_mini_batch_size(mini_batch_size) {} + +//////////////////////////////////////////////////////////// +// Checkpointing +//////////////////////////////////////////////////////////// + +void sgd_execution_context::save_to_checkpoint_shared(persist& p) { + if (get_comm().am_trainer_master()) { + write_cereal_archive(*this, p, get_execution_mode(), "_ctx.xml"); + } + return; +} + +void sgd_execution_context::load_from_checkpoint_shared(persist& p) { + load_from_shared_cereal_archive(*this, p, get_execution_mode(), get_comm(), "_ctx.xml"); + return; +} + +void sgd_execution_context::save_to_checkpoint_distributed(persist& p) { + write_cereal_archive(*this, p, get_execution_mode(), "_ctx.xml"); + return; +} + +void sgd_execution_context::load_from_checkpoint_distributed(persist& p) { + read_cereal_archive(*this, p, get_execution_mode(), "_ctx.xml"); + return; +} + +} // namespace lbann diff --git a/src/io/persist.cpp b/src/io/persist.cpp index 87513600bcd..5e423027603 100644 --- a/src/io/persist.cpp +++ b/src/io/persist.cpp @@ -72,9 +72,7 @@ bool lbann::persist::write_rank_distmat(persist_type type, const char *name, con } else if (type == persist_type::model) { filename += std::string("/model_") + name; } else { - std::stringstream err; - err << "invalid persist_type (" << static_cast(type) << ")"; - LBANN_ERROR(err.str()); + LBANN_ERROR("invalid persist_type (", static_cast(type), ")"); } // skip all of this if matrix is not held on rank const El::Int localHeight = M.LocalHeight(); @@ -99,7 +97,7 @@ bool lbann::persist::write_rank_distmat(persist_type type, const char *name, con if (write_rc != sizeof(header)) { // error! } - m_bytes += write_rc; + m_bytes[type] += write_rc; // now write the data for our part of the distributed matrix const El::Int lDim = M.LDim(); @@ -112,7 +110,7 @@ bool lbann::persist::write_rank_distmat(persist_type type, const char *name, con if (write_rc != bufsize) { // error! } - m_bytes += write_rc; + m_bytes[type] += write_rc; } else { // TODO: if this padding is small, may not be a big deal to write it out anyway // we've got some padding along the first dimension @@ -124,7 +122,7 @@ bool lbann::persist::write_rank_distmat(persist_type type, const char *name, con if (write_rc != bufsize) { // error! 
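// [editor's note] The recurring change in persist.cpp: the single m_bytes
// counter and the fixed model/train/validate descriptors become containers
// keyed by persist_type and iterated with persist_type_iterator(). The shape
// of that bookkeeping, reduced to a sketch; enum values and names are
// illustrative, not the real persist_type set:
//
//   #include <cstdint>
//   #include <map>
//
//   enum class ptype { train, model, validate };
//   std::map<ptype, std::uint64_t> bytes_written;  // per-stream byte counts
//   std::map<ptype, int> fds;                      // per-stream descriptors
//
//   // bytes_written[type] += write_rc;  // replaces: m_bytes += write_rc;
//   // int fd = fds.at(type);            // replaces the if/else fd chain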
} - m_bytes += write_rc; + m_bytes[type] += write_rc; } } return true; @@ -133,8 +131,6 @@ bool lbann::persist::write_rank_distmat(persist_type type, const char *name, con /** \brief Given an open file descriptor, file name, and a matrix, read the matrix * from the file descriptor, return the number of bytes read */ bool lbann::persist::read_rank_distmat(persist_type type, const char *name, AbsDistMat& M) { - std::stringstream err; - // read in the header std::string filename = m_checkpoint_dir; if (type == persist_type::train) { @@ -142,8 +138,7 @@ bool lbann::persist::read_rank_distmat(persist_type type, const char *name, AbsD } else if (type == persist_type::model) { filename += std::string("/model_") + name; } else { - err << "invalid persist_type (" << static_cast(type) << ")"; - LBANN_ERROR(err.str()); + LBANN_ERROR("invalid persist_type (", static_cast(type), ")"); } int fd = openread(filename.c_str()); // file does not exist. we will try to grab matrix from rank 0 @@ -152,13 +147,10 @@ bool lbann::persist::read_rank_distmat(persist_type type, const char *name, AbsD struct layer_header header; ssize_t read_rc = read(fd, &header, sizeof(header)); if (read_rc != sizeof(header)) { - err << "failed to read layer header from file " - << "(attempted to read " << sizeof(header) << " bytes " - << "from " << filename << ", " - << "but got " << read_rc << " bytes)"; - LBANN_ERROR(err.str()); + LBANN_ERROR("failed to read layer header from file (attempted to read ", + sizeof(header), " bytes from ", filename, ", but got ", read_rc, " bytes)"); } - m_bytes += read_rc; + m_bytes[type] += read_rc; // resize our global matrix El::Int height = header.height; @@ -173,26 +165,20 @@ bool lbann::persist::read_rank_distmat(persist_type type, const char *name, AbsD El::Int bufsize = localheight * localwidth * sizeof(DataType); read_rc = read(fd, buf, bufsize); if (read_rc != bufsize) { - err << "failed to read layer data from file " - << "(attempted to read " << bufsize << " bytes " - << "from " << filename << ", " - << "but got " << read_rc << " bytes)"; - LBANN_ERROR(err.str()); + LBANN_ERROR("failed to read layer data from file (attempted to read ", bufsize, + " bytes from ", filename, ", but got ", read_rc, " bytes)"); } - m_bytes += read_rc; + m_bytes[type] += read_rc; } else { for(El::Int j = 0; j < localwidth; ++j) { auto *buf = (void *) M.Buffer(0, j); El::Int bufsize = localheight * sizeof(DataType); read_rc = read(fd, buf, bufsize); if (read_rc != bufsize) { - err << "failed to read layer data from file " - << "(attempted to read " << bufsize << " bytes " - << "from " << filename << ", " - << "but got " << read_rc << " bytes)"; - LBANN_ERROR(err.str()); + LBANN_ERROR("failed to read layer data from file (attempted to read ", + bufsize, " bytes from ", filename, ", but got ", read_rc, " bytes)"); } - m_bytes += read_rc; + m_bytes[type] += read_rc; } } } else { @@ -202,26 +188,20 @@ bool lbann::persist::read_rank_distmat(persist_type type, const char *name, AbsD El::Int bufsize = localheight * localwidth * sizeof(DataType); read_rc = read(fd, buf, bufsize); if (read_rc != bufsize) { - err << "failed to read layer data from file " - << "(attempted to read " << bufsize << " bytes " - << "from " << filename << ", " - << "but got " << read_rc << " bytes)"; - LBANN_ERROR(err.str()); + LBANN_ERROR("failed to read layer data from file (attempted to read ", + bufsize, " bytes from ", filename, ", but got ", read_rc, " bytes)"); } - m_bytes += read_rc; + m_bytes[type] += read_rc; } else { for(El::Int jLoc 
= 0; jLoc < localwidth; ++jLoc) { auto *buf = (void *) M.Buffer(0, jLoc); El::Int bufsize = localheight * sizeof(DataType); read_rc = read(fd, buf, bufsize); if (read_rc != bufsize) { - err << "failed to read layer data from file " - << "(attempted to read " << bufsize << " bytes " - << "from " << filename << ", " - << "but got " << read_rc << " bytes)"; - LBANN_ERROR(err.str()); + LBANN_ERROR("failed to read layer data from file (attempted to read ", + bufsize, " bytes from ", filename, ", but got ", read_rc, " bytes)"); } - m_bytes += read_rc; + m_bytes[type] += read_rc; } } } @@ -233,110 +213,83 @@ bool lbann::persist::read_rank_distmat(persist_type type, const char *name, AbsD ****************************************************/ lbann::persist::persist() { - // initialize number of bytes written - m_bytes = 0; - - // initialize file descriptors - m_model_fd = -1; - m_train_fd = -1; - m_validate_fd = -1; + for(persist_type pt : persist_type_iterator()) { + // initialize number of bytes written + m_bytes[pt] = 0; + // initialize file descriptors + m_FDs[pt] = -1; + m_filenames[pt] = ""; + } } -void lbann::persist::open_checkpoint(const char *dir) { +/** @todo BVE FIXME this should be refactored to only open the + checkpoints files that we care about */ +void lbann::persist::open_checkpoint(const std::string& dir) { // create directory for checkpoint - lbann::makedir(dir); + lbann::makedir(dir.c_str()); // copy checkpoint directory - strcpy(m_checkpoint_dir, dir); - - // open the file for writing - sprintf(m_model_filename, "%s/model", dir); - - // define filename for train state - sprintf(m_train_filename, "%s/train", dir); - - if(ckpt_type != callback_type::validation && ckpt_type != callback_type::inference){ - m_model_fd = lbann::openwrite(m_model_filename); - if (m_model_fd < 0) { - LBANN_ERROR(std::string{} - + "failed to open file (" + m_model_filename + ")"); - } - - m_train_fd = lbann::openwrite(m_train_filename); - if (m_train_fd < 0) { - LBANN_ERROR(std::string{} - + "failed to open file (" + m_train_filename + ")"); - } - } - if (ckpt_type == callback_type::validation || ckpt_type == callback_type::batch){ - sprintf(m_validate_filename, "%s/validate", dir); - m_validate_fd = lbann::openwrite(m_validate_filename); - if (m_validate_fd < 0) { - LBANN_ERROR(std::string{} - + "failed to open file (" + m_validate_filename + ")"); + m_checkpoint_dir = dir; + + for(persist_type pt : persist_type_iterator()) { + // open the file for writing + m_filenames[pt] = dir + to_string(pt); + // Do not explicitly open several files -- this state is saved via Cereal + if(pt != persist_type::metrics && + pt != persist_type::testing && + pt != persist_type::validate && + pt != persist_type::testing_context && + pt != persist_type::training_context && + pt != persist_type::validation_context && + pt != persist_type::prediction_context) { + m_FDs[pt] = lbann::openwrite(m_filenames[pt].c_str()); + if (m_FDs[pt] < 0) { + LBANN_ERROR("failed to open file (", m_filenames[pt], ")"); + } } } } void lbann::persist::close_checkpoint() { - // close model file - if (m_model_fd >= 0) { - lbann::closewrite(m_model_fd, m_model_filename); - m_model_fd = -1; - } - - // close training file - if (m_train_fd >= 0) { - lbann::closewrite(m_train_fd, m_train_filename); - m_train_fd = -1; - } - if (m_validate_fd >= 0) { - lbann::closewrite(m_validate_fd, m_validate_filename); - m_validate_fd = -1; + for(persist_type pt : persist_type_iterator()) { + if (m_FDs[pt] >= 0) { + lbann::closewrite(m_FDs[pt], 
m_filenames[pt].c_str()); + m_FDs[pt] = -1; + m_filenames[pt] = ""; + } } } -void lbann::persist::open_restart(const char *dir) { +void lbann::persist::open_restart(const std::string& dir) { // copy checkpoint directory - strcpy(m_checkpoint_dir, dir); - // open the file for writing - sprintf(m_model_filename, "%s/model", dir); - - // define filename for train state - sprintf(m_train_filename, "%s/train", dir); - // define filename for validate phase state - sprintf(m_validate_filename, "%s/validate", dir); - - m_model_fd = lbann::openread(m_model_filename); - if (m_model_fd < 0) { - LBANN_ERROR(std::string{} - + "failed to read file (" + m_model_filename + ")"); - } - - m_train_fd = lbann::openread(m_train_filename); - if (m_train_fd < 0) { - LBANN_ERROR(std::string{} - + "failed to read file (" + m_train_filename + ")"); - } - m_validate_fd = lbann::openread(m_validate_filename); - if (m_validate_fd < 0) { - LBANN_WARNING(std::string{} - + "failed to read file (" + m_validate_filename + "), " - + "which is not an error if validation percent = 0"); + m_checkpoint_dir = dir; + + for(persist_type pt : persist_type_iterator()) { + // open the file for reading + m_filenames[pt] = dir + to_string(pt); + if(pt != persist_type::metrics && + pt != persist_type::testing && + pt != persist_type::validate && + pt != persist_type::testing_context && + pt != persist_type::training_context && + pt != persist_type::validation_context && + pt != persist_type::prediction_context) { + m_FDs[pt] = lbann::openread(m_filenames[pt].c_str()); + if (m_FDs[pt] < 0) { + LBANN_ERROR("failed to open file (", m_filenames[pt], ")"); + } + } } } void lbann::persist::close_restart() { - // close model file - lbann::closeread(m_model_fd, m_model_filename); - m_model_fd = -1; - // close training file - lbann::closeread(m_train_fd, m_train_filename); - m_train_fd = -1; - // close validate file - lbann::closeread(m_validate_fd, m_validate_filename); - m_validate_fd = -1; - + for(persist_type pt : persist_type_iterator()) { + if (m_FDs[pt] >= 0) { + lbann::closeread(m_FDs[pt], m_filenames[pt].c_str()); + m_FDs[pt] = -1; + m_filenames[pt] = ""; + } + } } bool lbann::persist::write_distmat(persist_type type, const char *name, AbsDistMat *M) { @@ -347,16 +300,14 @@ bool lbann::persist::write_distmat(persist_type type, const char *name, AbsDistM } else if (type == persist_type::model) { filename += std::string("/model_") + name; } else { - std::stringstream err; - err << "invalid persist_type (" << static_cast(type) << ")"; - LBANN_ERROR(err.str()); + LBANN_ERROR("invalid persist_type (", static_cast(type), ")"); } El::Write(*M, filename, El::BINARY, ""); //Write_MPI(M, filename, BINARY, ""); uint64_t bytes = 2 * sizeof(El::Int) + M->Height() * M->Width() * sizeof(DataType); - m_bytes += bytes; + m_bytes[type] += bytes; return true; } @@ -369,48 +320,51 @@ bool lbann::persist::read_distmat(persist_type type, const char *name, AbsDistMa } else if (type == persist_type::model) { filename += std::string("/model_") + name; } else { - std::stringstream err; - err << "invalid persist_type (" << static_cast(type) << ")"; - LBANN_ERROR(err.str()); + LBANN_ERROR("invalid persist_type (", static_cast(type), ")"); } // check whether file exists int exists = lbann::exists(filename.c_str()); if (! 
exists) { - LBANN_ERROR("failed to read distributed matrix from file (" + filename + ")"); + LBANN_ERROR("failed to read distributed matrix from file (", filename, ")"); return false; } El::Read(*M, filename, El::BINARY, true); //Read_MPI(M, filename, BINARY, 1); uint64_t bytes = 2 * sizeof(El::Int) + M->Height() * M->Width() * sizeof(DataType); - m_bytes += bytes; + m_bytes[type] += bytes; return true; } bool lbann::persist::write_bytes(persist_type type, const char *name, const void *buf, size_t size) { int fd = get_fd(type); + std::string filename = get_filename(type); if (fd >= 0) { ssize_t rc = write(fd, buf, size); if (rc != (ssize_t) size) { - LBANN_ERROR(std::string{} + "failed to write file (" + name + ")"); + LBANN_ERROR("failed to write to fd ", fd, + " for file ", filename, " and field (", name, ")"); return false; } - m_bytes += size; + m_bytes[type] += size; } return true; } bool lbann::persist::read_bytes(persist_type type, const char *name, void *buf, size_t size) { int fd = get_fd(type); + std::string filename = get_filename(type); if (fd >= 0) { ssize_t rc = read(fd, buf, size); if (rc != (ssize_t) size) { - LBANN_ERROR(std::string{} + "failed to read file (" + name + ")"); + LBANN_ERROR("failed to read ", size, " bytes from fd ", + fd, " for file ", filename, " and field (", name, + ") at offset ", m_bytes[type]); return false; } - m_bytes += size; + m_bytes[type] += size; } else { return false; @@ -477,15 +431,11 @@ bool lbann::persist::read_string(persist_type type, const char *name, char *val, } int lbann::persist::get_fd(persist_type type) const { - int fd = -1; - if (type == persist_type::train) { - fd = m_train_fd; - } else if (type == persist_type::model) { - fd = m_model_fd; - } else if (type == persist_type::validate) { - fd = m_validate_fd; - } - return fd; + return m_FDs.at(type); +} + +std::string lbann::persist::get_filename(persist_type type) const { + return m_filenames.at(type); } /**************************************************** @@ -506,9 +456,7 @@ bool lbann::read_distmat(int fd, const char *name, DistMat *M, uint64_t *bytes) // check whether file exists int exists = lbann::exists(name); if (! 
exists) { - LBANN_ERROR(std::string{} - + "failed to read distributed matrix from file " - + "(" + name + ")"); + LBANN_ERROR("failed to read distributed matrix from file (", name, ")"); return false; } @@ -525,7 +473,7 @@ bool lbann::write_bytes(int fd, const char *name, const void *buf, size_t size) if (fd >= 0) { ssize_t rc = write(fd, buf, size); if (rc != (ssize_t) size) { - LBANN_ERROR(std::string{} + "failed to write file (" + name + ")"); + LBANN_ERROR("failed to write file (", name, ")"); return false; } } @@ -536,7 +484,7 @@ bool lbann::read_bytes(int fd, const char *name, void *buf, size_t size) { if (fd >= 0) { ssize_t rc = read(fd, buf, size); if (rc != (ssize_t) size) { - LBANN_ERROR(std::string{} + "failed to read file (" + name + ")"); + LBANN_ERROR("failed to read file (", name, ")"); return false; } } @@ -589,7 +537,7 @@ bool lbann::write_string(int fd, const char *name, const char *buf, size_t size) if (fd > 0) { ssize_t rc = write(fd, buf, size); if (rc != (ssize_t) size) { - LBANN_ERROR(std::string{} + "failed to write file (" + name + ")"); + LBANN_ERROR("failed to write file (", name, ")"); return false; } } @@ -600,7 +548,7 @@ bool lbann::read_string(int fd, const char *name, char *buf, size_t size) { if (fd > 0) { ssize_t rc = read(fd, buf, size); if (rc <= 0) { - LBANN_ERROR(std::string{} + "failed to read file (" + name + ")"); + LBANN_ERROR("failed to read file (", name, ")"); return false; } } diff --git a/src/layers/layer.cpp b/src/layers/layer.cpp index d404e59d170..8ba242ee38f 100644 --- a/src/layers/layer.cpp +++ b/src/layers/layer.cpp @@ -29,6 +29,7 @@ #include "lbann/models/model.hpp" #include "lbann/io/file_io.hpp" #include "lbann/io/persist.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" #include @@ -253,7 +254,8 @@ void Layer::forward_prop() { const auto fp_start = get_time(); // Setup tensors - const auto& mini_batch_size = m_model->get_current_mini_batch_size(); + const auto& c = static_cast(m_model->get_execution_context()); + const auto& mini_batch_size = c.get_current_mini_batch_size(); fp_setup_inputs(mini_batch_size); fp_setup_outputs(mini_batch_size); @@ -285,7 +287,8 @@ void Layer::back_prop() { const auto bp_start = get_time(); // Setup tensors - const auto& mini_batch_size = m_model->get_current_mini_batch_size(); + const auto& c = static_cast(m_model->get_execution_context()); + const auto& mini_batch_size = c.get_current_mini_batch_size(); bp_setup_gradient_wrt_outputs(mini_batch_size); bp_setup_gradient_wrt_inputs(mini_batch_size); diff --git a/src/layers/learning/channelwise_scale_bias.cpp b/src/layers/learning/channelwise_scale_bias.cpp index d837982387a..b019ae28a5a 100644 --- a/src/layers/learning/channelwise_scale_bias.cpp +++ b/src/layers/learning/channelwise_scale_bias.cpp @@ -25,6 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/layers/learning/channelwise_scale_bias.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -125,7 +126,8 @@ void channelwise_scale_bias_layer // Update optimizer with gradient auto* opt = m_weights[0]->get_optimizer(); if (opt != nullptr) { - const El::Int mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto& c = static_cast(this->m_model->get_execution_context()); + const auto mini_batch_size = c.get_effective_mini_batch_size(); opt->add_to_gradient(*m_weights_gradient, DataType{1} / mini_batch_size, true); diff --git a/src/layers/learning/channelwise_scale_bias.cu 
b/src/layers/learning/channelwise_scale_bias.cu index 0843fe990ac..a6d37f04251 100644 --- a/src/layers/learning/channelwise_scale_bias.cu +++ b/src/layers/learning/channelwise_scale_bias.cu @@ -28,6 +28,7 @@ #ifdef HYDROGEN_HAVE_CUB #include "cub/block/block_reduce.cuh" #endif // HYDROGEN_HAVE_CUB +#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -264,7 +265,8 @@ void channelwise_scale_bias_layer // Update optimizer with gradient auto* opt = m_weights[0]->get_optimizer(); if (opt != nullptr) { - const El::Int mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto& c = static_cast(this->m_model->get_execution_context()); + const auto mini_batch_size = c.get_effective_mini_batch_size(); opt->add_to_gradient(*m_weights_gradient, DataType{1} / mini_batch_size, true); diff --git a/src/layers/learning/embedding.cpp b/src/layers/learning/embedding.cpp index c87e33aaa90..a84f7223dab 100644 --- a/src/layers/learning/embedding.cpp +++ b/src/layers/learning/embedding.cpp @@ -26,6 +26,7 @@ #include "lbann/layers/learning/embedding.hpp" #include "lbann/models/model.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -122,7 +123,8 @@ void embedding_layer::bp_compute() { auto& local_dict_grad = m_dictionary_gradient.Matrix(); const auto& local_output_grad = get_local_prev_error_signals(); const auto& local_width = local_input.Width(); - const auto& mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto& c = static_cast(this->m_model->get_execution_context()); + const auto& mini_batch_size = c.get_effective_mini_batch_size(); // Update appropriate columns of gradient w.r.t. dictionary El::Zero(local_dict_grad); diff --git a/src/layers/learning/entrywise_scale_bias.cpp b/src/layers/learning/entrywise_scale_bias.cpp index 9478bbca732..085ac42c259 100644 --- a/src/layers/learning/entrywise_scale_bias.cpp +++ b/src/layers/learning/entrywise_scale_bias.cpp @@ -25,6 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/layers/learning/entrywise_scale_bias.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -139,22 +140,24 @@ void entrywise_scale_bias_layer template <> void entrywise_scale_bias_layer ::bp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(dynamic_cast(get_local_prev_activations()), dynamic_cast(get_local_prev_error_signals()), dynamic_cast(get_local_error_signals()), *this->m_weights[0], *m_weights_gradient, - this->m_model->get_effective_mini_batch_size()); + c.get_effective_mini_batch_size()); } template <> void entrywise_scale_bias_layer ::bp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(dynamic_cast(get_local_prev_activations()), dynamic_cast(get_local_prev_error_signals()), dynamic_cast(get_local_error_signals()), *this->m_weights[0], *m_weights_gradient, - this->m_model->get_effective_mini_batch_size()); + c.get_effective_mini_batch_size()); } } // namespace lbann diff --git a/src/layers/learning/entrywise_scale_bias.cu b/src/layers/learning/entrywise_scale_bias.cu index aac88d96243..d0492ab78a6 100644 --- a/src/layers/learning/entrywise_scale_bias.cu +++ b/src/layers/learning/entrywise_scale_bias.cu @@ -25,6 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/layers/learning/entrywise_scale_bias.hpp" +#include 
"lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -195,22 +196,24 @@ void entrywise_scale_bias_layer template <> void entrywise_scale_bias_layer ::bp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(dynamic_cast(get_local_prev_activations()), dynamic_cast(get_local_prev_error_signals()), dynamic_cast(get_local_error_signals()), *this->m_weights[0], *m_weights_gradient, - this->m_model->get_effective_mini_batch_size()); + c.get_effective_mini_batch_size()); } template <> void entrywise_scale_bias_layer ::bp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(dynamic_cast(get_local_prev_activations()), dynamic_cast(get_local_prev_error_signals()), dynamic_cast(get_local_error_signals()), *this->m_weights[0], *m_weights_gradient, - this->m_model->get_effective_mini_batch_size()); + c.get_effective_mini_batch_size()); } } // namespace lbann diff --git a/src/layers/learning/fully_connected.cpp b/src/layers/learning/fully_connected.cpp index 730a58631be..4454d6e8897 100644 --- a/src/layers/learning/fully_connected.cpp +++ b/src/layers/learning/fully_connected.cpp @@ -25,6 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/layers/learning/fully_connected.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -100,9 +101,10 @@ void fully_connected_layer::fp_com /** CPU implementation of backward prop computation. */ template <> void fully_connected_layer::bp_compute() { + auto& c = static_cast(this->m_model->get_execution_context()); // Effective mini-batch size - const int mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto mini_batch_size = c.get_effective_mini_batch_size(); // Matrices const auto& linearity = m_weights[0]->get_values(); @@ -208,9 +210,10 @@ void fully_connected_layer::fp_comp /** CPU implementation of backward prop computation. */ template <> void fully_connected_layer::bp_compute() { + auto& c = static_cast(this->m_model->get_execution_context()); // Effective mini-batch size - const int mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto mini_batch_size = c.get_effective_mini_batch_size(); // Matrices const auto& local_linearity = m_weights[0]->get_values().LockedMatrix(); @@ -292,9 +295,10 @@ void fully_connected_layer::fp_comp /** GPU implementation of backward prop computation. 
*/ template <> void fully_connected_layer::bp_compute() { + auto& c = static_cast(this->m_model->get_execution_context()); // Effective mini-batch size - const int mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto mini_batch_size = c.get_effective_mini_batch_size(); // Matrices const auto& local_linearity = m_weights[0]->get_values().LockedMatrix(); @@ -394,9 +398,10 @@ void fully_connected_layer::fp_com template <> void fully_connected_layer::bp_compute() { + auto& c = static_cast(this->m_model->get_execution_context()); // Effective mini-batch size - const int mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto mini_batch_size = c.get_effective_mini_batch_size(); // Matrices const auto& linearity = m_weights[0]->get_values(); diff --git a/src/layers/regularizers/batch_normalization.cpp b/src/layers/regularizers/batch_normalization.cpp index 2d5cd8cc95f..c93130f8442 100644 --- a/src/layers/regularizers/batch_normalization.cpp +++ b/src/layers/regularizers/batch_normalization.cpp @@ -25,6 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/layers/regularizers/batch_normalization.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -32,7 +33,7 @@ template <> void batch_normalization_layer::fp_compute() { constexpr DataType zero = 0; constexpr DataType one = 1; - const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; + const bool is_training = this->m_model->get_execution_context().get_execution_mode() == execution_mode::training; // Matrices const auto& input = get_prev_activations(); @@ -157,7 +158,7 @@ void batch_normalization_layer::fp_ template <> void batch_normalization_layer::bp_compute() { constexpr DataType one = 1; - const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; + const bool is_training = this->m_model->get_execution_context().get_execution_mode() == execution_mode::training; // Matrices const auto& local_scale = this->m_weights[0]->get_values().LockedMatrix(); @@ -177,7 +178,8 @@ void batch_normalization_layer::bp_ auto& local_bias_gradient = m_bias_gradient->Matrix(); // Matrix parameters - const El::Int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto& c = static_cast(this->m_model->get_execution_context()); + const auto effective_mini_batch_size = c.get_effective_mini_batch_size(); const auto& width = input.Width(); const auto& local_width = local_input.Width(); const auto& output_dims = get_output_dims(); diff --git a/src/layers/regularizers/batch_normalization.cu b/src/layers/regularizers/batch_normalization.cu index 02f6b6071f2..664f7ad847c 100644 --- a/src/layers/regularizers/batch_normalization.cu +++ b/src/layers/regularizers/batch_normalization.cu @@ -26,6 +26,7 @@ #include "lbann/layers/regularizers/batch_normalization.hpp" #include "lbann/utils/cuda.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -296,7 +297,7 @@ __global__ void backprop2_kernel( template <> void batch_normalization_layer::fp_compute() { constexpr DataType one = 1; - const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; + const bool is_training = this->m_model->get_execution_context().get_execution_mode() == execution_mode::training; // CUDA objects CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); @@ -406,7 +407,7 @@ void batch_normalization_layer::fp_ template 
<> void batch_normalization_layer::bp_compute() { constexpr DataType one = 1; - const bool is_training = this->m_model->get_execution_mode() == execution_mode::training; + const bool is_training = this->m_model->get_execution_context().get_execution_mode() == execution_mode::training; // CUDA objects CHECK_CUDA(cudaSetDevice(El::GPUManager::Device())); @@ -430,7 +431,8 @@ void batch_normalization_layer::bp_ auto& local_bias_gradient = m_bias_gradient->Matrix(); // Matrix parameters - const El::Int effective_mini_batch_size = this->m_model->get_effective_mini_batch_size(); + const auto& c = static_cast(this->m_model->get_execution_context()); + const auto effective_mini_batch_size = c.get_effective_mini_batch_size(); const auto& width = input.Width(); const auto& local_width = local_input.Width(); const auto& output_dims = get_output_dims(); diff --git a/src/layers/regularizers/entrywise_batch_normalization.cpp b/src/layers/regularizers/entrywise_batch_normalization.cpp index f288d4b848b..4cdd51fbbf7 100644 --- a/src/layers/regularizers/entrywise_batch_normalization.cpp +++ b/src/layers/regularizers/entrywise_batch_normalization.cpp @@ -25,6 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/layers/regularizers/entrywise_batch_normalization.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -375,10 +376,11 @@ void bp_impl(lbann_comm& comm, // Template instantiation template <> void entrywise_batch_normalization_layer::fp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); fp_impl(*get_comm(), m_decay, m_epsilon, - m_model->get_execution_mode() == execution_mode::training, + c.get_execution_mode() == execution_mode::training, get_prev_activations(), get_activations(), *m_batch_statistics, @@ -387,10 +389,11 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::fp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); fp_impl(*get_comm(), m_decay, m_epsilon, - m_model->get_execution_mode() == execution_mode::training, + c.get_execution_mode() == execution_mode::training, get_prev_activations(), get_activations(), *m_batch_statistics, @@ -399,9 +402,10 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::bp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(*get_comm(), m_epsilon, - m_model->get_execution_mode() == execution_mode::training, + c.get_execution_mode() == execution_mode::training, get_prev_activations(), get_prev_error_signals(), get_error_signals(), @@ -411,9 +415,10 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::bp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(*get_comm(), m_epsilon, - m_model->get_execution_mode() == execution_mode::training, + c.get_execution_mode() == execution_mode::training, get_prev_activations(), get_prev_error_signals(), get_error_signals(), diff --git a/src/layers/regularizers/entrywise_batch_normalization.cu b/src/layers/regularizers/entrywise_batch_normalization.cu index 79108fd3321..fde189373bc 100644 --- a/src/layers/regularizers/entrywise_batch_normalization.cu +++ b/src/layers/regularizers/entrywise_batch_normalization.cu @@ -26,6 +26,7 @@ #include "lbann/layers/regularizers/entrywise_batch_normalization.hpp" #include "lbann/utils/cuda.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace 
lbann { @@ -563,10 +564,11 @@ void bp_impl(lbann_comm& comm, // Template instantiation template <> void entrywise_batch_normalization_layer::fp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); fp_impl(*get_comm(), m_decay, m_epsilon, - m_model->get_execution_mode() == execution_mode::training, + c.get_execution_mode() == execution_mode::training, get_prev_activations(), get_activations(), *m_batch_statistics, @@ -575,10 +577,11 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::fp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); fp_impl(*get_comm(), m_decay, m_epsilon, - m_model->get_execution_mode() == execution_mode::training, + c.get_execution_mode() == execution_mode::training, get_prev_activations(), get_activations(), *m_batch_statistics, @@ -587,9 +590,10 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::bp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(*get_comm(), m_epsilon, - m_model->get_execution_mode() == execution_mode::training, + c.get_execution_mode() == execution_mode::training, get_prev_activations(), get_prev_error_signals(), get_error_signals(), @@ -599,9 +603,10 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::bp_compute() { + const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(*get_comm(), m_epsilon, - m_model->get_execution_mode() == execution_mode::training, + c.get_execution_mode() == execution_mode::training, get_prev_activations(), get_prev_error_signals(), get_error_signals(), diff --git a/src/metrics/metric.cpp b/src/metrics/metric.cpp index 80c75d5df8f..9540a995f84 100644 --- a/src/metrics/metric.cpp +++ b/src/metrics/metric.cpp @@ -50,31 +50,6 @@ void metric_statistics::reset() { m_num_samples = 0; } -bool metric_statistics::pack_scalars(persist& p) { - p.write_double(persist_type::validate, "sum", m_sum); - p.write_uint64(persist_type::validate, "num_samples", m_num_samples); - return true; -} - -bool metric_statistics::unpack_scalars(persist& p, struct packing_header *header) { - double sum; - uint64_t num_samples; - p.read_double(persist_type::validate, "sum", &sum); - p.read_uint64(persist_type::validate, "num_samples", (uint64_t *) &num_samples); - m_sum = sum; - m_num_samples = num_samples; - if (header != nullptr) { - header->sum = sum; - header->num_samples = num_samples; - } - return true; -} - -void metric_statistics::unpack_header(struct packing_header& header) { - m_sum = header.sum; - m_num_samples = header.num_samples; -} - metric::metric(lbann_comm *comm) : m_comm(comm) {} EvalType metric::get_mean_value(execution_mode mode) const { @@ -114,48 +89,23 @@ void metric::set_layer_pointers(std::vector layers) { bool metric::save_to_checkpoint_shared(persist& p) { // write out fields we need to save for model if (m_comm->am_trainer_master()) { - m_statistics[execution_mode::training].pack_scalars(p); - m_statistics[execution_mode::testing].pack_scalars(p); - m_statistics[execution_mode::validation].pack_scalars(p); + write_cereal_archive(*this, p, persist_type::metrics, ".xml"); } return true; } bool metric::load_from_checkpoint_shared(persist& p) { - struct metric_statistics::packing_header training_header, validation_header, testing_header; - if (m_comm->am_trainer_master()) { - m_statistics[execution_mode::training].unpack_scalars(p, &training_header); - 
m_statistics[execution_mode::testing].unpack_scalars(p, &testing_header); - m_statistics[execution_mode::validation].unpack_scalars(p, &validation_header); - } - - m_comm->trainer_broadcast(0, training_header); - m_comm->trainer_broadcast(0, validation_header); - m_comm->trainer_broadcast(0, testing_header); - - m_statistics[execution_mode::training].unpack_header(training_header); - m_statistics[execution_mode::validation].unpack_header(validation_header); - m_statistics[execution_mode::testing].unpack_header(testing_header); + load_from_shared_cereal_archive(*this, p, persist_type::metrics, *m_comm, ".xml"); return true; } bool metric::save_to_checkpoint_distributed(persist& p) { - // write out fields we need to save for model - m_statistics[execution_mode::training].pack_scalars(p); - m_statistics[execution_mode::testing].pack_scalars(p); - m_statistics[execution_mode::validation].pack_scalars(p); + write_cereal_archive(*this, p, persist_type::metrics, ".xml"); return true; } bool metric::load_from_checkpoint_distributed(persist& p) { - struct metric_statistics::packing_header training_header, validation_header, testing_header; - m_statistics[execution_mode::training].unpack_scalars(p, &training_header); - m_statistics[execution_mode::testing].unpack_scalars(p, &testing_header); - m_statistics[execution_mode::validation].unpack_scalars(p, &validation_header); - - m_statistics[execution_mode::training].unpack_header(training_header); - m_statistics[execution_mode::validation].unpack_header(validation_header); - m_statistics[execution_mode::testing].unpack_header(testing_header); + read_cereal_archive(*this, p, persist_type::metrics, ".xml"); return true; } diff --git a/src/models/model.cpp b/src/models/model.cpp index 4e4da3e6f04..1eb014dc7c5 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -25,6 +25,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "lbann/models/model.hpp" +#include "lbann/trainers/trainer.hpp" #include "lbann/callbacks/callback.hpp" #include "lbann/callbacks/save_model.hpp" #include "lbann/io/persist.hpp" @@ -56,13 +57,12 @@ namespace lbann { // ============================================= model::model(lbann_comm* comm, - El::Int mini_batch_size, + size_t mini_batch_size, objective_function* obj_fn, optimizer* default_optimizer) - : m_comm(comm), - m_current_mini_batch_size(mini_batch_size), + : m_execution_context(nullptr), + m_comm(comm), m_max_mini_batch_size(mini_batch_size), - m_effective_mini_batch_size(mini_batch_size), m_default_optimizer(default_optimizer), m_objective_function(obj_fn) { @@ -74,16 +74,10 @@ model::model(lbann_comm* comm, } model::model(const model& other) : + m_execution_context(other.m_execution_context), m_comm(other.m_comm), m_name(other.m_name), - m_execution_mode(other.m_execution_mode), - m_epoch(other.m_epoch), - m_step(other.m_step), - m_terminate_training(other.m_terminate_training), - m_current_mini_batch_size(other.m_current_mini_batch_size), - m_max_mini_batch_size(other.m_max_mini_batch_size), - m_effective_mini_batch_size(other.m_effective_mini_batch_size), - m_background_io_allowed(other.m_background_io_allowed) { + m_max_mini_batch_size(other.m_max_mini_batch_size) { // Deep copies m_default_optimizer = (other.m_default_optimizer ? 
@@ -128,6 +122,7 @@ model::model(const model& other) : model& model::operator=(const model& other) { // Delete objects + if (m_execution_context != nullptr) { delete m_execution_context; } /// @todo BVE FIXME what do we do with smart pointers here if (m_objective_function != nullptr) { delete m_objective_function; } for (const auto& m : m_metrics) { delete m; } for (const auto& cb : m_callbacks) { delete cb; } @@ -136,16 +131,10 @@ model& model::operator=(const model& other) { // Shallow copies m_comm = other.m_comm; m_name = other.m_name; - m_execution_mode = other.m_execution_mode; - m_epoch = other.m_epoch; - m_step = other.m_step; - m_terminate_training = other.m_terminate_training; - m_current_mini_batch_size = other.m_current_mini_batch_size; m_max_mini_batch_size = other.m_max_mini_batch_size; - m_effective_mini_batch_size = other.m_effective_mini_batch_size; - m_background_io_allowed = other.m_background_io_allowed; // Deep copies + m_execution_context = other.m_execution_context; m_objective_function = other.m_objective_function; m_metrics = other.m_metrics; m_callbacks = other.m_callbacks; @@ -337,27 +326,7 @@ const std::vector model::get_weights() const { return weights_list; } -void model::set_execution_mode(execution_mode mode) { - m_execution_mode = mode; -} - -execution_mode model::get_execution_mode() const noexcept { - return m_execution_mode; -} - -El::Int model::get_step() const noexcept { - return get_step(get_execution_mode()); -} - -El::Int model::get_step(execution_mode mode) const noexcept { - if (m_step.count(mode) > 0) { - return m_step.at(mode); - } else { - return 0; - } -} - -int model::get_num_iterations_per_epoch(execution_mode mode) const { +size_t model::get_num_iterations_per_epoch(execution_mode mode) const { for (El::Int i = 0; i < get_num_layers(); ++i) { const auto* input = dynamic_cast(&get_layer(i)); if (input != nullptr) { @@ -600,11 +569,7 @@ void model::remap_pointers(const std::unordered_map& layer_map, // Setup // ============================================= -void model::setup(std::shared_ptr io_thread_pool) { - - // Setup I/O threads - set up before setting up the layers (input - // layer depends on having a properly initialized thread pool) - m_io_thread_pool = std::move(io_thread_pool); +void model::setup() { // Setup layers setup_layer_topology(); @@ -614,9 +579,6 @@ void model::setup(std::shared_ptr io_thread_pool) { // Setup weights setup_weights(); - // Setup objective function - m_objective_function->setup(*this); - // Setup metrics for (const auto& m : m_metrics) { m->setup(*this); @@ -975,60 +937,6 @@ void model::add_split_layers(std::unordered_set& layer_names) { // Execution // ============================================= -void model::evaluate(execution_mode mode, int num_batches) { - - // Return early if execution mode is invalid - if (!is_execution_mode_valid(mode)) return; - if (mode != execution_mode::validation - && mode != execution_mode::testing) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "invalid execution mode for evaluation"; - throw lbann_exception(err.str()); - } - - // Evaluate on all mini-batches - reset_epoch_statistics(mode); - reset_mode_and_model(mode); - do_evaluate_begin_cbs(mode); - if (num_batches > 0) { - for (int i = 0; i < num_batches; i++) { evaluate_mini_batch(mode); } - } else { - while (!evaluate_mini_batch(mode)) {} - } - do_evaluate_end_cbs(mode); -} - -void model::train(int num_epochs, int num_batches) { - do_train_begin_cbs(); - for (int epoch = m_epoch; epoch < 
num_epochs; ++epoch) { - if (get_terminate_training()) { break; } - - // Initialize epoch - reset_mode_and_model(execution_mode::training); - do_epoch_begin_cbs(); - - // Training iterations - if (num_batches > 0) { - for (int i = 0; i < num_batches; i++) { train_mini_batch(); } - } else { - while (!train_mini_batch()) {} - } - - // Finalize epoch - ++m_epoch; - reconcile_weight_values(); - do_epoch_end_cbs(); - reset_epoch_statistics(execution_mode::training); - - // Evaluate on validation set - evaluate(execution_mode::validation); - - } - do_train_end_cbs(); -} - - void model::collect_background_data_fetch(execution_mode mode) { for (El::Int i = 0; i < get_num_layers(); ++i) { auto *input = dynamic_cast<generic_input_layer*>(&get_layer(i)); @@ -1065,8 +973,9 @@ void model::mark_data_store_explicitly_loading(execution_mode mode) { // At the start of the epoch, set the execution mode and make sure // that each layer points to this model -void model::reset_mode_and_model(execution_mode mode) { - set_execution_mode(mode); +void model::reset_mode(execution_context& context, execution_mode mode) { + m_execution_context = static_cast<observer_ptr<execution_context>>(&context); + // set_execution_mode(mode); for (El::Int i = 0; i < get_num_layers(); ++i) { get_layer(i).set_model(this); } @@ -1074,78 +983,16 @@ void model::reset_mode(execution_context& context, execution_mode mode) { // At the end of the epoch, clean up the objective function and metrics void model::reset_epoch_statistics(execution_mode mode) { - m_objective_function->reset_statistics(mode); + get_objective_function()->reset_statistics(mode); for (const auto& m : m_metrics) { m->reset_statistics(mode); } } -bool model::evaluate_mini_batch(execution_mode mode) { - reset_mode_and_model(mode); - do_batch_begin_cbs(mode); - forward_prop(mode); - m_objective_function->start_evaluation(mode, get_current_mini_batch_size()); - m_objective_function->finish_evaluation(mode, get_current_mini_batch_size()); - for (const auto& m : m_metrics) { - m->evaluate(mode, get_current_mini_batch_size()); - } - const bool finished = update_layers(); - - // Increment mini-batch step - /// @todo Move after the callbacks - if (m_step.count(mode) < 1) { m_step[mode] = 0; } - ++m_step[mode]; - - do_batch_end_cbs(mode); - return finished; -} - -bool model::train_mini_batch() { - constexpr execution_mode mode = execution_mode::training; - reset_mode_and_model(mode); - do_batch_begin_cbs(mode); - - - bool finished; -#if defined(LBANN_HAVE_OMP_TASKLOOP) - LBANN_OMP_PARALLEL - { - #pragma omp single - { -#endif - // Forward prop step - clear_gradients(); - forward_prop(mode); - // Result is not needed until the end of the mini-batch. - m_objective_function->start_evaluation(mode, get_current_mini_batch_size()); - - // Backward prop step - m_objective_function->differentiate(); - backward_prop(); - m_objective_function->compute_weight_regularization(); - - // Finish evaluation.
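// [Editor's note] The mini-batch loops deleted in this hunk are not gone: they
// reappear, nearly verbatim, as sgd_training_algorithm member functions in
// src/training_algorithms/sgd_training_algorithm.cpp later in this patch. A minimal
// sketch of the replacement driver-side calls; `comm`, `m`, and `io_pool` (a
// lbann_comm*, a model, and a std::unique_ptr<thread_pool>) are assumed to be set up
// elsewhere, and the signatures used are the ones introduced in src/trainers/trainer.cpp:
void sketch_new_training_driver(lbann_comm* comm, model& m,
                                std::unique_ptr<thread_pool> io_pool) {
  trainer t(comm);                  // the trainer, not the model, now owns run state
  t.setup(std::move(io_pool));      // the I/O thread pool moves from model to trainer
  t.train(&m, /*num_epochs=*/10, /*num_batches=*/0);          // 0 => run full epochs
  t.evaluate(&m, execution_mode::testing, /*num_batches=*/0); // 0 => full test pass
}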
- m_objective_function->finish_evaluation(mode, get_current_mini_batch_size()); +void model::evaluate_metrics(execution_mode mode, size_t current_mini_batch_size) { for (const auto& m : m_metrics) { - m->evaluate(mode, get_current_mini_batch_size()); + m->evaluate(mode, current_mini_batch_size); } - - // Update step - update_weights(); - finished = update_layers(); -#if defined(LBANN_HAVE_OMP_TASKLOOP) - } - } -#endif - - // Increment mini-batch step - /// @todo Move after the callbacks - if (m_step.count(mode) < 1) { m_step[mode] = 0; } - ++m_step[mode]; - - do_batch_end_cbs(execution_mode::training); - return finished; } void model::clear_gradients() { @@ -1231,103 +1078,11 @@ void model::do_setup_end_cbs() { } } -void model::do_train_begin_cbs() { - for (const auto& cb : m_callbacks) { - cb->on_train_begin(this); - } -} - -void model::do_train_end_cbs() { - for (const auto& cb : m_callbacks) { - cb->on_train_end(this); - } -} - -void model::do_evaluate_begin_cbs(execution_mode mode) { - for (const auto& cb : m_callbacks) { - switch (mode) { - case execution_mode::validation: - cb->on_validation_begin(this); break; - case execution_mode::testing: - cb->on_test_begin(this); break; - default: - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "invalid execution mode"; - throw lbann_exception(err.str()); - } - } -} - -void model::do_evaluate_end_cbs(execution_mode mode) { - for (const auto& cb : m_callbacks) { - switch (mode) { - case execution_mode::validation: - cb->on_validation_end(this); break; - case execution_mode::testing: - cb->on_test_end(this); break; - default: - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "invalid execution mode"; - throw lbann_exception(err.str()); - } - } -} - -void model::do_epoch_begin_cbs() { - for (const auto& cb : m_callbacks) { - cb->on_epoch_begin(this); - } -} - -void model::do_epoch_end_cbs() { - for (const auto& cb : m_callbacks) { - cb->on_epoch_end(this); - } -} - -void model::do_batch_begin_cbs(execution_mode mode) { - for (const auto& cb : m_callbacks) { - switch (mode) { - case execution_mode::training: - if (get_step() % cb->get_batch_interval() == 0) { - cb->on_batch_begin(this); - } - break; - case execution_mode::validation: - case execution_mode::testing: - cb->on_batch_evaluate_begin(this); - break; - default: - LBANN_ERROR("invalid execution mode"); - } - } -} - -void model::do_batch_end_cbs(execution_mode mode) { - for (const auto& cb : m_callbacks) { - switch (mode) { - case execution_mode::training: - if (get_step() % cb->get_batch_interval() == 0) { - cb->on_batch_end(this); - } - break; - case execution_mode::validation: - case execution_mode::testing: - cb->on_batch_evaluate_end(this); - break; - default: - LBANN_ERROR("invalid execution mode"); - } - } -} - void model::do_model_forward_prop_begin_cbs(execution_mode mode) { for (const auto& cb : m_callbacks) { switch (mode) { case execution_mode::training: - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_forward_prop_begin(this); } break; @@ -1345,7 +1100,7 @@ void model::do_model_forward_prop_end_cbs(execution_mode mode) { for (const auto& cb : m_callbacks) { switch (mode) { case execution_mode::training: - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_forward_prop_end(this); } break; @@ -1366,7 +1121,7 @@ void 
model::do_layer_forward_prop_begin_cbs(execution_mode mode, Layer *l) { for (const auto& cb : m_callbacks) { switch (mode) { case execution_mode::training: - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_forward_prop_begin(this, l); } break; @@ -1387,7 +1142,7 @@ void model::do_layer_forward_prop_end_cbs(execution_mode mode, Layer *l) { for (const auto& cb : m_callbacks) { switch (mode) { case execution_mode::training: - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_forward_prop_end(this, l); } break; @@ -1403,7 +1158,7 @@ void model::do_layer_forward_prop_end_cbs(execution_mode mode, Layer *l) { void model::do_model_backward_prop_begin_cbs() { for (const auto& cb : m_callbacks) { - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_backward_prop_begin(this); } } @@ -1411,7 +1166,7 @@ void model::do_model_backward_prop_begin_cbs() { void model::do_model_backward_prop_end_cbs() { for (const auto& cb : m_callbacks) { - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_backward_prop_end(this); } } @@ -1419,7 +1174,7 @@ void model::do_model_backward_prop_end_cbs() { void model::do_layer_backward_prop_begin_cbs(Layer *l) { for (const auto& cb : m_callbacks) { - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_backward_prop_begin(this, l); } } @@ -1427,7 +1182,7 @@ void model::do_layer_backward_prop_begin_cbs(Layer *l) { void model::do_layer_backward_prop_end_cbs(Layer *l) { for (const auto& cb : m_callbacks) { - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_backward_prop_end(this, l); } } @@ -1435,7 +1190,7 @@ void model::do_layer_backward_prop_end_cbs(Layer *l) { void model::do_model_optimize_begin_cbs() { for (const auto& cb : m_callbacks) { - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_optimize_begin(this); } } @@ -1443,7 +1198,7 @@ void model::do_model_optimize_begin_cbs() { void model::do_model_optimize_end_cbs() { for (const auto& cb : m_callbacks) { - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_optimize_end(this); } } @@ -1451,7 +1206,7 @@ void model::do_model_optimize_end_cbs() { void model::do_weight_optimize_begin_cbs(weights *w) { for (const auto& cb : m_callbacks) { - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_optimize_begin(this, w); } } @@ -1459,7 +1214,7 @@ void model::do_weight_optimize_begin_cbs(weights *w) { void model::do_weight_optimize_end_cbs(weights *w) { for (const auto& cb : m_callbacks) { - if (get_step() % cb->get_batch_interval() == 0) { + if (get_execution_context().get_step() % cb->get_batch_interval() == 0) { cb->on_optimize_end(this, w); } } @@ -1470,20 +1225,21 @@ void model::do_weight_optimize_end_cbs(weights *w) { // ============================================= void model::summarize_stats(lbann_summary& summarizer) { + const auto& c = get_execution_context(); for (El::Int i = 0; i < 
get_num_layers(); ++i) { - get_layer(i).summarize_stats(summarizer, get_step(execution_mode::training)); + get_layer(i).summarize_stats(summarizer, c.get_step()); } summarizer.reduce_scalar("objective", - m_objective_function->get_mean_value(m_execution_mode), - get_step(execution_mode::training)); + m_objective_function->get_mean_value(c.get_execution_mode()), + c.get_step()); summarizer.reduce_scalar( "objective_evaluation_time", m_objective_function->get_evaluation_time(), - get_step(execution_mode::training)); + c.get_step()); summarizer.reduce_scalar( "objective_differentiation_time", m_objective_function->get_differentiation_time(), - get_step(execution_mode::training)); + c.get_step()); m_objective_function->reset_counters(); double total_metric_time = 0.0; for (auto&& m : m_metrics) { @@ -1493,12 +1249,13 @@ void model::summarize_stats(lbann_summary& summarizer) { summarizer.reduce_scalar( "metric_evaluation_time", total_metric_time, - get_step(execution_mode::training)); + c.get_step()); } void model::summarize_matrices(lbann_summary& summarizer) { + const auto& c = get_execution_context(); for (El::Int i = 0; i < get_num_layers(); ++i) { - get_layer(i).summarize_matrices(summarizer, get_step(execution_mode::training)); + get_layer(i).summarize_matrices(summarizer, c.get_step()); } } @@ -1508,65 +1265,29 @@ void model::summarize_matrices(lbann_summary& summarizer) { /* struct used to serialize mode fields in file and MPI transfer */ struct lbann_model_header { - uint32_t execution_mode; - uint32_t terminate_training; - uint64_t epoch; - uint64_t training_step; - uint64_t validation_step; - uint64_t testing_step; - uint32_t max_mini_batch_size; - uint32_t current_mini_batch_size; - uint32_t callback_type;; + uint64_t max_mini_batch_size; + uint32_t callback_type; }; bool model::save_to_checkpoint_shared(persist& p) { // write out fields we need to save for model - if (p.get_cb_type() != callback_type::validation) { - if (m_comm->am_trainer_master()) { - p.write_uint32(persist_type::train, "execution_mode", (uint32_t) m_execution_mode); - p.write_uint32(persist_type::train, "terminate_training", (uint32_t) m_terminate_training); - p.write_uint64(persist_type::train, "epoch", (uint64_t) m_epoch); - p.write_uint64(persist_type::train, "training_step", (uint64_t) get_step(execution_mode::training)); - p.write_uint64(persist_type::train, "testing_step", (uint64_t) get_step(execution_mode::testing)); - p.write_uint32(persist_type::train, "max_mini_batch_size", (uint32_t) m_max_mini_batch_size); - p.write_uint32(persist_type::train, "current_mini_batch_size", (uint32_t) m_current_mini_batch_size); - p.write_uint32(persist_type::train, "persist_callback_type", (uint32_t) p.get_cb_type()); - if(p.get_cb_type() == callback_type::batch) - p.write_uint64(persist_type::validate, "validation_step", (uint64_t) get_step(execution_mode::validation)); - } + if (m_comm->am_trainer_master()) { + p.write_uint64(persist_type::model, "max_mini_batch_size", (uint64_t) m_max_mini_batch_size); + p.write_uint32(persist_type::model, "persist_callback_type", (uint32_t) p.get_cb_type()); + } - for (weights *w : m_weights) { - w->save_to_checkpoint_shared(p); - } + for (weights *w : m_weights) { + w->save_to_checkpoint_shared(p); + } - for (El::Int i = 0; i < get_num_layers(); ++i) { - if (!get_layer(i).save_to_checkpoint_shared(p)) { - return false; - } - } - if(p.get_cb_type() == callback_type::batch || get_num_iterations_per_epoch(execution_mode::validation) == 0){ - save_rng_to_checkpoint_shared(p, m_comm); 
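// [Editor's note] The explicit pack_scalars/unpack_scalars plumbing deleted from
// metric.cpp above is replaced by the cereal helpers (write_cereal_archive,
// read_cereal_archive, load_from_shared_cereal_archive). Those helpers need the
// archived type to be cereal-serializable; a plausible minimal serialize() for
// metric_statistics, matching the two members the old pack_scalars wrote, is shown
// here purely as a sketch (this exact method is an assumption, presumably supplied
// elsewhere in the series, not part of this hunk):
template <class Archive>
void metric_statistics::serialize(Archive& ar) {
  ar(CEREAL_NVP(m_sum),           // was the persist field "sum"
     CEREAL_NVP(m_num_samples));  // was the persist field "num_samples"
}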
- for (const auto& m : m_metrics) { - m->save_to_checkpoint_shared(p); - } + for (El::Int i = 0; i < get_num_layers(); ++i) { + if (!get_layer(i).save_to_checkpoint_shared(p)) { + LBANN_ERROR("Unable to save layer[",i,"]=", get_layer(i).get_name()); } } - else{ - if (m_comm->am_trainer_master()) { - p.write_uint64(persist_type::validate, "validation_step", (uint64_t) get_step(execution_mode::validation)); - } - save_rng_to_checkpoint_shared(p, m_comm); - for (weights *w : m_weights) { - w->save_to_checkpoint_shared(p); - } - for (El::Int i = 0; i < get_num_layers(); ++i) { - if (!get_layer(i).save_to_checkpoint_shared(p)) { - return false; - } - } - for (const auto& m : m_metrics) { - m->save_to_checkpoint_shared(p); - } + save_rng_to_checkpoint(p, m_comm); + for (const auto& m : m_metrics) { + m->save_to_checkpoint_shared(p); } return true; } @@ -1577,41 +1298,17 @@ bool model::load_from_checkpoint_shared(persist& p) { struct lbann_model_header header; // Assume checkpoint reload from epoch end not step end if (m_comm->am_trainer_master()) { - if (p.get_cb_type() != callback_type::validation) { - p.read_uint32(persist_type::train, "execution_mode", &header.execution_mode); - p.read_uint32(persist_type::train, "terminate_training", &header.terminate_training); - p.read_uint64(persist_type::train, "epoch", &header.epoch); - p.read_uint64(persist_type::train, "training_step", &header.training_step); - if(get_num_iterations_per_epoch(execution_mode::validation) != 0) - p.read_uint64(persist_type::validate, "validation_step", &header.validation_step); - p.read_uint64(persist_type::train, "testing_step", &header.testing_step); - p.read_uint32(persist_type::train, "max_mini_batch_size", &header.max_mini_batch_size); - p.read_uint32(persist_type::train, "current_mini_batch_size", &header.current_mini_batch_size); - p.read_uint32(persist_type::train, "persist_callback_type", &header.callback_type); - } else { - p.read_uint64(persist_type::validate, "validation_step", &header.validation_step); - } + p.read_uint64(persist_type::model, "max_mini_batch_size", &header.max_mini_batch_size); + p.read_uint32(persist_type::model, "persist_callback_type", &header.callback_type); } - load_rng_from_checkpoint_shared(p, m_comm); + load_rng_from_checkpoint(p, m_comm); // TODO: this assumes homogeneous processors // broadcast state from rank 0 m_comm->trainer_broadcast(0, header); // set our member params from values read from disk - if (p.get_cb_type() != callback_type::validation) { - m_execution_mode = (execution_mode) header.execution_mode; - m_terminate_training = (bool) header.terminate_training; - m_epoch = (int) header.epoch; - m_step[execution_mode::training] = (int) header.training_step; - if(get_num_iterations_per_epoch(execution_mode::validation) != 0) - m_step[execution_mode::validation] = (int) header.validation_step; - m_step[execution_mode::testing] = (int) header.testing_step; - m_max_mini_batch_size = (int) header.max_mini_batch_size; - m_current_mini_batch_size = (int) header.current_mini_batch_size; - // set state of persist object to know which type of ckpt we are returning from. - p.set_cb_type((callback_type) header.callback_type); - } else { - m_step[execution_mode::validation] = (int) header.validation_step; - } + m_max_mini_batch_size = (size_t) header.max_mini_batch_size; + // set state of persist object to know which type of ckpt we are returning from. 
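// [Editor's note] The lbann_model_header fields dropped here (execution mode, epoch,
// per-mode steps, current mini-batch size) move to the execution context rather than
// being lost. A short sketch of where callers find them after this patch, using only
// accessors that appear elsewhere in this series (`m` is a model attached to a context):
void sketch_runtime_counters(model& m) {
  auto& c = m.get_execution_context();
  c.get_execution_mode();                              // was model::get_execution_mode()
  c.get_step();                                        // was model::get_step(mode)
  static_cast<sgd_execution_context&>(c).get_epoch();  // was model::m_epoch (SGD-specific)
}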
+ p.set_cb_type((callback_type) header.callback_type); for (weights *w : m_weights) { w->load_from_checkpoint_shared(p); @@ -1620,7 +1317,7 @@ bool model::load_from_checkpoint_shared(persist& p) { // read in each layer for (El::Int i = 0; i < get_num_layers(); ++i) { if (!get_layer(i).load_from_checkpoint_shared(p)) { - return false; + LBANN_ERROR("Unable to load layer[",i,"]=", get_layer(i).get_name()); } } if(get_num_iterations_per_epoch(execution_mode::validation) != 0){ @@ -1636,76 +1333,36 @@ bool model::load_from_checkpoint_shared(persist& p) { bool model::save_to_checkpoint_distributed(persist& p){ // write out fields we need to save for model - if (p.get_cb_type() != callback_type::validation) { - p.write_uint32(persist_type::train, "execution_mode", (uint32_t) m_execution_mode); - p.write_uint32(persist_type::train, "terminate_training", (uint32_t) m_terminate_training); - p.write_uint64(persist_type::train, "epoch", (uint64_t) m_epoch); - p.write_uint64(persist_type::train, "training_step", (uint64_t) get_step(execution_mode::training)); - p.write_uint64(persist_type::train, "testing_step", (uint64_t) get_step(execution_mode::testing)); - p.write_uint32(persist_type::train, "max_mini_batch_size", (uint32_t) m_max_mini_batch_size); - p.write_uint32(persist_type::train, "current_mini_batch_size", (uint32_t) m_current_mini_batch_size); - p.write_uint32(persist_type::train, "persist_callback_type", (uint32_t) p.get_cb_type()); - if(p.get_cb_type() == callback_type::batch) - p.write_uint64(persist_type::validate, "validataion_step", (uint64_t) get_step(execution_mode::validation)); - - for (weights *w : m_weights) { - w->save_to_checkpoint_distributed(p); - } + p.write_uint64(persist_type::model, "max_mini_batch_size", (uint64_t) m_max_mini_batch_size); + p.write_uint32(persist_type::train, "persist_callback_type",(uint32_t) p.get_cb_type()); - for (El::Int i = 0; i < get_num_layers(); ++i) { - if (!get_layer(i).save_to_checkpoint_distributed(p)) { - return false; - } - } - if(p.get_cb_type() == callback_type::batch || get_num_iterations_per_epoch(execution_mode::validation) == 0){ - save_rng_to_checkpoint_shared(p, m_comm); - for (const auto& m : m_metrics) { - m->save_to_checkpoint_distributed(p); - } - } + // for each execution context write out them out + for (weights *w : m_weights) { + w->save_to_checkpoint_distributed(p); } - else { - p.write_uint64(persist_type::validate, "validataion_step", (uint64_t) get_step(execution_mode::validation)); - save_rng_to_checkpoint_shared(p, m_comm); - - for (El::Int i = 0; i < get_num_layers(); ++i) { - if (!get_layer(i).save_to_checkpoint_distributed(p)) { - return false; - } - } - for (const auto& m : m_metrics) { - m->save_to_checkpoint_distributed(p); + for (El::Int i = 0; i < get_num_layers(); ++i) { + if (!get_layer(i).save_to_checkpoint_distributed(p)) { + LBANN_ERROR("Unable to save layer[",i,"]=", get_layer(i).get_name()); } } + save_rng_to_checkpoint(p, m_comm); + for (const auto& m : m_metrics) { + m->save_to_checkpoint_distributed(p); + } + return true; } bool model::load_from_checkpoint_distributed(persist& p){ struct lbann_model_header header; - p.read_uint32(persist_type::train, "execution_mode", &header.execution_mode); - p.read_uint32(persist_type::train, "terminate_training", &header.terminate_training); - p.read_uint64(persist_type::train, "epoch", &header.epoch); - p.read_uint64(persist_type::train, "training_step", &header.training_step); - if(get_num_iterations_per_epoch(execution_mode::validation) != 0) - 
p.read_uint64(persist_type::validate, "validation_step", &header.validation_step); - p.read_uint64(persist_type::train, "testing_step", &header.testing_step); - p.read_uint32(persist_type::train, "max_mini_batch_size", &header.max_mini_batch_size); - p.read_uint32(persist_type::train, "current_mini_batch_size", &header.current_mini_batch_size); + p.read_uint64(persist_type::model, "max_mini_batch_size", &header.max_mini_batch_size); p.read_uint32(persist_type::train, "persist_callback_type", &header.callback_type); - m_execution_mode = (execution_mode) header.execution_mode; - m_terminate_training = (bool) header.terminate_training; - m_epoch = (int) header.epoch; - m_step[execution_mode::training] = (int) header.training_step; - if(get_num_iterations_per_epoch(execution_mode::validation) != 0) - m_step[execution_mode::validation] = (int) header.validation_step; - m_step[execution_mode::testing] = (int) header.testing_step; - m_max_mini_batch_size = (int) header.max_mini_batch_size; - m_current_mini_batch_size = (int) header.current_mini_batch_size; + m_max_mini_batch_size = (size_t) header.max_mini_batch_size; p.set_cb_type((callback_type) header.callback_type); - load_rng_from_checkpoint_shared(p, m_comm); + load_rng_from_checkpoint(p, m_comm); for (weights *w : m_weights) { w->load_from_checkpoint_distributed(p); @@ -1713,13 +1370,11 @@ bool model::load_from_checkpoint_distributed(persist& p){ for (El::Int i = 0; i < get_num_layers(); ++i) { if (!get_layer(i).load_from_checkpoint_distributed(p)) { - return false; + LBANN_ERROR("Unable to load layer[",i,"]=", get_layer(i).get_name()); } } - if(get_num_iterations_per_epoch(execution_mode::validation) != 0){ - for (const auto& m : m_metrics) { - m->load_from_checkpoint_distributed(p); - } + for (const auto& m : m_metrics) { + m->load_from_checkpoint_distributed(p); } return true; } diff --git a/src/optimizers/hypergradient_adam.cpp b/src/optimizers/hypergradient_adam.cpp index 445c13ae81b..4db7150bfff 100644 --- a/src/optimizers/hypergradient_adam.cpp +++ b/src/optimizers/hypergradient_adam.cpp @@ -145,8 +145,7 @@ void hypergradient_adam::step_compute(AbsDistMat& values, } bool hypergradient_adam::save_to_checkpoint_shared(persist& p, std::string name_prefix) { - if(p.get_cb_type() == callback_type::batch) - optimizer::save_to_checkpoint_shared(p,name_prefix); + optimizer::save_to_checkpoint_shared(p,name_prefix); if (get_comm().am_trainer_master()) { pack_scalars(p); } @@ -165,8 +164,7 @@ bool hypergradient_adam::save_to_checkpoint_shared(persist& p, std::string name_ } bool hypergradient_adam::load_from_checkpoint_shared(persist& p, std::string name_prefix) { - if(p.get_cb_type() == callback_type::batch) - optimizer::load_from_checkpoint_shared(p,name_prefix); + optimizer::load_from_checkpoint_shared(p,name_prefix); struct packing_header header; if (get_comm().am_trainer_master()) { unpack_scalars(p, &header); @@ -189,8 +187,7 @@ bool hypergradient_adam::load_from_checkpoint_shared(persist& p, std::string nam } bool hypergradient_adam::save_to_checkpoint_distributed(persist& p, std::string name_prefix) { - if(p.get_cb_type() == callback_type::batch) - optimizer::save_to_checkpoint_distributed(p,name_prefix); + optimizer::save_to_checkpoint_distributed(p,name_prefix); pack_scalars(p); char l_name[512]; @@ -207,8 +204,7 @@ bool hypergradient_adam::save_to_checkpoint_distributed(persist& p, std::string } bool hypergradient_adam::load_from_checkpoint_distributed(persist& p, std::string name_prefix) { - if(p.get_cb_type() == 
callback_type::batch) - optimizer::load_from_checkpoint_distributed(p,name_prefix); + optimizer::load_from_checkpoint_distributed(p,name_prefix); struct packing_header header; unpack_scalars(p, &header); diff --git a/src/proto/CMakeLists.txt b/src/proto/CMakeLists.txt index e719ff46f78..2c69276d4ba 100644 --- a/src/proto/CMakeLists.txt +++ b/src/proto/CMakeLists.txt @@ -15,6 +15,8 @@ if (LBANN_HAS_PROTOBUF) objective_functions.proto optimizers.proto reader.proto + trainer.proto + training_algorithm.proto transforms.proto weights.proto ) diff --git a/src/proto/factories/CMakeLists.txt b/src/proto/factories/CMakeLists.txt index 05c1259463a..34d42b4ed21 100644 --- a/src/proto/factories/CMakeLists.txt +++ b/src/proto/factories/CMakeLists.txt @@ -6,6 +6,7 @@ set_full_path(THIS_DIR_SOURCES model_factory.cpp objective_function_factory.cpp optimizer_factory.cpp + trainer_factory.cpp transform_factory.cpp weights_factory.cpp ) diff --git a/src/proto/factories/layer_graph_factory.cpp b/src/proto/factories/layer_graph_factory.cpp index 4c75e068bb2..804a9d8fe9c 100644 --- a/src/proto/factories/layer_graph_factory.cpp +++ b/src/proto/factories/layer_graph_factory.cpp @@ -170,6 +170,7 @@ void setup_unpooling_pointers(lbann_comm* comm, std::vector> construct_layer_graph( lbann_comm* comm, const std::map& data_readers, + const lbann_data::Trainer& proto_trainer, const lbann_data::Model& proto_model) { std::stringstream err; @@ -205,7 +206,7 @@ std::vector> construct_layer_graph( if (layout_str.empty()) { layout = data_layout::DATA_PARALLEL; } if (layout_str == "data_parallel") { layout = data_layout::DATA_PARALLEL; } if (layout_str == "model_parallel") { layout = data_layout::MODEL_PARALLEL; } - const auto& num_parallel_readers = proto_model.num_parallel_readers(); + const auto& num_parallel_readers = proto_trainer.num_parallel_readers(); El::Device device = El::Device::CPU; #ifdef LBANN_HAS_GPU const auto& device_str = proto_layer.device_allocation(); diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp index f56ecd3a3e9..e56fc15eb5d 100644 --- a/src/proto/factories/model_factory.cpp +++ b/src/proto/factories/model_factory.cpp @@ -239,11 +239,13 @@ std::unique_ptr construct_model( lbann_comm* comm, const std::map& data_readers, const lbann_data::Optimizer& proto_opt, + const lbann_data::Trainer& proto_trainer, const lbann_data::Model& proto_model) { // Construct layer graph auto&& layer_list = construct_layer_graph(comm, data_readers, + proto_trainer, proto_model); // Construct objective function diff --git a/src/proto/factories/trainer_factory.cpp b/src/proto/factories/trainer_factory.cpp new file mode 100644 index 00000000000..2ef088c18b0 --- /dev/null +++ b/src/proto/factories/trainer_factory.cpp @@ -0,0 +1,46 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/proto/factories.hpp" +#include "lbann/objective_functions/layer_term.hpp" + +namespace lbann { +namespace proto { + +std::unique_ptr<trainer> construct_trainer(lbann_comm* comm, + const lbann_data::Trainer& proto_trainer) { + + // Instantiate trainer + auto t = make_unique<trainer>(comm); + const auto& name = proto_trainer.name(); + if (!name.empty()) { + t->set_name(name); + } + return t; +} + +} // namespace proto +} // namespace lbann diff --git a/src/proto/lbann.proto b/src/proto/lbann.proto index 5e26d3b0a11..aa9b09f9e91 100644 --- a/src/proto/lbann.proto +++ b/src/proto/lbann.proto @@ -31,10 +31,14 @@ package lbann_data; import "reader.proto"; import "model.proto"; import "optimizers.proto"; +import "trainer.proto"; +import "training_algorithm.proto"; message LbannPB { DataReader data_reader = 1; Model model = 2; Optimizer optimizer = 3; DataSetMetaData data_set_metadata = 5; + Trainer trainer = 6; + TrainingAlgorithm training_algorithm = 76; } diff --git a/src/proto/model.proto b/src/proto/model.proto index 12e69fdf73f..29d1c4d5753 100644 --- a/src/proto/model.proto +++ b/src/proto/model.proto @@ -52,11 +52,7 @@ message Model { int64 num_epochs = 4; int64 super_steps = 121; //multiple steps/epochs currently use in GAN int64 num_batches = 122; //multiple batches/sub epoch - int64 block_size = 50; - int64 procs_per_trainer = 51; - int64 num_gpus = 53; //has no effect int64 evaluation_frequency = 54; - int64 num_parallel_readers = 100; bool serialize_io = 101; bool disable_cuda = 8; diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index b50495ee9f4..4352a065354 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -588,10 +588,10 @@ bool write_prototext_file(const std::string& fn, lbann_data::LbannPB& pb) return true; } -bool check_if_num_parallel_readers_set(const lbann_comm& comm, const lbann_data::Model& model) +bool check_if_num_parallel_readers_set(const lbann_comm& comm, const lbann_data::Trainer& trainer) { const bool master = comm.am_world_master(); - const int parallel_io = model.num_parallel_readers(); + const int parallel_io = trainer.num_parallel_readers(); if (parallel_io == 0) { if (master) { @@ -608,24 +608,24 @@ bool check_if_num_parallel_readers_set(const lbann_comm& comm, const lbann_data: void set_num_parallel_readers(const lbann_comm& comm, lbann_data::LbannPB& p) { - lbann_data::Model *model = p.mutable_model(); - const bool is_set = check_if_num_parallel_readers_set(comm, *model); + lbann_data::Trainer *trainer = p.mutable_trainer(); + const bool is_set = check_if_num_parallel_readers_set(comm, *trainer); if (!is_set) { const int parallel_io = comm.get_procs_per_trainer(); - model->set_num_parallel_readers(parallel_io); //adjust the prototext + trainer->set_num_parallel_readers(parallel_io); //adjust the prototext } } int get_requested_num_parallel_readers(const lbann_comm& comm, const lbann_data::LbannPB& p) { - const lbann_data::Model& model = p.model(); - const bool is_set = check_if_num_parallel_readers_set(comm,
model); + const lbann_data::Trainer& trainer = p.trainer(); + const bool is_set = check_if_num_parallel_readers_set(comm, trainer); if (!is_set) { return comm.get_procs_per_trainer(); } - return model.num_parallel_readers(); + return trainer.num_parallel_readers(); } void set_data_readers_filenames( @@ -726,6 +726,7 @@ void get_cmdline_overrides(const lbann_comm& comm, lbann_data::LbannPB& p) std::ostringstream err; options *opts = options::get(); + lbann_data::Trainer *trainer = p.mutable_trainer(); lbann_data::Model *model = p.mutable_model(); lbann_data::DataReader *d_reader = p.mutable_data_reader(); int size = d_reader->reader_size(); @@ -776,13 +777,13 @@ void get_cmdline_overrides(const lbann_comm& comm, lbann_data::LbannPB& p) model->set_num_epochs(opts->get_int("num_epochs")); } if (opts->has_int("block_size")) { - model->set_block_size(opts->get_int("block_size")); + trainer->set_block_size(opts->get_int("block_size")); } if (opts->has_int("procs_per_trainer")) { - model->set_procs_per_trainer(opts->get_int("procs_per_trainer")); + trainer->set_procs_per_trainer(opts->get_int("procs_per_trainer")); } if (opts->has_int("num_parallel_readers")) { - model->set_num_parallel_readers(opts->get_int("num_parallel_readers")); + trainer->set_num_parallel_readers(opts->get_int("num_parallel_readers")); } if (opts->get_bool("disable_cuda")) { model->set_disable_cuda(opts->get_bool("disable_cuda")); @@ -802,19 +803,30 @@ void print_parameters(const lbann_comm& comm, lbann_data::LbannPB& p) return; } + const lbann_data::Trainer &t = p.trainer(); const lbann_data::Model &m = p.model(); + bool disable_cuda = m.disable_cuda(); +#ifndef LBANN_HAS_GPU + disable_cuda = true; +#endif // LBANN_HAS_GPU + bool disable_cudnn = disable_cuda; +#ifndef LBANN_HAS_CUDNN + disable_cudnn = true; +#endif // LBANN_HAS_CUDNN + std::cout << std::endl << "Running with these parameters:\n" << " General:\n" << " datatype size: " << sizeof(DataType) << std::endl << " mini_batch_size: " << m.mini_batch_size() << std::endl << " num_epochs: " << m.num_epochs() << std::endl - << " block_size: " << m.block_size() << std::endl - << " procs_per_trainer: " << m.procs_per_trainer() << std::endl - << " num_parallel_readers: " << m.num_parallel_readers() << std::endl + << " block_size: " << t.block_size() << std::endl + << " procs_per_trainer: " << t.procs_per_trainer() << std::endl + << " num_parallel_readers: " << t.num_parallel_readers() << std::endl << " serialize_io: " << m.serialize_io() << std::endl - << " disable_cuda: " << m.disable_cuda() << std::endl + << " cuda: " << (disable_cuda ? "disabled" : "enabled") << std::endl + << " cudnn: " << (disable_cudnn ?
"disabled" : "enabled") << std::endl << " random_seed: " << m.random_seed() << std::endl << " data_layout: " << m.data_layout() << std::endl << " (only used for metrics)\n"; @@ -841,8 +853,6 @@ void print_help(std::ostream& os) " --saveme= You can suppress writing the file via the option:\n" " --saveme=0\n" "\n" - " To reload from a previous checkpoint you specify --ckpt_dir=\n" - "\n" "Some prototext values can be over-riden on the command line;\n" "(notes: use '1' or '0' for bool; if no value is given for a flag,\n" " e.g: --disable_cuda, then a value of '1' is assigned)\n" @@ -881,6 +891,10 @@ void print_help(std::ostream& os) " Writes out the sample list that was loaded into the current directory\n" " --ltfb_verbose \n" " Increases number of per-trainer messages that are reported\n" + " --ckpt_dir=\n" + " Save to or reload from a specific checkpoint directory.\n" + " Additionally, sets the output directory for dumping weights.\n" + " Modifies callbacks: checkpoint, save_model, dump_weights\n" "\n" "DataReaders:\n" " --data_filedir=\n" diff --git a/src/proto/trainer.proto b/src/proto/trainer.proto new file mode 100644 index 00000000000..b45384b84bb --- /dev/null +++ b/src/proto/trainer.proto @@ -0,0 +1,38 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +syntax = "proto3"; + +package lbann_data; + +message Trainer { + string name = 3; + + int64 block_size = 50; + int64 procs_per_trainer = 51; + int64 num_gpus = 53; //has no effect + int64 num_parallel_readers = 100; +} diff --git a/src/proto/training_algorithm.proto b/src/proto/training_algorithm.proto new file mode 100644 index 00000000000..df2ca5162ea --- /dev/null +++ b/src/proto/training_algorithm.proto @@ -0,0 +1,34 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +syntax = "proto3"; + +package lbann_data; + +message TrainingAlgorithm { + string type = 1; + string name = 3; +} diff --git a/src/trainers/CMakeLists.txt b/src/trainers/CMakeLists.txt new file mode 100644 index 00000000000..69c37b0c9fa --- /dev/null +++ b/src/trainers/CMakeLists.txt @@ -0,0 +1,7 @@ +# Add the source files for this directory +set_full_path(THIS_DIR_SOURCES + trainer.cpp + ) + +# Propagate the files up the tree +set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) diff --git a/src/trainers/trainer.cpp b/src/trainers/trainer.cpp new file mode 100644 index 00000000000..4579213c8b0 --- /dev/null +++ b/src/trainers/trainer.cpp @@ -0,0 +1,216 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
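// [Editor's note] trainer.cpp below is the heart of this refactoring: a trainer owns
// a map keyed on (model*, execution_mode) whose values are unique_ptr<execution_context>,
// plus the I/O thread pool that previously lived on the model. A hedged sketch of the
// lookup-or-create pattern the definitions below provide (`t`, `m`, and `sgd` are an
// existing trainer, model, and sgd_training_algorithm, all assumed here):
void sketch_context_cache(trainer& t, model& m, sgd_training_algorithm& sgd) {
  auto key = t.check_and_build_execution_context(sgd, &m, execution_mode::training);
  auto& ctx = t.get_execution_context(key); // created on first use, cached afterwards
  (void) ctx;
  t.delete_execution_context(key);          // discard the cached state for this pair
}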
+//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/trainers/trainer.hpp" +#include "lbann/callbacks/callback.hpp" +//#include "lbann/callbacks/callback_save_model.hpp" +#include "lbann/io/persist.hpp" +#include "lbann/layers/io/input/generic_input_layer.hpp" +#include "lbann/layers/transform/dummy.hpp" +#include "lbann/layers/transform/split.hpp" +#include "lbann/layers/transform/evaluation.hpp" +#include "lbann/objective_functions/layer_term.hpp" +#include "lbann/metrics/layer_metric.hpp" +#include "lbann/utils/random.hpp" +#include "lbann/utils/omp_diagnostics.hpp" +#include "lbann/utils/description.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" +#include "lbann/training_algorithms/sgd_training_algorithm.hpp" +#include +#include +#include +#include +#include +#include + +#include "mpi.h" + +namespace lbann { + +//////////////////////////////////////////////////////////// +// Constructors and destructor +//////////////////////////////////////////////////////////// + +trainer::trainer(lbann_comm *comm) + : m_comm(comm), + m_io_thread_pool(), + m_background_io_allowed(true) { + + // Default trainer name + m_name = "trainer" + std::to_string(m_comm->get_trainer_rank()); +} + +trainer::trainer(const trainer& other) : + m_comm(other.m_comm), + m_background_io_allowed(other.m_background_io_allowed) { + + // Deep copies + // m_io_thread_pool = (other.m_io_thread_pool ? + // other.m_io_thread_pool->copy() : nullptr); +} + +trainer& trainer::operator=(const trainer& other) { + + // Shallow copies + m_comm = other.m_comm; + m_background_io_allowed = other.m_background_io_allowed; + + // Deep copies + // m_io_thread_pool = (other.m_io_thread_pool ? + // other.m_io_thread_pool->copy() : nullptr); + + return *this; +} + +trainer::~trainer() { +} + +//////////////////////////////////////////////////////////// +// Trainer specification +//////////////////////////////////////////////////////////// + +void trainer::set_name(std::string const& name) { + m_name = name; +} + +description trainer::get_description() const { + + // Construct description object + description desc(get_name()); + desc.add("Background I/O", m_background_io_allowed); + + // Result + return desc; + +} + +//////////////////////////////////////////////////////////// +// Setup +//////////////////////////////////////////////////////////// + +void trainer::setup(std::unique_ptr<thread_pool> io_thread_pool) { + // Setup I/O threads - set up before setting up the layers (input + // layer depends on having a properly initialized thread pool) + m_io_thread_pool = std::move(io_thread_pool); +} + +/// Check if there is already an execution context for the model in this mode, if not create one +trainer::execution_context_key_pair_t trainer::check_and_build_execution_context(training_algorithm& alg, + observer_ptr<model> model, + execution_mode mode) { + auto key = std::make_pair(model,mode); + if(m_model_execution_context.count(key) == 0) { + /// Create an execution context for each model and execution mode + std::unique_ptr<execution_context> context; + if(dynamic_cast<observer_ptr<sgd_training_algorithm>>(&alg) != nullptr) { + /// @todo BVE FIXME Figure out how to get a good mini-batch size + /// in here + context = make_unique<sgd_execution_context>(this, m_comm, mode, model->get_max_mini_batch_size()); + }else { + context = make_unique<execution_context>(this, m_comm, mode); + } + m_model_execution_context.emplace(key,std::move(context)); + } + return key; +} + +/// Check if there is already an execution context for the model in this mode, if not create one
+trainer::execution_context_key_pair_t trainer::check_and_build_execution_context(const execution_context& c, + model& model, + execution_mode mode) { + auto key = std::make_pair(&model, mode); + if(m_model_execution_context.count(key) == 0) { + std::unique_ptr<execution_context> context; + if(dynamic_cast<observer_ptr<const sgd_execution_context>>(&c) != nullptr) { + context = make_unique<sgd_execution_context>(this, m_comm, mode, model.get_max_mini_batch_size()); + }else { + context = make_unique<execution_context>(this, m_comm, mode); + } + m_model_execution_context.emplace(key,std::move(context)); + } + return key; +} + +execution_context& trainer::get_execution_context(observer_ptr<model> model, + execution_mode mode) { + auto key = std::make_pair(model,mode); + return get_execution_context(key); +} + +execution_context& trainer::get_execution_context(execution_context_key_pair_t key) { + if(m_model_execution_context.count(key) == 0) { + LBANN_ERROR("No execution context for this model / mode pair"); + } + return static_cast<execution_context&>(*(m_model_execution_context[key].get())); +} + +void trainer::delete_execution_context(execution_context_key_pair_t key) { + if(m_model_execution_context.count(key) == 0) { + LBANN_WARNING("Attempting to delete an invalid execution context for model=" + + (key.first)->get_name() + " / " + to_string(key.second)); + } + m_model_execution_context.erase(key); +} + +void trainer::for_each_execution_context(std::function<void(observer_ptr<execution_context>)> fn) { + for(auto&& c : m_model_execution_context) { + // auto&& model = c.first.first; + // auto&& mode = c.first.second; + auto&& context = c.second; + fn(context.get()); + } +} + + +//////////////////////////////////////////////////////////// +// Evaluation and training +//////////////////////////////////////////////////////////// +void trainer::apply(training_algorithm& alg, + observer_ptr<model> model, + execution_mode mode, + termination_criteria const& term_criteria) { + + auto key = check_and_build_execution_context(alg, model, mode); + + /// Apply the training algorithm to train the model + alg.apply(*(m_model_execution_context[key].get()), *model, mode, term_criteria); +} + +void trainer::train(observer_ptr<model> model, El::Int num_epochs, El::Int num_batches) { + auto sgd = make_unique<sgd_training_algorithm>(); + auto key = check_and_build_execution_context(*sgd.get(), model, execution_mode::training); + /// Apply the training algorithm to train the model + sgd.get()->train(static_cast<sgd_execution_context&>(*(m_model_execution_context[key].get())), *model, num_epochs, num_batches); +} + +void trainer::evaluate(observer_ptr<model> model, execution_mode mode, El::Int num_batches) { + auto sgd = make_unique<sgd_training_algorithm>(); + auto key = check_and_build_execution_context(*sgd.get(), model, mode); + /// Apply the training algorithm to evaluate the model + sgd.get()->evaluate(static_cast<sgd_execution_context&>(*(m_model_execution_context[key].get())), *model, mode, num_batches); +} + +} // namespace lbann diff --git a/src/training_algorithms/CMakeLists.txt b/src/training_algorithms/CMakeLists.txt new file mode 100644 index 00000000000..a7252c161e2 --- /dev/null +++ b/src/training_algorithms/CMakeLists.txt @@ -0,0 +1,7 @@ +# Add the source files for this directory +set_full_path(THIS_DIR_SOURCES + sgd_training_algorithm.cpp + ) + +# Propagate the files up the tree +set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) diff --git a/src/training_algorithms/sgd_training_algorithm.cpp b/src/training_algorithms/sgd_training_algorithm.cpp new file mode 100644 index 00000000000..72e4e6fd815 --- /dev/null +++ b/src/training_algorithms/sgd_training_algorithm.cpp @@ -0,0 +1,278 @@
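// [Editor's note] sgd_training_algorithm below re-hosts the old model::train/evaluate
// logic; the generic entry point is apply(context, model, mode, termination criteria).
// A sketch of driving it through the trainer, assuming sgd_termination_criteria is
// default-constructible with public num_epochs/num_steps fields, as its use in apply()
// below implies (`t` and `m` are an existing trainer and model):
void sketch_apply_with_criteria(trainer& t, model& m) {
  sgd_training_algorithm sgd;
  sgd_termination_criteria term;
  term.num_epochs = 5;   // stop after five epochs
  term.num_steps  = 0;   // no per-epoch mini-batch cap
  t.apply(sgd, &m, execution_mode::training, term);
}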
+//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2016, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/training_algorithms/sgd_training_algorithm.hpp" +#include "lbann/models/model.hpp" +#include "lbann/callbacks/callback.hpp" + +namespace lbann { + +//////////////////////////////////////////////////////////// +// Evaluation and training +//////////////////////////////////////////////////////////// + +void sgd_training_algorithm::apply(execution_context& context, + model& model, + execution_mode mode, + termination_criteria const& term_criteria) { + sgd_execution_context& sgd_context = static_cast<sgd_execution_context&>(context); + const sgd_termination_criteria& sgd_term = static_cast<const sgd_termination_criteria&>(term_criteria); + switch(mode) { + case execution_mode::training: + train(sgd_context, model, sgd_term.num_epochs, sgd_term.num_steps); + break; + case execution_mode::validation: + case execution_mode::testing: + case execution_mode::prediction: + evaluate(sgd_context, model, mode, sgd_term.num_steps); + break; + default: + LBANN_ERROR(std::string{} + "Illegal mode: " + to_string(mode)); + } +} + +void sgd_training_algorithm::train(sgd_execution_context& c, + model& model, + size_t num_epochs, + size_t num_batches) { + + // Initialize epoch + model.reset_mode(c, execution_mode::training); + + do_train_begin_cbs(model); + for (size_t epoch = c.get_epoch(); epoch < num_epochs; ++epoch) { + if (c.get_terminate_training()) { break; } + + // Initialize epoch + model.reset_mode(c, execution_mode::training); + model.reset_epoch_statistics(execution_mode::training); + do_epoch_begin_cbs(model); + + // Training iterations + if (num_batches > 0) { + for (size_t i = 0; i < num_batches; i++) { train_mini_batch(c, model); } + } else { + while (!train_mini_batch(c, model)) {} + } + + // Finalize epoch + c.inc_epoch(); + model.reconcile_weight_values(); + do_epoch_end_cbs(model); + + // Evaluate on validation set + auto key = c.get_trainer().check_and_build_execution_context(c, model, execution_mode::validation); + auto& evaluation_context = static_cast<sgd_execution_context&>(c.get_trainer().get_execution_context(key)); + evaluate(evaluation_context, model, execution_mode::validation); + } + do_train_end_cbs(model); +} + +//////////////////////////////////////////////////////////// +// Evaluation and training +//////////////////////////////////////////////////////////// + +bool sgd_training_algorithm::train_mini_batch(sgd_execution_context& c, + model& model) { + model.reset_mode(c,
+  do_batch_begin_cbs(model, execution_mode::training);
+
+  bool finished;
+
+#if defined(LBANN_HAVE_OMP_TASKLOOP)
+  LBANN_OMP_PARALLEL
+  {
+    #pragma omp single
+    {
+#endif
+  // Forward prop step
+  model.clear_gradients();
+  model.forward_prop(execution_mode::training);
+  // Result is not needed until the end of the mini-batch.
+  model.get_objective_function()->start_evaluation(execution_mode::training,
+                                                   c.get_current_mini_batch_size());
+
+  // Backward prop step
+  model.get_objective_function()->differentiate();
+  model.backward_prop();
+  model.get_objective_function()->compute_weight_regularization();
+
+  // Finish evaluation.
+  model.get_objective_function()->finish_evaluation(execution_mode::training,
+                                                    c.get_current_mini_batch_size());
+  model.evaluate_metrics(execution_mode::training,
+                         c.get_current_mini_batch_size());
+
+  // Update step
+  model.update_weights();
+  finished = model.update_layers();
+#if defined(LBANN_HAVE_OMP_TASKLOOP)
+    }
+  }
+#endif
+
+  c.inc_step();
+  do_batch_end_cbs(model, execution_mode::training);
+  return finished;
+}
+
+void sgd_training_algorithm::evaluate(sgd_execution_context& c,
+                                      model& model,
+                                      execution_mode mode,
+                                      size_t num_batches) {
+  // Return early if execution mode is invalid
+  if (!model.is_execution_mode_valid(mode)) return;
+  if (mode != execution_mode::validation
+      && mode != execution_mode::testing) {
+    std::stringstream err;
+    err << __FILE__ << " " << __LINE__ << " :: "
+        << "invalid execution mode for evaluation";
+    throw lbann_exception(err.str());
+  }
+
+  // Evaluate on all mini-batches
+  model.reset_epoch_statistics(mode);
+  model.reset_mode(c, mode);
+  do_evaluate_begin_cbs(model, mode);
+  if (num_batches > 0) {
+    for (size_t i = 0; i < num_batches; i++) { evaluate_mini_batch(c, model, mode); }
+  } else {
+    while (!evaluate_mini_batch(c, model, mode)) {}
+  }
+  c.inc_epoch();
+  do_evaluate_end_cbs(model, mode);
+}
+
+bool sgd_training_algorithm::evaluate_mini_batch(sgd_execution_context& c,
+                                                 model& model,
+                                                 execution_mode mode) {
+  model.reset_mode(c, mode);
+  do_batch_begin_cbs(model, mode);
+  model.forward_prop(mode);
+  model.get_objective_function()->start_evaluation(mode, c.get_current_mini_batch_size());
+  model.get_objective_function()->finish_evaluation(mode, c.get_current_mini_batch_size());
+  model.evaluate_metrics(mode, c.get_current_mini_batch_size());
+  const bool finished = model.update_layers();
+  c.inc_step();
+  do_batch_end_cbs(model, mode);
+  return finished;
+}
+
+////////////////////////////////////////////////////////////
+// Callbacks
+////////////////////////////////////////////////////////////
+
+void sgd_training_algorithm::do_train_begin_cbs(model& model) {
+  for (const auto& cb : model.get_callbacks()) {
+    cb->on_train_begin(&model);
+  }
+}
+
+void sgd_training_algorithm::do_train_end_cbs(model& model) {
+  for (const auto& cb : model.get_callbacks()) {
+    cb->on_train_end(&model);
+  }
+}
+
+void sgd_training_algorithm::do_evaluate_begin_cbs(model& model, execution_mode mode) {
+  for (const auto& cb : model.get_callbacks()) {
+    switch (mode) {
+    case execution_mode::validation:
+      cb->on_validation_begin(&model); break;
+    case execution_mode::testing:
+      cb->on_test_begin(&model); break;
+    default:
+      LBANN_ERROR("invalid execution mode");
+    }
+  }
+}
+
+void sgd_training_algorithm::do_evaluate_end_cbs(model& model, execution_mode mode) {
+  for (const auto& cb : model.get_callbacks()) {
+    switch (mode) {
+    case execution_mode::validation:
+      cb->on_validation_end(&model); break;
+    case execution_mode::testing:
+      cb->on_test_end(&model); break;
+    default:
+      LBANN_ERROR("invalid execution mode");
+    }
+  }
+}
+
+void sgd_training_algorithm::do_epoch_begin_cbs(model& model) {
+  for (const auto& cb : model.get_callbacks()) {
+    cb->on_epoch_begin(&model);
+  }
+}
+
+void sgd_training_algorithm::do_epoch_end_cbs(model& model) {
+  for (const auto& cb : model.get_callbacks()) {
+    cb->on_epoch_end(&model);
+  }
+}
+
+void sgd_training_algorithm::do_batch_begin_cbs(model& model, execution_mode mode) {
+  sgd_execution_context& c = static_cast<sgd_execution_context&>(model.get_execution_context());
+
+  for (const auto& cb : model.get_callbacks()) {
+    switch (mode) {
+    case execution_mode::training:
+      if (c.get_step() % cb->get_batch_interval() == 0) {
+        cb->on_batch_begin(&model);
+      }
+      break;
+    case execution_mode::validation:
+    case execution_mode::testing:
+      cb->on_batch_evaluate_begin(&model);
+      break;
+    default:
+      LBANN_ERROR("invalid execution mode");
+    }
+  }
+}
+
+void sgd_training_algorithm::do_batch_end_cbs(model& model, execution_mode mode) {
+  sgd_execution_context& c = static_cast<sgd_execution_context&>(model.get_execution_context());
+
+  for (const auto& cb : model.get_callbacks()) {
+    switch (mode) {
+    case execution_mode::training:
+      if (c.get_step() % cb->get_batch_interval() == 0) {
+        cb->on_batch_end(&model);
+      }
+      break;
+    case execution_mode::validation:
+    case execution_mode::testing:
+      cb->on_batch_evaluate_end(&model);
+      break;
+    default:
+      LBANN_ERROR("invalid execution mode");
+    }
+  }
+}
+
+} // namespace lbann
diff --git a/src/utils/graph.cpp b/src/utils/graph.cpp
index 1987676a0a8..f48c09a4467 100644
--- a/src/utils/graph.cpp
+++ b/src/utils/graph.cpp
@@ -36,7 +36,7 @@ namespace graph {
 
 void print(const std::set& nodes,
            const std::map>& edges,
-           std::ostream& os) {
+           std::ostream& os = std::cout) {
   for (const auto& node : nodes) {
     os << "node " << node << " neighbors :";
     for (const auto& neighbor : get_neighbors(node, edges)) {
diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp
index 214a70a36c4..6cad37dc1a9 100644
--- a/src/utils/lbann_library.cpp
+++ b/src/utils/lbann_library.cpp
@@ -29,17 +29,100 @@
 #include "lbann/proto/factories.hpp"
 #include "lbann/utils/omp_diagnostics.hpp"
 #include "lbann/utils/threads/thread_utils.hpp"
+#include "lbann/callbacks/checkpoint.hpp"
+#include "lbann/callbacks/dump_weights.hpp"
+#include "lbann/callbacks/save_model.hpp"
 
 #include 
 #include 
 
 namespace lbann {
 
+/// Construct a trainer that contains a lbann comm object and threadpool
+std::unique_ptr<trainer> construct_trainer(lbann_comm *comm,
+                                           lbann_data::Trainer* pb_trainer,
+                                           options *opts) {
+  bool master = comm->am_world_master();
+  try {
+    int procs_per_trainer = 0;
+    if(pb_trainer->procs_per_trainer() > 0) {
+      procs_per_trainer = pb_trainer->procs_per_trainer();
+    }
+    if (procs_per_trainer == 0) {
+      procs_per_trainer = comm->get_procs_in_world();
+    }
+
+    // Set up the communicator and split the grid if necessary
+    comm->split_trainers(procs_per_trainer);
+    if (pb_trainer->num_parallel_readers() > procs_per_trainer) {
+      pb_trainer->set_num_parallel_readers(procs_per_trainer);
+    }
+
+    // Adjust the number of parallel readers; this may be adjusted
+    // after calling split_trainers()
+    //    set_num_parallel_readers(*comm, pb);
+
+    // Initalize a per-trainer I/O thread pool
+    std::unique_ptr<thread_pool> io_thread_pool = construct_io_thread_pool(comm, opts);
+
+    // Setup I/O threads
+    auto io_threads_per_process = io_thread_pool->get_num_threads();
+    auto io_threads_offset = io_thread_pool->get_threads_offset();
+
+    // Set algorithmic blocksize
+    if (pb_trainer->block_size() == 0 and master) {
+      LBANN_ERROR("model does not provide a valid block size (", pb_trainer->block_size(), ")");
+    }
+    El::SetBlocksize(pb_trainer->block_size());
+
+    // Set up the communicator and get the grid based on the trainers' spec.
+    // We do not currently support splitting different trainers in different ways,
+    // as this implies different grids.
+    if (procs_per_trainer != comm->get_procs_per_trainer()) {
+      comm->split_trainers(procs_per_trainer);
+    }
+
+    // Display how the OpenMP threads are provisioned
+    // if (opts->has_string("print_affinity")) {
+    //   display_omp_setup();
+    // }
+
+    // User feedback
+    //    print_parameters(comm, pb);
+
+    // Initalize trainer
+    std::unique_ptr<trainer> trainer = proto::construct_trainer(comm, *pb_trainer);
+
+    trainer->setup(std::move(io_thread_pool));
+
+    if(opts->get_bool("disable_background_io_activity")) {
+      trainer->allow_background_io_activity(false);
+    }
+
+    // Report useful information
+    if (comm->am_world_master()) {
+      print_lbann_configuration(comm,
+                                io_threads_per_process,
+                                io_threads_offset);
+      std::cout << "\n"
+                << trainer->get_description()
+                << std::endl;
+    }
+
+    return trainer;
+
+  } catch (lbann_exception& e) {
+    El::mpi::Abort(El::mpi::COMM_WORLD, 1);
+  } catch (std::exception& e) {
+    El::ReportException(e);  // Elemental exceptions
+  }
+  return nullptr;
+}
+
 /// Setup I/O thread pool that is shared across all models
-std::unique_ptr<thread_pool> construct_io_thread_pool(lbann_comm *comm) {
+std::unique_ptr<thread_pool> construct_io_thread_pool(lbann_comm *comm, options *opts) {
   int num_io_threads = num_free_cores_per_process(comm);
 
-  options *opts = options::get();
   if(opts->has_int("num_io_threads")) {
     int requested_io_threads = opts->get_int("num_io_threads");
     if(requested_io_threads > 0 && requested_io_threads < num_io_threads) {
@@ -62,9 +145,11 @@ std::unique_ptr<thread_pool> construct_io_thread_pool(lbann_comm *comm) {
 
 std::unique_ptr<model> build_model_from_prototext(
   int argc, char **argv,
+  const lbann_data::Trainer* pb_trainer,
   lbann_data::LbannPB &pb,
   lbann_comm *comm,
-  std::shared_ptr<thread_pool> io_thread_pool,
+  options *opts,
+  thread_pool& io_thread_pool,
   bool first_model) {
 
   int random_seed = lbann_default_random_seed;
@@ -74,36 +159,21 @@ std::unique_ptr<model> build_model_from_prototext(
   }
   std::ostringstream err;
 
-  options *opts = options::get();
-
-  // Optionally over-ride some values in prototext
-  get_cmdline_overrides(*comm, pb);
 
   lbann_data::Model *pb_model = pb.mutable_model();
 
-  // Adjust the number of parallel readers; this may be adjusted
-  // after calling split_trainers()
-  set_num_parallel_readers(*comm, pb);
-
   // Check to see if the model wants to reduce the I/O parallelism
-  if(pb_model->serialize_io() && io_thread_pool->get_num_threads() != 1) {
+  if(pb_model->serialize_io() && io_thread_pool.get_num_threads() != 1) {
     if(master) {
       std::cout << "Model " << pb_model->name() << " serialized the I/O threads" << std::endl;
     }
-    io_thread_pool->relaunch_pinned_threads(1);
+    io_thread_pool.relaunch_pinned_threads(1);
   }
 
-  // Setup I/O threads
-  auto io_threads_per_process = io_thread_pool->get_num_threads();
-  auto io_threads_offset = io_thread_pool->get_threads_offset();
-
-  // Set algorithmic blocksize
-  if (pb_model->block_size() == 0 and master) {
-    err << "model does not provide a valid block size (" << pb_model->block_size() << ")";
-    LBANN_ERROR(err.str());
-  }
-  El::SetBlocksize(pb_model->block_size());
+  // Get I/O thread details
+  auto io_threads_per_process = io_thread_pool.get_num_threads();
+
+  /// @todo BVE FIXME should this be in the trainer
   // Change random seed if needed.
   if (pb_model->random_seed() > 0) {
     random_seed = pb_model->random_seed();
@@ -111,21 +181,6 @@ std::unique_ptr<model> build_model_from_prototext(
     init_random(random_seed);
     init_data_seq_random(random_seed);
   }
-  // Set up the communicator and get the grid based on the first model's spec.
-  // We do not currently support splitting different models in different ways,
-  // as this implies different grids.
-  int procs_per_trainer = pb_model->procs_per_trainer();
-  if (procs_per_trainer == 0) {
-    procs_per_trainer = comm->get_procs_in_world();
-  }
-  if (first_model) {
-    comm->split_trainers(procs_per_trainer);
-    if (pb_model->num_parallel_readers() > procs_per_trainer) {
-      pb_model->set_num_parallel_readers(procs_per_trainer);
-    }
-  } else if (procs_per_trainer != comm->get_procs_per_trainer()) {
-    LBANN_ERROR("Model prototexts requesting different procs per model is not supported");
-  }
 
   // Initialize models differently if needed.
 #ifndef LBANN_DETERMINISTIC
@@ -148,11 +203,6 @@ std::unique_ptr<model> build_model_from_prototext(
   // from the cmd line) and various other info
   save_session(*comm, argc, argv, pb);
 
-  // Report useful information
-  if (master) {
-    print_lbann_configuration(pb_model, comm, io_threads_per_process, io_threads_offset);
-  }
-
   // Display how the OpenMP threads are provisioned
   if (opts->has_string("print_affinity")) {
     display_omp_setup();
@@ -170,6 +220,11 @@ std::unique_ptr<model> build_model_from_prototext(
     is_shared_testing_data_reader = opts->get_bool("share_testing_data_readers");
   }
   init_data_readers(comm, pb, data_readers, is_shared_training_data_reader, is_shared_testing_data_reader);
+  /// Setup the data readers with the I/O thread pool
+  for(auto&& dr: data_readers) {
+    dr.second->setup(io_threads_per_process, &io_thread_pool);
+    dr.second->set_rank(comm->get_rank_in_trainer());
+  }
 
   // hack to prevent all data readers from loading identical data; instead,
   // share a single copy. See data_reader_jag_conduit_hdf5 for example
@@ -185,14 +240,42 @@ std::unique_ptr<model> build_model_from_prototext(
   print_parameters(*comm, pb);
 
   // Initalize model
-  auto ret_model =
-    proto::construct_model(comm, data_readers, pb.optimizer(), pb.model());
-  ret_model->setup(std::move(io_thread_pool));
-
-  if(opts->get_bool("disable_background_io_activity")) {
-    ret_model->allow_background_io_activity(false);
+  std::unique_ptr<model> ret_model = proto::construct_model(comm,
+                                                            data_readers,
+                                                            pb.optimizer(),
+                                                            pb.trainer(),
+                                                            pb.model());
+
+  // If the checkpoint directory has been overridden reset it before
+  // setting up the model
+  if (opts->has_string("ckpt_dir")) {
+    for (auto&& c : ret_model->get_callbacks()) {
+      {
+        auto* cb = dynamic_cast(c);
+        if(cb != nullptr) {
+          cb->set_checkpoint_dir(opts->get_string("ckpt_dir"));
+          std::cout << "Setting the checkpoint directory to " << cb->get_checkpoint_dir() << std::endl;
+        }
+      }
+      {
+        auto* cb = dynamic_cast(c);
+        if(cb != nullptr) {
+          cb->set_target_dir(opts->get_string("ckpt_dir"));
+          std::cout << "Setting the dump weights directory to " << cb->get_target_dir() << std::endl;
+        }
+      }
+      {
+        auto* cb = dynamic_cast(c);
+        if(cb != nullptr) {
+          cb->set_target_dir(opts->get_string("ckpt_dir"));
+          std::cout << "Setting the dump weights directory to " << cb->get_target_dir() << std::endl;
+        }
+      }
+    }
   }
+  ret_model->setup();
+
   if (opts->get_bool("use_data_store") || opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache")) {
     if (master) {
       std::cout << "\nUSING DATA STORE!\n\n";
@@ -223,7 +306,7 @@ std::unique_ptr<model> build_model_from_prototext(
   return ret_model;
 }
 
-void print_lbann_configuration(lbann_data::Model *pb_model, lbann_comm *comm, int io_threads_per_process, int io_threads_offset) {
+void print_lbann_configuration(lbann_comm *comm, int io_threads_per_process, int io_threads_offset) {
   // Report hardware settings
   std::cout << "Hardware properties (for master process)" << std::endl
             << "  Processes on node          : " << comm->get_procs_per_node() << std::endl
@@ -268,22 +351,6 @@ void print_lbann_configuration(lbann_data::Model *pb_model, lbann_comm *comm, in
 #else
   std::cout << "NOT detected" << std::endl;
 #endif // HYDROGEN_HAVE_CUB
-  std::cout << std::endl;
-
-  // Report device settings
-  std::cout << "GPU settings" << std::endl;
-  bool disable_cuda = pb_model->disable_cuda();
-#ifndef LBANN_HAS_GPU
-  disable_cuda = true;
-#endif // LBANN_HAS_GPU
-  std::cout << "  CUDA         : "
-            << (disable_cuda ? "disabled" : "enabled") << std::endl;
-  std::cout << "  cuDNN        : ";
-#ifdef LBANN_HAS_CUDNN
-  std::cout << (disable_cuda ? "disabled" : "enabled") << std::endl;
-#else
-  std::cout << "disabled" << std::endl;
-#endif // LBANN_HAS_CUDNN
   const auto* env = std::getenv("MV2_USE_CUDA");
   std::cout << "  MV2_USE_CUDA : " << (env != nullptr ?
env : "") << std::endl; std::cout << std::endl; @@ -301,10 +368,9 @@ void print_lbann_configuration(lbann_data::Model *pb_model, lbann_comm *comm, in // Report model settings const auto& grid = comm->get_trainer_grid(); - int procs_per_trainer = pb_model->procs_per_trainer(); - std::cout << "Model settings" << std::endl - << " Models : " << comm->get_num_trainers() << std::endl - << " Processes per trainer : " << procs_per_trainer << std::endl + std::cout << "Trainer settings" << std::endl + << " Trainers : " << comm->get_num_trainers() << std::endl + << " Processes per trainer : " << comm->get_procs_per_trainer() << std::endl << " Grid dimensions : " << grid.Height() << " x " << grid.Width() << std::endl; std::cout << std::endl; } diff --git a/src/utils/random.cpp b/src/utils/random.cpp index 799bf06d43e..4c53952f0c0 100644 --- a/src/utils/random.cpp +++ b/src/utils/random.cpp @@ -100,7 +100,7 @@ fast_rng_gen& get_fast_io_generator() { return ::fast_io_generator; } -bool save_rng_to_checkpoint_shared(persist& p, const lbann_comm* comm) { +bool save_rng_to_checkpoint(persist& p, const lbann_comm* comm) { std::string dirname = std::string(p.m_checkpoint_dir) + "/rng_state"; makedir(dirname.c_str()); std::string rng_name; @@ -108,12 +108,16 @@ bool save_rng_to_checkpoint_shared(persist& p, const lbann_comm* comm) { /// @todo - Note that the RNG with thread local data is not correct rng_name = dirname + "/rng_seq_generator"; std::ofstream rng_seq(rng_name); + if(!rng_seq) { LBANN_ERROR("Failed to open ", rng_name); } rng_seq << ::data_seq_generator; + rng_seq.close(); #ifdef LBANN_SET_EL_RNG rng_name = dirname + "/EL_generator"; std::ofstream rng_EL(rng_name); + if(!rng_EL) { LBANN_ERROR("Failed to open ", rng_name); } rng_EL << El::Generator(); + rng_EL.close(); #endif std::string rank_in_world; @@ -126,38 +130,50 @@ bool save_rng_to_checkpoint_shared(persist& p, const lbann_comm* comm) { /// @todo - Note that the RNG with thread local data is not correct rng_name = dirname + "/rng_io_generator_" + rank_in_world; std::ofstream rng_io(rng_name); + if(!rng_io) { LBANN_ERROR("Failed to open ", rng_name); } rng_io << ::io_generator; + rng_io.close(); /// @todo - Note that the RNG with thread local data is not correct rng_name = dirname + "/rng_fast_io_generator_" + rank_in_world; std::ofstream rng_fast_io(rng_name); + if(!rng_fast_io) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast_io << ::fast_io_generator; + rng_fast_io.close(); #ifdef _OPENMP #pragma omp parallel private(rng_name) { rng_name = dirname + "/rng_generator_" + rank_in_world + "_" + std::to_string(omp_get_thread_num()); std::ofstream rng(rng_name); + if(!rng) { LBANN_ERROR("Failed to open ", rng_name); } rng << ::generator; + rng.close(); rng_name = dirname + "/rng_fast_generator_" + rank_in_world + "_" + std::to_string(omp_get_thread_num()); std::ofstream rng_fast(rng_name); + if(!rng_fast) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast << ::fast_generator; + rng_fast.close(); } #else rng_name = dirname + "/rng_generator_" + rank_in_world; std::ofstream rng(rng_name); + if(!rng) { LBANN_ERROR("Failed to open ", rng_name); } rng << ::generator; + rng.close(); rng_name = dirname + "/rng_fast_generator_" + rank_in_world; std::ofstream rng_fast(rng_name); + if(!rng_fast) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast << ::fast_generator; + rng_fast.close(); #endif return true; } -bool load_rng_from_checkpoint_shared(persist& p, const lbann_comm* comm) { +bool load_rng_from_checkpoint(persist& p, const 
lbann_comm* comm) { std::string dirname = std::string(p.m_checkpoint_dir) + "/rng_state"; std::string rng_name; @@ -165,11 +181,13 @@ bool load_rng_from_checkpoint_shared(persist& p, const lbann_comm* comm) { /// @todo - Note that the RNG with thread local data is not correct rng_name = dirname + "/rng_seq_generator"; std::ifstream rng_seq(rng_name); + if(!rng_seq) { LBANN_ERROR("Failed to open ", rng_name); } rng_seq >> ::data_seq_generator; #ifdef LBANN_SET_EL_RNG rng_name = dirname + "/EL_generator"; std::ifstream rng_EL(rng_name); + if(!rng_EL) { LBANN_ERROR("Failed to open ", rng_name); } rng_EL >> El::Generator(); #endif @@ -183,11 +201,13 @@ bool load_rng_from_checkpoint_shared(persist& p, const lbann_comm* comm) { /// @todo - Note that the RNG with thread local data is not correct rng_name = dirname + "/rng_io_generator_" + rank_in_world; std::ifstream rng_io(rng_name); + if(!rng_io) { LBANN_ERROR("Failed to open ", rng_name); } rng_io >> ::io_generator; /// @todo - Note that the RNG with thread local data is not correct rng_name = dirname + "/rng_fast_io_generator_" + rank_in_world; std::ifstream rng_fast_io(rng_name); + if(!rng_fast_io) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast_io >> ::fast_io_generator; #ifdef _OPENMP @@ -195,19 +215,23 @@ bool load_rng_from_checkpoint_shared(persist& p, const lbann_comm* comm) { { rng_name = dirname + "/rng_generator_" + rank_in_world + "_" + std::to_string(omp_get_thread_num()); std::ifstream rng(rng_name); + if(!rng) { LBANN_ERROR("Failed to open ", rng_name); } rng >> ::generator; rng_name = dirname + "/rng_fast_generator_" + rank_in_world + "_" + std::to_string(omp_get_thread_num()); std::ifstream rng_fast(rng_name); + if(!rng_fast) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast >> ::fast_generator; } #else rng_name = dirname + "/rng_generator_" + rank_in_world; std::ifstream rng(rng_name); + if(!rng) { LBANN_ERROR("Failed to open ", rng_name); } rng >> ::generator; rng_name = dirname + "/rng_fast_generator_" + rank_in_world; std::ifstream rng_fast(rng_name); + if(!rng_fast) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast >> ::fast_generator; } #endif diff --git a/src/weights/weights.cpp b/src/weights/weights.cpp index 21d0d81612d..4723b35b249 100644 --- a/src/weights/weights.cpp +++ b/src/weights/weights.cpp @@ -428,7 +428,7 @@ bool weights::save_to_checkpoint_shared(lbann::persist& p) // write weights using persist call -- uses Elemental's write function. p.write_distmat(persist_type::model, l_name, m_values.get()); // if saving training state, also write out state of optimizer - if (m_optimizer != nullptr && (p.get_cb_type() == callback_type::batch || p.get_cb_type() == callback_type::epoch)) { + if (m_optimizer != nullptr) { m_optimizer->save_to_checkpoint_shared(p, m_name); } From 43cd7e90ebde6e3692bfd109ba9de186d7eb6c90 Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Sun, 8 Sep 2019 08:30:27 -0700 Subject: [PATCH 282/634] Small update to OSX build instructions (#1177) * Minor documentation update in spack instructions and add warning about system headers --- docs/build_osx.rst | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/build_osx.rst b/docs/build_osx.rst index 12470de11fa..a563174f7c6 100644 --- a/docs/build_osx.rst +++ b/docs/build_osx.rst @@ -14,6 +14,15 @@ Building LBANN on OS X Getting Started -------------------- +.. warning:: If using OSX 10.14 or newer, be sure that + :bash:`/usr/include` has been restored. 
In version 10.14, + this may be accomplished by installing + :bash:`/Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg`. + If this package is not available, it's possible command + line tools have not been installed; do so by executing + :bash:`xcode-select --install`. + + .. _osx-setup-spack: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -136,7 +145,7 @@ Building & Installing LBANN as a developer -D LBANN_SB_FWD_LBANN_OpenMP_CXX_FLAGS="-fopenmp=libomp" \ -D LBANN_SB_FWD_LBANN_OpenMP_omp_LIBRARY=/usr/local/opt/llvm/lib/libomp.dylib \ \ - -D CMAKE_CXX_COMPILER=$(which clang) \ + -D CMAKE_CXX_COMPILER=$(which clang++) \ -D CMAKE_C_COMPILER=$(which clang) \ ${LBANN_HOME}/superbuild From bfadaa3ac5258d1a18e304c2e21fb193e2bd767e Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 9 Sep 2019 13:10:12 -0700 Subject: [PATCH 283/634] set a fixed version of hydrogen in the superbuild --- superbuild/hydrogen/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbuild/hydrogen/CMakeLists.txt b/superbuild/hydrogen/CMakeLists.txt index caf5748de3c..8ebd4d0dbad 100644 --- a/superbuild/hydrogen/CMakeLists.txt +++ b/superbuild/hydrogen/CMakeLists.txt @@ -109,7 +109,7 @@ else () endif () # ... then the tag. -set(HYDROGEN_TAG "hydrogen" +set(HYDROGEN_TAG "v1.1.0-1" CACHE STRING "The git tag or hash to checkout for Hydrogen") if (HYDROGEN_CUSTOM_SOURCE_DIR) From 2dd32362f91fa249a53a534c98987eaff7be755d Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 9 Sep 2019 13:47:10 -0700 Subject: [PATCH 284/634] Custom batch scripts in Python frontend (#1188) * Implement Python class to write custom batch scripts Currently only supports Slurm. * Support custom LSF batch scripts in Python frontend Not yet tested. * Specify experiment structure in Python lbann.run function * Implement Python function to create batch script generator Automatically detects scheduler if needed. * Implement Python function to create batch script generator, with LC-specific defaults and optimizations * Documenting batch script managers in Python frontend * Fix typos in LSF batch script manager * Python batch script managers can accept commands as lists of args * Python batch script manager does not overwrite script file by default --- python/lbann/contrib/lc/launcher.py | 138 +++++++++---- python/lbann/launcher/__init__.py | 233 ++++++++++++++++------ python/lbann/launcher/batch_script.py | 193 ++++++++++++++++++ python/lbann/launcher/lsf.py | 272 ++++++++++++++------------ python/lbann/launcher/slurm.py | 269 +++++++++++++------------ 5 files changed, 758 insertions(+), 347 deletions(-) create mode 100644 python/lbann/launcher/batch_script.py diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index 3dd9718a176..b6055ba8dbe 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -1,30 +1,104 @@ +import os, os.path from lbann import lbann_exe from lbann.contrib.lc.systems import * import lbann.launcher +from lbann.util import make_iterable def run(trainer, model, data_reader, optimizer, - lbann_exe=lbann_exe(), - lbann_args='', experiment_dir=None, nodes=1, procs_per_node=procs_per_node(), - time_limit=60, + time_limit=None, scheduler=scheduler(), job_name='lbann', system=system(), partition=partition(), account=account(), reservation=None, - launcher_args='', + launcher_args=[], + lbann_args=[], environment={}, setup_only=False): - """Run LBANN experiment with LC-specific optimizations. 
+ """Run LBANN with LC-specific optimizations. - This is a convenience wrapper around the `lbann.launcher.run` - function, with defaults and optimizations for LC systems. + This is intended to match the behavior of `lbann.launcher.run`, + with defaults and optimizations for LC systems. """ + # Create batch script generator + script = make_batch_script(work_dir=experiment_dir, + nodes=nodes, + procs_per_node=procs_per_node, + time_limit=time_limit, + scheduler=scheduler, + job_name=job_name, + partition=partition, + account=account, + reservation=reservation, + launcher_args=launcher_args, + environment=environment) + + # Check for an existing job allocation + has_allocation = False + if isinstance(script, lbann.launcher.slurm.SlurmBatchScript): + has_allocation = 'SLURM_JOB_ID' in os.environ + if isinstance(script, lbann.launcher.lsf.LSFBatchScript): + has_allocation = 'LSB_JOBID' in os.environ + + # Batch script prints start time + script.add_command('date | sed "s/^/Started at /"') + + # Batch script invokes LBANN + lbann_command = [lbann.lbann_exe()] + lbann_command.extend(make_iterable(lbann_args)) + prototext_file = os.path.join(script.work_dir, 'experiment.prototext') + lbann.proto.save_prototext(prototext_file, + trainer=trainer, + model=model, + data_reader=data_reader, + optimizer=optimizer) + lbann_command.append('--prototext={}'.format(prototext_file)) + script.add_parallel_command(lbann_command) + + # Batch script prints finish time + script.add_command('date | sed "s/^/Finished at /"') + + # Write, run, or submit batch script + status = 0 + if setup_only: + script.write() + elif has_allocation: + status = script.run() + else: + status = script.submit() + return status + +def make_batch_script(script_file=None, + work_dir=None, + nodes=1, + procs_per_node=procs_per_node(), + time_limit=None, + scheduler=scheduler(), + job_name='lbann', + system=system(), + partition=partition(), + account=account(), + reservation=None, + launcher_args=[], + environment={}): + """Construct batch script manager with LC-specific optimizations. + + This is intended to match the behavior of + `lbann.launcher.make_batch_script`, with defaults and + optimizations for LC systems. + + """ + + # Create shallow copies of input arguments + launcher_args = list(make_iterable(launcher_args)) + environment = environment.copy() + # Setup GPU bindings # Note: Hydrogen processes take ownership of the GPU indices that # matches their node communicator ranks. mpibind assigns each rank @@ -32,14 +106,15 @@ def run(trainer, model, data_reader, optimizer, # may touch the wrong GPUs in the process of figuring out GPU # ownership, so an exclusive GPU compute mode causes problems. if scheduler == 'slurm' and has_gpu(system): - launcher_args += ' --mpibind=off --nvidia_compute_mode=default' + launcher_args.extend(['--mpibind=off', + '--nvidia_compute_mode=default']) - # Deal with Pascal's strange hardware topology + # Deal with Pascal's hardware topology # Note: Both GPUs on a Pascal node are on the same socket, so we # only use cores on that socket. if system == 'pascal' and procs_per_node == 2: if scheduler == 'slurm': - launcher_args += ' --cpu_bind=mask_cpu:0x000001ff,0x0003fe00' + launcher_args.append('--cpu_bind=mask_cpu:0x000001ff,0x0003fe00') environment['OMP_NUM_THREADS'] = 8 environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = 2 @@ -49,36 +124,31 @@ def run(trainer, model, data_reader, optimizer, # present in MVAPICH2-2.3rc2. 
environment['MV2_USE_RDMA_CM'] = 0 - # Hacked bugfix for MPI_Sendrecv in MVAPICH2-2.3 - # Note: MPI_Sendrecv produces incorrect output under certain - # circumstances. This bug is not present in MVAPICH2-2.2 or - # MVAPICH2-2.3.1. - environment['MV2_USE_LAZY_MEM_UNREGISTER'] = 0 - # Magic default arguments to jsrun/etc. # Note: Pack processes using ten cores for each, with 40 cores total, and # all four GPUs visible to each process. if system in ('sierra', 'lassen'): if scheduler == 'lsf': - launcher_args += ' -d packed -b "packed:10" -r 1 -c 40 -g 4' + launcher_args.extend([ + '--launch_distribution packed', + '--bind "packed:10"', + '--rs_per_host 1', + '--cpu_per_rs 40', + '--gpu_per_rs 4' + ]) environment['OMP_NUM_THREADS'] = 4 # Deal with topology mis-identification on Sierra/Lassen. environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = 2 - # Run LBANN - return lbann.launcher.run(trainer, model, data_reader, optimizer, - lbann_exe=lbann_exe, - lbann_args=lbann_args, - experiment_dir=experiment_dir, - nodes=nodes, - procs_per_node=procs_per_node, - time_limit=time_limit, - scheduler=scheduler, - job_name=job_name, - system=system, - partition=partition, - account=account, - reservation=reservation, - launcher_args=launcher_args, - environment=environment, - setup_only=setup_only) + return lbann.launcher.make_batch_script(script_file=script_file, + work_dir=work_dir, + nodes=nodes, + procs_per_node=procs_per_node, + time_limit=time_limit, + scheduler=scheduler, + job_name=job_name, + partition=partition, + account=account, + reservation=reservation, + launcher_args=launcher_args, + environment=environment) diff --git a/python/lbann/launcher/__init__.py b/python/lbann/launcher/__init__.py index f8acbf4e720..335d46508ad 100644 --- a/python/lbann/launcher/__init__.py +++ b/python/lbann/launcher/__init__.py @@ -1,32 +1,31 @@ -import os -import os.path import datetime +import os, os.path +import subprocess import lbann import lbann.proto import lbann.launcher.slurm import lbann.launcher.lsf +from lbann.util import make_iterable # ============================================== # Run experiments # ============================================== def run(trainer, model, data_reader, optimizer, - lbann_exe=lbann.lbann_exe(), - lbann_args='', experiment_dir=None, nodes=1, procs_per_node=1, - time_limit=60, - scheduler='slurm', + time_limit=None, + scheduler=None, job_name='lbann', - system=None, partition=None, account=None, reservation=None, - launcher_args='', + launcher_args=[], + lbann_args=[], environment={}, setup_only=False): - """Run LBANN experiment. + """Run LBANN. This is intended to interface with job schedulers on HPC clusters. It will either submit a batch job (if on a login node) @@ -39,15 +38,11 @@ def run(trainer, model, data_reader, optimizer, can be set with the environment variable `LBANN_EXPERIMENT_DIR`. Args: - trainer (lbann.Trainer): LBANN Trainer (resource manager). - model (lbann.model.Model or lbann_pb2.Model): Neural network + trainer (lbann.Trainer): LBANN trainer. + model (lbann.Model): Neural network model. + data_reader (lbann.reader_pb2.DataReader): Data reader. + optimizer (lbann.model.Optimizer): Default optimizer for model. - data_reader (lbann_pb2.DataReader): Data reader. - optimizer (lbann.model.Optimizer or lbann_pb2.Optimizer): - Default optimizer for model. - lbann_exe (str, optional): LBANN executable. - lbann_args (str, optional): Command-line arguments to LBANN - executable. experiment_dir (str, optional): Experiment directory. 
nodes (int, optional): Number of compute nodes. procs_per_node (int, optional): Number of processes per compute @@ -55,12 +50,13 @@ def run(trainer, model, data_reader, optimizer, time_limit (int, optional): Job time limit, in minutes. scheduler (str, optional): Job scheduler. job_name (str, optional): Batch job name. - system (str, optional): Target system. partition (str, optional): Scheduler partition. account (str, optional): Scheduler account. reservation (str, optional): Scheduler reservation name. launcher_args (str, optional): Command-line arguments to launcher. + lbann_args (str, optional): Command-line arguments to LBANN + executable. environment (dict of {str: str}, optional): Environment variables. setup_only (bool, optional): If true, the experiment is not @@ -74,60 +70,171 @@ def run(trainer, model, data_reader, optimizer, """ - # Construct experiment directory if needed - if not experiment_dir: - if 'LBANN_EXPERIMENT_DIR' in os.environ: - experiment_dir = os.environ['LBANN_EXPERIMENT_DIR'] - else: - experiment_dir = os.path.join(os.getcwd()) - timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') - experiment_dir = os.path.join(experiment_dir, - '{}_{}'.format(timestamp, job_name)) - i = 1 - while os.path.lexists(experiment_dir): - i += 1 - experiment_dir = os.path.join( - os.path.dirname(experiment_dir), - '{}_{}_{}'.format(timestamp, job_name, i)) - experiment_dir = os.path.abspath(experiment_dir) - os.makedirs(experiment_dir, exist_ok=True) + # Create batch script generator + script = make_batch_script(work_dir=experiment_dir, + nodes=nodes, + procs_per_node=procs_per_node, + time_limit=time_limit, + scheduler=scheduler, + job_name=job_name, + partition=partition, + account=account, + reservation=reservation, + launcher_args=launcher_args, + environment=environment) - # Create experiment prototext file - prototext_file = os.path.join(experiment_dir, 'experiment.prototext') + # Check for an existing job allocation + has_allocation = False + if isinstance(script, lbann.launcher.slurm.SlurmBatchScript): + has_allocation = 'SLURM_JOB_ID' in os.environ + if isinstance(script, lbann.launcher.lsf.LSFBatchScript): + has_allocation = 'LSB_JOBID' in os.environ + + # Batch script prints start time + script.add_command('date | sed "s/^/Started at /"') + + # Batch script invokes LBANN + lbann_command = [lbann.lbann_exe()] + lbann_command.extend(make_iterable(lbann_args)) + prototext_file = os.path.join(script.work_dir, 'experiment.prototext') lbann.proto.save_prototext(prototext_file, trainer=trainer, model=model, data_reader=data_reader, optimizer=optimizer) - lbann_args += ' --prototext=' + prototext_file + lbann_command.append('--prototext={}'.format(prototext_file)) + script.add_parallel_command(lbann_command) + + # Batch script prints finish time + script.add_command('date | sed "s/^/Finished at /"') + + # Write, run, or submit batch script + status = 0 + if setup_only: + script.write() + elif has_allocation: + status = script.run() + else: + status = script.submit() + return status + +def make_batch_script(script_file=None, + work_dir=None, + nodes=1, + procs_per_node=1, + time_limit=None, + scheduler=None, + job_name='lbann', + partition=None, + account=None, + reservation=None, + launcher_args=[], + environment={}): + """Construct batch script manager. + + Attempts to detect a scheduler if one is not provided. + + If a working directory is not provided, a timestamped directory is + created (by default in the current working directory). 
The + location of autogenerated working directories can be set with the + environment variable `LBANN_EXPERIMENT_DIR`. + + Args: + script_file (str): Script file. + work_dir (str, optional): Working directory + (default: autogenerated, timestamped directory). + nodes (int, optional): Number of compute nodes + (default: 1). + procs_per_node (int, optional): Parallel processes per + compute node (default: 1). + time_limit (int, optional): Job time limit, in minutes. + scheduler (str, optional): Job scheduler + (default: autodetected scheduler). + job_name (str, optional): Job name (default: 'lbann'). + partition (str, optional): Scheduler partition. + account (str, optional): Scheduler account. + reservation (str, optional): Scheduler advance reservation. + launcher_args (`Iterable` of `str`, optional): + Command-line arguments to parallel command launcher. + environment (`dict` of `{str: str}`, optional): Environment + variables. + + Returns: + `lbann.launcher.batch_script.BatchScript` + + """ + + # Try detecting job scheduler if not provided + if not scheduler: + try: + subprocess.call(['sbatch', '--version'], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL) + scheduler = 'slurm' + except: + pass + if not scheduler: + try: + subprocess.call(['bsub', '-V'], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL) + scheduler = 'lsf' + except: + pass + if not scheduler: + raise RuntimeError('could not detect job scheduler') - # Run experiment + # Create work directory if not provided + if not work_dir: + if 'LBANN_EXPERIMENT_DIR' in os.environ: + work_dir = os.environ['LBANN_EXPERIMENT_DIR'] + else: + work_dir = os.path.join(os.getcwd()) + timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + work_dir = os.path.join(work_dir, + '{}_{}'.format(timestamp, job_name)) + i = 1 + while os.path.lexists(work_dir): + i += 1 + work_dir = os.path.join( + os.path.dirname(work_dir), + '{}_{}_{}'.format(timestamp, job_name, i)) + work_dir = os.path.realpath(work_dir) + os.makedirs(work_dir, exist_ok=True) + + # Create batch script manager + if not script_file: + script_file = os.path.join(work_dir, 'batch.sh') + script = None if scheduler.lower() in ('slurm', 'srun', 'sbatch'): - return slurm.run(experiment_dir=experiment_dir, - command='{} {}'.format(lbann_exe, lbann_args), - nodes=nodes, - procs_per_node=procs_per_node, - time_limit=time_limit, - job_name=job_name, - partition=partition, - account=account, - reservation=reservation, - srun_args=launcher_args, - environment=environment, - setup_only=setup_only) + script = lbann.launcher.slurm.SlurmBatchScript( + script_file=script_file, + work_dir=work_dir, + nodes=nodes, + procs_per_node=procs_per_node, + time_limit=time_limit, + job_name=job_name, + partition=partition, + account=account, + launcher_args=launcher_args) elif scheduler.lower() in ('lsf', 'jsrun', 'bsub'): - return lsf.run(experiment_dir=experiment_dir, - command='{} {}'.format(lbann_exe, lbann_args), - nodes=nodes, - procs_per_node=procs_per_node, - time_limit=time_limit, - job_name=job_name, - partition=partition, - account=account, - reservation=reservation, - jsrun_args=launcher_args, - environment=environment, - setup_only=setup_only) + script = lbann.launcher.lsf.LSFBatchScript( + script_file=script_file, + work_dir=work_dir, + nodes=nodes, + procs_per_node=procs_per_node, + time_limit=time_limit, + job_name=job_name, + partition=partition, + account=account, + reservation=reservation, + launcher_args=launcher_args) else: raise RuntimeError('unsupported job 
scheduler ({})' .format(scheduler)) + + # Set batch script environment + for variable, value in environment.items(): + script.add_command('export {0}={1}'.format(variable, value)) + + return script diff --git a/python/lbann/launcher/batch_script.py b/python/lbann/launcher/batch_script.py new file mode 100644 index 00000000000..e649d62fd37 --- /dev/null +++ b/python/lbann/launcher/batch_script.py @@ -0,0 +1,193 @@ +import os +import os.path +import subprocess +from lbann.util import make_iterable + +class BatchScript: + """Utility class to write batch job scripts. + + This class manages a non-interactive script file that can be + submitted as a batch job to an HPC job scheduler. A script is made + up of two parts: the header configures the job and the body + contains the actual commands to be executed. + + This particular class is not fully implemented. Derived classes + for specific job schedulers should implement + `add_parallel_command` and `submit`, maintaining the same API. + + """ + + def __init__(self, + script_file=None, + work_dir=os.getcwd(), + interpreter='/bin/bash'): + """Construct batch script manager. + + Args: + script_file (str): Script file. + work_dir (str, optional): Working directory + (default: current working directory). + interpreter (str, optional): Script interpreter + (default: /bin/bash). + + """ + + # Lines in script are stored as lists of strings + self.header = [] + self.body = [] + + # Construct file paths + self.work_dir = os.path.realpath(work_dir) + self.script_file = script_file + if not self.script_file: + self.script_file = os.path.join(self.work_dir, 'batch.sh') + self.script_file = os.path.realpath(self.script_file) + self.out_log_file = os.path.join(self.work_dir, 'out.log') + self.err_log_file = os.path.join(self.work_dir, 'err.log') + + # Shebang line + if interpreter: + self.add_header_line('#!{}'.format(interpreter)) + + def add_header_line(self, line): + """Add line to script header. + + The header should specify configuration options for the job + scheduler, without containing executable commands. + + """ + self.header.append(line) + + def add_body_line(self, line): + """Add line to script body. + + The body should contain the script's executable commands. + + """ + self.body.append(line) + + def add_command(self, command): + """Add executable command to script. + + Args: + command (`str` or `Iterable` of `str`s): Program + invocation or sequence of program arguments. + + """ + self.add_body_line(' '.join(make_iterable(command))) + + def add_parallel_command(self, + command, + launcher=None, + launcher_args=None, + nodes=None, + procs_per_node=None): + """Add command to be executed in parallel. + + The command is executed via a launcher, e.g. `mpirun`. + Parallel processes are distributed evenly amongst the compute + nodes. + + Args: + command (`str` or `Iterable` of `str`s): Command to be + executed in parallel. + launcher (str, optional): Parallel command launcher, + `mpirun`. + launcher_args (`Iterable` of `str`s, optional): + Command-line arguments to parallel command launcher. + nodes (int, optional): Number of compute nodes. + procs_per_node (int, optional): Number of parallel + processes per compute node. + + """ + raise NotImplementedError( + 'classes that inherit from `BatchScript` should implement ' + '`add_parallel_command` to use a specific job scheduler' + ) + + def write(self, overwrite=False): + """Write script to file. + + The working directory is created if needed. 
+ + Args: + overwrite (bool): Whether to overwrite script file if it + already exists (default: false). + + """ + + # Create directories if needed + os.makedirs(self.work_dir, exist_ok=True) + os.makedirs(os.path.dirname(self.script_file), exist_ok=True) + + # Check if script file already exists + if not overwrite and os.path.isfile(self.script_file): + raise RuntimeError('Attempted to write batch script to {}, ' + 'but it already exists' + .format(self.script_file)) + + # Write script to file + with open(self.script_file, 'w') as f: + for line in self.header: + f.write('{}\n'.format(line)) + f.write('\n') + for line in self.body: + f.write('{}\n'.format(line)) + + # Make script file executable + os.chmod(self.script_file, 0o755) + + def run(self, overwrite=False): + """Execute the script. + + The script is executed directly and is _not_ submitted to a + job scheduler. The script file is written before being + executed. + + Args: + overwrite (bool): Whether to overwrite script file if it + already exists (default: false). + + Returns: + int: Exit status from executing script. + + """ + + # Construct script file + self.write(overwrite=overwrite) + + # Run script and pipe output to log files + run_proc = subprocess.Popen(self.script_file, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=self.work_dir) + out_proc = subprocess.Popen(['tee', self.out_log_file], + stdin=run_proc.stdout, + cwd=self.work_dir) + err_proc = subprocess.Popen(['tee', self.err_log_file], + stdin=run_proc.stderr, + cwd=self.work_dir) + run_proc.stdout.close() + run_proc.stderr.close() + run_proc.wait() + out_proc.wait() + err_proc.wait() + return run_proc.returncode + + def submit(self, overwrite=False): + """Submit batch job to job scheduler. + + The script file is written before being submitted. + + Args: + overwrite (bool): Whether to overwrite script file if it + already exists (default: false). + + Returns: + int: Exit status from submitting to job scheduler. + + """ + raise NotImplementedError( + 'classes that inherit from `BatchScript` should implement ' + '`submit` to use a specific job scheduler' + ) diff --git a/python/lbann/launcher/lsf.py b/python/lbann/launcher/lsf.py index c105944dbd1..a4846f966af 100644 --- a/python/lbann/launcher/lsf.py +++ b/python/lbann/launcher/lsf.py @@ -1,136 +1,158 @@ """Utility functions for LSF.""" import os -import os.path import subprocess from lbann.util import make_iterable +from .batch_script import BatchScript -def run(command, - experiment_dir=os.getcwd(), - nodes=1, - procs_per_node=1, - time_limit=-1, - job_name=None, - partition=None, - account=None, - reservation=None, - jsrun_args='', - environment={}, - setup_only=False): - """Run executable with LSF. - - Creates an LSF batch script in the experiment directory. If a LSF - job allocation is detected, the script is run directly. Otherwise, - the script is submitted to bsub. - - Args: - command (str): Program to run under LSF, i.e. an executable and - its command-line arguments. - experiment_dir (str, optional): Experiment directory. - nodes (int, optional): Number of compute nodes. - procs_per_node (int, optional): Number of processes per compute - node. - time_limit (int, optional): Job time limit, in minutes. A - negative value implies the system-default time limit. - job_name (str, optional): Batch job name. - partition (str, optional): Scheduler partition. - account (str, optional): Scheduler account. - reservation (str, optional): Scheduler reservation name. 
- jsrun_args (str, optional): Command-line arguments to jsrun. - environment (dict of {str: str}, optional): Environment - variables. - setup_only (bool, optional): If true, the experiment is not - run after the batch script is created. - - Returns: - int: Exit status from LSF. This is really only meaningful if - the script is run on an existing node allocation. If a - batch job is submitted, LSF will probably return 0 - trivially. - - """ - # Check for an existing job allocation. - # Note: Settings for existing allocations take precedence. - has_allocation = 'LSB_JOBID' in os.environ - if has_allocation: - job_name = os.environ['LSB_JOBNAME'] - partition = os.environ['LSB_QUEUE'] - # LSF does not provide a way to get the account via env vars. - time_limit = -1 - - # Experiment directory - experiment_dir = os.path.abspath(experiment_dir) - os.makedirs(experiment_dir, exist_ok=True) - batch_file = os.path.join(experiment_dir, 'batch.sh') - out_file = os.path.join(experiment_dir, 'out.log') - err_file = os.path.join(experiment_dir, 'err.log') - nodes_file = os.path.join(experiment_dir, 'nodes.txt') - - # Create batch script. - s = '#!/bin/sh\n' - if job_name: - s += '#BSUB -J {}\n'.format(job_name) - s += '#BSUB -nnodes {}\n'.format(nodes) - if partition: - s += '#BSUB -q {}\n'.format(partition) - if account: - s += '#BSUB -G {}\n'.format(account) - else: - raise ValueError('LSF requires an account') - if reservation: - s += '#BSUB -U {}\n'.format(reservation) - s += '#BSUB -cwd {}\n'.format(experiment_dir) - s += '#BSUB -o {}\n'.format(out_file) - s += '#BSUB -e {}\n'.format(err_file) - if time_limit >= 0: - s += '#BSUB -W {}\n'.format(time_limit) - - # Set environment variables. - if environment: - s += '\n# ==== Environment ====\n' - for variable, value in environment.items(): - s += 'export {}={}\n'.format(variable, value) - - # Time and node list. - s += '\n# ==== Useful info ====\n' - s += 'date\n' - s += 'jsrun -n {} -a 1 -r 1 hostname > {}\n'.format(nodes, nodes_file) - s += 'sort --unique --output={0} {0}\n'.format(nodes_file) - - # Run experiment. - s += '\n# ==== Experiment ====\n' - for cmd in make_iterable(command): - s += 'jsrun -n {} -a {} {} {}\n'.format( - nodes, procs_per_node, jsrun_args, cmd) - - with open(batch_file, 'w') as f: - f.write(s) - - # Make batch script executable. - os.chmod(batch_file, 0o755) - - # Launch if needed. - if setup_only: - return 0 - else: - if has_allocation: - run_proc = subprocess.Popen(['sh', batch_file], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=experiment_dir) - else: - # bsub requires the batch script be read from its stdin. - run_proc = subprocess.Popen('bsub < {}'.format(batch_file), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=experiment_dir, - shell=True) - out_proc = subprocess.Popen(['tee', out_file], +class LSFBatchScript(BatchScript): + """Utility class to write LSF batch scripts.""" + + def __init__(self, + script_file=None, + work_dir=os.getcwd(), + nodes=1, + procs_per_node=1, + time_limit=None, + job_name=None, + partition=None, + account=None, + reservation=None, + launcher='jsrun', + launcher_args=[], + interpreter='/bin/bash'): + """Construct LSF batch script manager. + + Args: + script_file (str): Script file. + work_dir (str, optional): Working directory + (default: current working directory). + nodes (int, optional): Number of compute nodes + (default: 1). + procs_per_node (int, optional): Parallel processes per + compute node (default: 1). 
+ time_limit (int, optional): Job time limit, in minutes + (default: none). + job_name (str, optional): Job name (default: none). + partition (str, optional): Scheduler partition + (default: none). + account (str, optional): Scheduler account + (default: none). + reservation (str, optional): Scheduler advance reservation + (default: none). + launcher (str, optional): Parallel command launcher + (default: jsrun). + launcher_args (`Iterable` of `str`, optional): + Command-line arguments to jsrun. + interpreter (str, optional): Script interpreter + (default: /bin/bash). + + """ + super().__init__(script_file=script_file, + work_dir=work_dir, + interpreter=interpreter) + self.nodes = nodes + self.procs_per_node = procs_per_node + self.launcher = launcher + self.launcher_args = launcher_args + + # Configure header with LSF job options + self._construct_header(job_name=job_name, + nodes=self.nodes, + time_limit=time_limit, + partition=partition, + account=account, + reservation=reservation) + + def _construct_header(self, + job_name=None, + nodes=1, + time_limit=None, + partition=None, + account=None, + reservation=None): + """Construct script header with options for bsub.""" + if job_name: + self.add_header_line('#BSUB -J {}'.format(job_name)) + if partition: + self.add_header_line('#BSUB -q {}'.format(partition)) + self.add_header_line('#BSUB -nnodes {}'.format(nodes)) + if time_limit: + hours, minutes = divmod(int(time_limit), 60) + self.add_header_line('#BSUB -W {}:{:02d}'.format(hours, minutes)) + self.add_header_line('#BSUB -cwd {}'.format(self.work_dir)) + self.add_header_line('#BSUB -o {}'.format(self.out_log_file)) + self.add_header_line('#BSUB -e {}'.format(self.err_log_file)) + if account: + self.add_header_line('#BSUB -G {}'.format(account)) + if reservation: + self.add_header_line('#BSUB -U {}'.format(reservation)) + + def add_parallel_command(self, + command, + launcher=None, + launcher_args=None, + nodes=None, + procs_per_node=None): + """Add command to be executed in parallel. + + The command is launched with jsrun. Parallel processes are + distributed evenly amongst the compute nodes. + + Args: + command (`str` or `Iterable` of `str`s): Command to be + executed in parallel. + launcher (str, optional): jsrun executable. + launcher_args (`Iterable` of `str`s, optional): + Command-line arguments to jsrun. + nodes (int, optional): Number of compute nodes. + procs_per_node (int, optional): Number of parallel + processes per compute node. + + """ + if launcher is None: + launcher = self.launcher + if launcher_args is None: + launcher_args = self.launcher_args + if nodes is None: + nodes = self.nodes + if procs_per_node is None: + procs_per_node = self.procs_per_node + args = [launcher] + args.extend(make_iterable(launcher_args)) + args.append('-n {}'.format(nodes)) + args.append('--tasks_per_rs {}'.format(procs_per_node)) + args.extend(make_iterable(command)) + self.add_command(args) + + def submit(self, overwrite=False): + """Submit batch job to LSF with bsub. + + The script file is written before being submitted. + + Args: + overwrite (bool): Whether to overwrite script file if it + already exists (default: false). + + Returns: + int: Exit status from bsub. 
+ + """ + + # Construct script file + self.write(overwrite=overwrite) + + # Submit batch script and pipe output to log files + run_proc = subprocess.Popen(['bsub', self.script_file], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=self.work_dir) + out_proc = subprocess.Popen(['tee', self.out_log_file], stdin=run_proc.stdout, - cwd=experiment_dir) - err_proc = subprocess.Popen(['tee', err_file], + cwd=self.work_dir) + err_proc = subprocess.Popen(['tee', self.err_log_file], stdin=run_proc.stderr, - cwd=experiment_dir) + cwd=self.work_dir) run_proc.stdout.close() run_proc.stderr.close() run_proc.wait() diff --git a/python/lbann/launcher/slurm.py b/python/lbann/launcher/slurm.py index e7253adc5df..bb509e2c55a 100644 --- a/python/lbann/launcher/slurm.py +++ b/python/lbann/launcher/slurm.py @@ -1,136 +1,155 @@ """Utility functions for Slurm.""" -import os, os.path + +import os import subprocess from lbann.util import make_iterable +from .batch_script import BatchScript + +class SlurmBatchScript(BatchScript): + """Utility class to write Slurm batch scripts.""" + + def __init__(self, + script_file=None, + work_dir=os.getcwd(), + nodes=1, + procs_per_node=1, + time_limit=None, + job_name=None, + partition=None, + account=None, + launcher='srun', + launcher_args=[], + interpreter='/bin/bash'): + """Construct Slurm batch script manager. + + Args: + script_file (str): Script file. + work_dir (str, optional): Working directory + (default: current working directory). + nodes (int, optional): Number of compute nodes + (default: 1). + procs_per_node (int, optional): Parallel processes per + compute node (default: 1). + time_limit (int, optional): Job time limit, in minutes + (default: none). + job_name (str, optional): Job name (default: none). + partition (str, optional): Scheduler partition + (default: none). + account (str, optional): Scheduler account + (default: none). + launcher (str, optional): Parallel command launcher + (default: srun). + launcher_args (`Iterable` of `str`, optional): + Command-line arguments to srun. + interpreter (str, optional): Script interpreter + (default: /bin/bash). + + """ + super().__init__(script_file=script_file, + work_dir=work_dir, + interpreter=interpreter) + self.nodes = nodes + self.procs_per_node = procs_per_node + self.launcher = launcher + self.launcher_args = launcher_args + + # Configure header with Slurm job options + self._construct_header(job_name=job_name, + nodes=self.nodes, + time_limit=time_limit, + partition=partition, + account=account) -def run(command, - experiment_dir=os.getcwd(), - nodes=1, - procs_per_node=1, - time_limit=-1, - job_name=None, - partition=None, - account=None, - reservation=None, - srun_args='', - environment={}, - setup_only=False): - """Run executable with Slurm. - - Creates a Slurm batch script in the experiment directory. If a - Slurm job allocation is detected, the script is run - directly. Otherwise, the script is submitted to sbatch. - - Args: - command (str): Program to run under Slurm, i.e. an executable - and its command-line arguments. - experiment_dir (str, optional): Experiment directory. - nodes (int, optional): Number of compute nodes. - procs_per_node (int, optional): Number of processes per compute - node. - time_limit (int, optional): Job time limit, in minutes. A - negative value implies the system-default time limit. - job_name (str, optional): Batch job name. - partition (str, optional): Scheduler partition. - account (str, optional): Scheduler account. 
- reservation (str, optional): Scheduler reservation name. - srun_args (str, optional): Command-line arguments to srun. - environment (dict of {str: str}, optional): Environment - variables. - setup_only (bool, optional): If true, the experiment is not - run after the batch script is created. - - Returns: - int: Exit status from Slurm. This is really only meaningful if - the script is run on an existing node allocation. If a - batch job is submitted, Slurm will probably return 0 - trivially. - - """ - - # Check for an existing job allocation from Slurm - # Note: Settings for current job allocation take precedence - has_allocation = 'SLURM_JOB_ID' in os.environ - if has_allocation: - job_name = os.environ['SLURM_JOB_NAME'] - partition = os.environ['SLURM_JOB_PARTITION'] - account = os.environ['SLURM_JOB_ACCOUNT'] - time_limit = -1 - - # Experiment directory - experiment_dir = os.path.abspath(experiment_dir) - os.makedirs(experiment_dir, exist_ok=True) - batch_file = os.path.join(experiment_dir, 'batch.sh') - out_file = os.path.join(experiment_dir, 'out.log') - err_file = os.path.join(experiment_dir, 'err.log') - nodes_file = os.path.join(experiment_dir, 'nodes.txt') - - # Write batch script - with open(batch_file, 'w') as f: - f.write('#!/bin/sh\n') - - # Slurm job settings + def _construct_header(self, + job_name=None, + nodes=1, + time_limit=None, + partition=None, + account=None): + """Construct script header with options for sbatch.""" if job_name: - f.write('#SBATCH --job-name={}\n'.format(job_name)) - f.write('#SBATCH --nodes={}\n'.format(nodes)) - if partition: - f.write('#SBATCH --partition={}\n'.format(partition)) - if account: - f.write('#SBATCH --account={}\n'.format(account)) - if reservation: - raise ValueError('Slurm reservations not supported') - f.write('#SBATCH --workdir={}\n'.format(experiment_dir)) - f.write('#SBATCH --output={}\n'.format(out_file)) - f.write('#SBATCH --error={}\n'.format(err_file)) - if time_limit >= 0: + self.add_header_line('#SBATCH --job-name={}'.format(job_name)) + self.add_header_line('#SBATCH --nodes={}'.format(nodes)) + if time_limit is not None: + time_limit = max(time_limit, 0) seconds = int((time_limit % 1) * 60) hours, minutes = divmod(int(time_limit), 60) days, hours = divmod(hours, 24) - f.write('#SBATCH --time={}-{:02d}:{:02d}:{:02d}\n' - .format(days, hours, minutes, seconds)) - - # Set environment - if environment: - f.write('\n') - f.write('# ==== Environment ====\n') - for variable, value in environment.items(): - f.write('export {}={}\n'.format(variable, value)) - - # Display time and node list - f.write('\n') - f.write('# ==== Useful info ====\n') - f.write('date\n') - f.write('srun --nodes={0} --ntasks={0} hostname > {1}\n' - .format(nodes, nodes_file)) - f.write('sort --unique --output={0} {0}\n'.format(nodes_file)) - - # Run experiment - f.write('\n') - f.write('# ==== Experiment ====\n') - for cmd in make_iterable(command): - f.write('srun {} --nodes={} --ntasks={} {}\n' - .format(srun_args, nodes, nodes * procs_per_node, - cmd)) - - # Make batch script executable - os.chmod(batch_file, 0o755) - - # Launch job if needed - # Note: Pipes output to log files - if setup_only: - return 0 - else: - run_exe = 'sh' if has_allocation else 'sbatch' - run_proc = subprocess.Popen([run_exe, batch_file], - stdout = subprocess.PIPE, - stderr = subprocess.PIPE, - cwd = experiment_dir) - out_proc = subprocess.Popen(['tee', out_file], - stdin = run_proc.stdout, - cwd = experiment_dir) - err_proc = subprocess.Popen(['tee', err_file], - stdin = 
run_proc.stderr, - cwd = experiment_dir) + self.add_header_line('#SBATCH --time={}-{:02d}:{:02d}:{:02d}' + .format(days, hours, minutes, seconds)) + self.add_header_line('#SBATCH --workdir={}'.format(self.work_dir)) + self.add_header_line('#SBATCH --output={}'.format(self.out_log_file)) + self.add_header_line('#SBATCH --error={}'.format(self.err_log_file)) + if partition: + self.add_header_line('#SBATCH --partition={}'.format(partition)) + if account: + self.add_header_line('#SBATCH --account={}'.format(account)) + + def add_parallel_command(self, + command, + launcher=None, + launcher_args=None, + nodes=None, + procs_per_node=None): + """Add command to be executed in parallel. + + The command is launched with srun. Parallel processes are + distributed evenly amongst the compute nodes. + + Args: + command (`str` or `Iterable` of `str`s): Command to be + executed in parallel. + launcher (str, optional): srun executable. + launcher_args (`Iterable` of `str`s, optional): + Command-line arguments to srun. + nodes (int, optional): Number of compute nodes. + procs_per_node (int, optional): Number of parallel + processes per compute node. + + """ + if launcher is None: + launcher = self.launcher + if launcher_args is None: + launcher_args = self.launcher_args + if nodes is None: + nodes = self.nodes + if procs_per_node is None: + procs_per_node = self.procs_per_node + args = [launcher] + args.extend(make_iterable(launcher_args)) + args.append('--nodes={}'.format(nodes)) + args.append('--ntasks={}'.format(nodes * procs_per_node)) + args.extend(make_iterable(command)) + self.add_command(args) + + def submit(self, overwrite=False): + """Submit batch job to Slurm with sbatch. + + The script file is written before being submitted. + + Args: + overwrite (bool): Whether to overwrite script file if it + already exists (default: false). + + Returns: + int: Exit status from sbatch. + + """ + + # Construct script file + self.write(overwrite=overwrite) + + # Submit batch script and pipe output to log files + run_proc = subprocess.Popen(['sbatch', self.script_file], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=self.work_dir) + out_proc = subprocess.Popen(['tee', self.out_log_file], + stdin=run_proc.stdout, + cwd=self.work_dir) + err_proc = subprocess.Popen(['tee', self.err_log_file], + stdin=run_proc.stderr, + cwd=self.work_dir) run_proc.stdout.close() run_proc.stderr.close() run_proc.wait() From 0b2244e2cdbd7db0588d02efff5d8fd9ec1dcecd Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 9 Sep 2019 13:50:05 -0700 Subject: [PATCH 285/634] Enable dataset splitting for Python data reader (#1192) Closes #1125. 
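
Why the one-line fix below works: the validation reader is constructed as a
fresh python_reader, and the added assignment copies the fully configured
training reader onto it through downcast pointers. Copy-assigning through the
generic_data_reader base pointers would invoke the base class's assignment
operator and slice away the Python-specific state (sample function, number of
samples function, sample dims function). A minimal standalone sketch of that
idiom, using toy stand-in types rather than LBANN's actual classes:

    #include <cassert>
    #include <string>

    // Toy stand-ins for generic_data_reader / python_reader (illustration only).
    struct base_reader {
      double validation_percent = 0.0;
      virtual ~base_reader() = default;
    };

    struct toy_python_reader : base_reader {
      std::string sample_fn;  // derived state that base assignment would slice off
    };

    int main() {
      toy_python_reader train;
      train.validation_percent = 0.1;
      train.sample_fn = "get_sample";

      base_reader *reader = &train;
      base_reader *reader_validation = new toy_python_reader;

      // Copy the whole derived object through downcast pointers,
      // mirroring the shape of the fix below.
      (*(toy_python_reader *)reader_validation) = (*(toy_python_reader *)reader);

      assert(((toy_python_reader *)reader_validation)->sample_fn == "get_sample");
      delete reader_validation;
      return 0;
    }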
--- src/proto/proto_common.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 4352a065354..57da047532b 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -487,6 +487,7 @@ void init_data_readers( params.sample_function(), params.num_samples_function(), params.sample_dims_function()); + (*(python_reader *)reader_validation) = (*(python_reader *)reader); #else LBANN_ERROR("attempted to construct Python data reader, " "but LBANN is not built with Python/C API"); From 1edba6919c4209d71c66caaa5b1c8444f91e5e4a Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 9 Sep 2019 15:02:07 -0700 Subject: [PATCH 286/634] Setup data readers after model has been constructed --- src/utils/lbann_library.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp index 6cad37dc1a9..69631579b35 100644 --- a/src/utils/lbann_library.cpp +++ b/src/utils/lbann_library.cpp @@ -220,11 +220,6 @@ std::unique_ptr build_model_from_prototext( is_shared_testing_data_reader = opts->get_bool("share_testing_data_readers"); } init_data_readers(comm, pb, data_readers, is_shared_training_data_reader, is_shared_testing_data_reader); - /// Setup the data readers with the I/O thread pool - for(auto&& dr: data_readers) { - dr.second->setup(io_threads_per_process, &io_thread_pool); - dr.second->set_rank(comm->get_rank_in_trainer()); - } // hack to prevent all data readers from loading identical data; instead, // share a single copy. See data_reader_jag_conduit_hdf5 for example @@ -274,6 +269,13 @@ std::unique_ptr build_model_from_prototext( } } + // Setup data readers + for(auto&& dr: data_readers) { + dr.second->setup(io_threads_per_process, &io_thread_pool); + dr.second->set_rank(comm->get_rank_in_trainer()); + } + + // Setup models ret_model->setup(); if (opts->get_bool("use_data_store") || opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache")) { From 1bce60a41aa60339ae30e858505ecb1235489f3e Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Tue, 10 Sep 2019 05:34:20 -0700 Subject: [PATCH 287/634] Updated list of publications. (#1232) --- docs/publications.rst | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/docs/publications.rst b/docs/publications.rst index c2bb25449cd..0926625be23 100644 --- a/docs/publications.rst +++ b/docs/publications.rst @@ -3,10 +3,31 @@ Papers, Presentations, and Posters Publications about or related to using LBANN: ++ Nikoli Dryden, Naoya Maruyama, Tom Benson, Tim Moon, Marc Snir, + Brian Van Essen. `"Channel and Filter Parallelism for Large-Scale + CNN Training"`_, to appear in *International Conference for High + Performance Computing, Networking, Storage and Analysis (SC'19)*, 2019. + ++ Sam Ade Jacobs, Brian Van Essen, Tim Moon, Jae Seung Yeom, David + Hysom, Brian Spears, Rushil Anirudh, Jayaraman Thiagaranjan, Shusen + Liu, Jim Gaffney, Peer-Timo Bremer, Tom Benson, Peter Robinson, and + Luc Peterson, `"Parallelizing Training of Deep Generative Models on + Massive Scientific Datasets"`_, to appear in * Proceedings of Cluster + Computing*, 2019 + ++ Shusen Liu, Di Wang, Dan Maljovec, Rushil Anirudh, + Jayaraman J. Thiagarajan, Sam Ade Jacobs, Brian C. Van Essen, David + Hysom, Jae-Seung Yeom, Jim Gaffney, Luc Peterson, Peter B. Robinson, + Harsh Bhatia, Valerio Pascucci, Brian K. Spears, Peer-Timo Bremer. 
+ `"Scalable Topological Data Analysis and Visualization for + Evaluating Data-Driven Models in Scientific Applications" + `_, to appear in *IEEE Transactions + on Visualization and Computer Graphics*, 2019 + + Nikoli Dryden, Naoya Maruyama, Tom Benson, Tim Moon, Marc Snir, Brian Van Essen. `"Improving Strong-Scaling of CNN Training by Exploiting Finer-Grained Parallelism" - `_, to appear in *IEEE + `_, in *Proceedings of IEEE International Parallel & Distributed Processing Symposium*, 2019. + `IPDPS'19 `_ From afd90724631c373258244bc23ca760c28b977915 Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Tue, 10 Sep 2019 05:46:11 -0700 Subject: [PATCH 288/634] Fixed formatting for publications. (#1233) * Updated list of publications. * Fixed formatting. --- docs/publications.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/publications.rst b/docs/publications.rst index 0926625be23..aa22a58b0e6 100644 --- a/docs/publications.rst +++ b/docs/publications.rst @@ -4,15 +4,15 @@ Papers, Presentations, and Posters Publications about or related to using LBANN: + Nikoli Dryden, Naoya Maruyama, Tom Benson, Tim Moon, Marc Snir, - Brian Van Essen. `"Channel and Filter Parallelism for Large-Scale - CNN Training"`_, to appear in *International Conference for High + Brian Van Essen. "Channel and Filter Parallelism for Large-Scale + CNN Training", to appear in *International Conference for High Performance Computing, Networking, Storage and Analysis (SC'19)*, 2019. + Sam Ade Jacobs, Brian Van Essen, Tim Moon, Jae Seung Yeom, David Hysom, Brian Spears, Rushil Anirudh, Jayaraman Thiagaranjan, Shusen Liu, Jim Gaffney, Peer-Timo Bremer, Tom Benson, Peter Robinson, and - Luc Peterson, `"Parallelizing Training of Deep Generative Models on - Massive Scientific Datasets"`_, to appear in * Proceedings of Cluster + Luc Peterson, "Parallelizing Training of Deep Generative Models on + Massive Scientific Datasets", to appear in *Proceedings of Cluster Computing*, 2019 + Shusen Liu, Di Wang, Dan Maljovec, Rushil Anirudh, From fc3ce96bd70ce7e90b04b89b06e0629c7f246711 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Tue, 10 Sep 2019 10:19:13 -0700 Subject: [PATCH 289/634] Changed LBANN_ERRORs to variadic format --- src/data_store/data_store_conduit.cpp | 70 ++++++++++++++------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index be9c516b045..a40c7d3364c 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -44,6 +44,7 @@ namespace lbann { // Macro to throw an LBANN exception +#if 0 #undef LBANN_ERROR #define LBANN_ERROR(message) \ do { \ @@ -66,6 +67,7 @@ namespace lbann { } \ throw lbann::exception(ss_LBANN_ERROR.str()); \ } while (0) +#endif data_store_conduit::data_store_conduit( generic_data_reader *reader) : @@ -430,20 +432,20 @@ void data_store_conduit::error_check_compacted_node(const conduit::Node &nd, int if (m_compacted_sample_size == 0) { m_compacted_sample_size = nd.total_bytes_compact(); } else if (m_compacted_sample_size != nd.total_bytes_compact() && !m_node_sizes_vary) { - LBANN_ERROR("Conduit node being added data_id: " + std::to_string(data_id) - + " is not the same size as existing nodes in the data_store " - + std::to_string(m_compacted_sample_size) + " != " - + std::to_string(nd.total_bytes_compact()) - + " role: " + m_reader->get_role()); + LBANN_ERROR("Conduit node being added data_id: ", std::to_string(data_id), + " is not the same size as existing nodes in the data_store ", + std::to_string(m_compacted_sample_size), " != ", + std::to_string(nd.total_bytes_compact()), + " role: ", m_reader->get_role()); } if (!nd.is_contiguous()) { - LBANN_ERROR("m_data[" + std::to_string(data_id) + "] does not have a contiguous layout"); + LBANN_ERROR("m_data[", std::to_string(data_id), "] does not have a contiguous layout"); } if (nd.data_ptr() == nullptr) { - LBANN_ERROR("m_data[" + std::to_string(data_id) + "] does not have a valid data pointer"); + LBANN_ERROR("m_data[", std::to_string(data_id), "] does not have a valid data pointer"); } if (nd.contiguous_data_ptr() == nullptr) { - LBANN_ERROR("m_data[" + std::to_string(data_id) + "] does not have a valid contiguous data pointer"); + LBANN_ERROR("m_data[", std::to_string(data_id), "] does not have a valid contiguous data pointer"); } } @@ -454,7 +456,7 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool } m_mutex.lock(); if (already_have == false && m_data.find(data_id) != m_data.end()) { - LBANN_ERROR("duplicate data_id: " + std::to_string(data_id) + " in data_store_conduit::set_conduit_node"); + LBANN_ERROR("duplicate data_id: ", std::to_string(data_id), " in data_store_conduit::set_conduit_node"); } if (m_output) { @@ -463,7 +465,7 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool if (already_have && is_local_cache()) { if (m_data.find(data_id) == m_data.end()) { - LBANN_ERROR("you claim the passed node was obtained from this data_store, but the data_id (" + std::to_string(data_id) + ") doesn't exist in m_data"); + LBANN_ERROR("you claim the passed node was obtained from this data_store, but the data_id (", std::to_string(data_id), ") doesn't exist in m_data"); } m_mutex.unlock(); return; @@ -519,7 +521,7 @@ const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { if (is_local_cache()) { std::unordered_map::const_iterator t3 = m_data.find(data_id); if (t3 == m_data.end()) { - LBANN_ERROR("(local cache) failed to find data_id: " + std::to_string(data_id) + " in m_data; m_data.size: " + 
std::to_string(m_data.size())); + LBANN_ERROR("(local cache) failed to find data_id: ", std::to_string(data_id), " in m_data; m_data.size: ", std::to_string(m_data.size())); } return t3->second; } @@ -532,7 +534,7 @@ const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { if (t3 != m_data.end()) { return t3->second["data"]; } - LBANN_ERROR("failed to find data_id: " + std::to_string(data_id) + " in m_minibatch_data; m_minibatch_data.size: " + std::to_string(m_minibatch_data.size())+ " and also failed to find it in m_data; m_data.size: " + std::to_string(m_data.size()) + "; role: " + m_reader->get_role()); + LBANN_ERROR("failed to find data_id: ", std::to_string(data_id), " in m_minibatch_data; m_minibatch_data.size: ", std::to_string(m_minibatch_data.size()), " and also failed to find it in m_data; m_data.size: ", std::to_string(m_data.size()), "; role: ", m_reader->get_role()); if (m_output) { m_output << "failed to find data_id: " << data_id << " in m_minibatch_data; my m_minibatch_data indices: "; for (auto t : m_minibatch_data) { @@ -620,25 +622,25 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s const std::unordered_set &indices = m_indices_to_send[p]; for (auto index : indices) { if (m_data.find(index) == m_data.end()) { - LBANN_ERROR("failed to find data_id: " + std::to_string(index) + " to be sent to " + std::to_string(p) + " in m_data"); + LBANN_ERROR("failed to find data_id: ", std::to_string(index), " to be sent to ", std::to_string(p), " in m_data"); } const conduit::Node& n = m_data[index]; const El::byte *s = reinterpret_cast(n.data_ptr()); if(!n.is_contiguous()) { - LBANN_ERROR("data_id: " + std::to_string(index) + " does not have a contiguous layout"); + LBANN_ERROR("data_id: ", std::to_string(index), " does not have a contiguous layout"); } if(n.data_ptr() == nullptr) { - LBANN_ERROR("data_id: " + std::to_string(index) + " does not have a valid data pointer"); + LBANN_ERROR("data_id: ", std::to_string(index), " does not have a valid data pointer"); } if(n.contiguous_data_ptr() == nullptr) { - LBANN_ERROR("data_id: " + std::to_string(index) + " does not have a valid contiguous data pointer"); + LBANN_ERROR("data_id: ", std::to_string(index), " does not have a valid contiguous data pointer"); } size_t sz = m_compacted_sample_size; if (m_node_sizes_vary) { if (m_sample_sizes.find(index) == m_sample_sizes.end()) { - LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: " + std::to_string(index) + "; m_sample_sizes.size: " + std::to_string(m_sample_sizes.size())); + LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: ", std::to_string(index), "; m_sample_sizes.size: ", std::to_string(m_sample_sizes.size())); } sz = m_sample_sizes[index]; } @@ -653,7 +655,7 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s // sanity checks if (ss != m_send_requests.size()) { - LBANN_ERROR("ss != m_send_requests.size; ss: " + std::to_string(ss) + " m_send_requests.size: " + std::to_string(m_send_requests.size())); + LBANN_ERROR("ss != m_send_requests.size; ss: ", std::to_string(ss), " m_send_requests.size: ", std::to_string(m_send_requests.size())); } // start recvs for incoming data @@ -667,7 +669,7 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s int sz = m_compacted_sample_size; if (m_node_sizes_vary) { if (m_sample_sizes.find(index) == m_sample_sizes.end()) { - LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() 
for index: " + std::to_string(index) + "; m_sample_sizes.size(): " + std::to_string(m_sample_sizes.size()) + " role: " + m_reader->get_role() + " for index: " + std::to_string(sanity) + " of " + std::to_string(indices.size())); + LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: ", std::to_string(index), "; m_sample_sizes.size(): ", std::to_string(m_sample_sizes.size()), " role: ", m_reader->get_role(), " for index: ", std::to_string(sanity), " of ", std::to_string(indices.size())); } sz = m_sample_sizes[index]; } @@ -682,10 +684,10 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s // sanity checks if (ss != m_recv_buffer.size()) { - LBANN_ERROR("ss != m_recv_buffer.size; ss: " + std::to_string(ss) + " m_recv_buffer.size: " + std::to_string(m_recv_buffer.size())); + LBANN_ERROR("ss != m_recv_buffer.size; ss: ", std::to_string(ss), " m_recv_buffer.size: ", std::to_string(m_recv_buffer.size())); } if (m_recv_requests.size() != m_recv_buffer.size()) { - LBANN_ERROR("m_recv_requests.size != m_recv_buffer.size; m_recv_requests: " + std::to_string(m_recv_requests.size()) + " m_recv_buffer.size: " + std::to_string(m_recv_buffer.size())); + LBANN_ERROR("m_recv_requests.size != m_recv_buffer.size; m_recv_requests: ", std::to_string(m_recv_requests.size()), " m_recv_buffer.size: ", std::to_string(m_recv_buffer.size())); } // wait for all msgs to complete @@ -805,7 +807,7 @@ const conduit::Node & data_store_conduit::get_random_node(const std::string &fie conduit::Node & data_store_conduit::get_empty_node(int data_id) { if (m_data.find(data_id) != m_data.end()) { - LBANN_ERROR("we already have a node with data_id= " + std::to_string(data_id)); + LBANN_ERROR("we already have a node with data_id= ", std::to_string(data_id)); } return m_data[data_id]; } @@ -869,7 +871,7 @@ void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string std::stringstream s3(line); s3 >> line >> a_mem >> units; if (units != "kB") { - LBANN_ERROR("units is " + units + " but we only know how to handle kB; please contact Dave Hysom"); + LBANN_ERROR("units is ", units, " but we only know how to handle kB; please contact Dave Hysom"); } break; } @@ -889,7 +891,7 @@ void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string // get list of conduit files that I own, and compute my num_samples std::ifstream istr(sample_list_file); if (!istr.good()) { - LBANN_ERROR("failed to open " + sample_list_file + " for reading"); + LBANN_ERROR("failed to open ", sample_list_file, " for reading"); } std::string base_dir; @@ -934,7 +936,7 @@ void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string try { hdf5_file_hnd = conduit::relay::io::hdf5_open_file_for_read(base_dir + '/' + filename); } catch (conduit::Error const& e) { - LBANN_ERROR(" failed to open " + base_dir + '/' + filename + " for reading"); + LBANN_ERROR(" failed to open ", base_dir, '/', filename, " for reading"); } std::vector sample_names; try { @@ -948,7 +950,7 @@ void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string try { conduit::relay::io::hdf5_read(hdf5_file_hnd, key, useme); } catch (conduit::Error const& e) { - LBANN_ERROR("failed to read success flag for " + key); + LBANN_ERROR("failed to read success flag for ", key); } if (useme.to_int64() == 1) { got_one = true; @@ -956,7 +958,7 @@ void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string key = "/" + t; conduit::relay::io::hdf5_read(hdf5_file_hnd, key, 
useme);
     } catch (conduit::Error const& e) {
-      LBANN_ERROR("failed to load JAG sample: " + key);
+      LBANN_ERROR("failed to load JAG sample: ", key);
     }
     break;
   }
@@ -1045,7 +1047,7 @@ void data_store_conduit::exchange_sample_sizes() {
     m_comm->broadcast(k, other_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm());
     for (size_t i=0; i &file_si
     const std::string fn = m_reader->get_file_dir() + '/' + image_list[(*m_shuffled_indices)[h]].first;
     std::ifstream in(fn.c_str());
     if (!in) {
-      LBANN_ERROR("failed to open " + fn + " for reading; file_dir: " + m_reader->get_file_dir() + " fn: " + image_list[h].first + "; role: " + m_reader->get_role());
+      LBANN_ERROR("failed to open ", fn, " for reading; file_dir: ", m_reader->get_file_dir(), " fn: ", image_list[h].first, "; role: ", m_reader->get_role());
     }
     in.seekg(0, std::ios::end);
     my_image_sizes.push_back((*m_shuffled_indices)[h]);
@@ -1124,7 +1126,7 @@ void data_store_conduit::compute_image_offsets(std::unordered_map &s
   for (size_t p=0; p << msg.str() << "\n";
   }
   if (m_mem_seg_length >= avail_mem) {
-    LBANN_ERROR("insufficient available memory:\n" + msg.str());
+    LBANN_ERROR("insufficient available memory:\n", msg.str());
   }
   //need to ensure name is unique across all data readers
@@ -1181,7 +1183,7 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map
   }
   int v = ftruncate(shm_fd, size);
   if (v != 0) {
-    LBANN_ERROR("ftruncate failed for size: " + std::to_string(size));
+    LBANN_ERROR("ftruncate failed for size: ", std::to_string(size));
   }
   void *m = mmap(0, size, PROT_WRITE | PROT_READ, MAP_SHARED, shm_fd, 0);
   if (m == MAP_FAILED) {
@@ -1200,7 +1202,7 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map
   if (node_id != 0) {
     shm_fd = shm_open(m_seg_name.c_str(), O_RDONLY, 0666);
     if (shm_fd == -1) {
-      LBANN_ERROR("shm_open failed for filename: " + m_seg_name);
+      LBANN_ERROR("shm_open failed for filename: ", m_seg_name);
     }
     void *m = mmap(0, size, PROT_READ, MAP_SHARED, shm_fd, 0);
     if (m == MAP_FAILED) {
@@ -1214,7 +1216,7 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map
       LBANN_ERROR("fstat failed");
     }
     if (b.st_size != size) {
-      LBANN_ERROR("b.st_size= " + std::to_string(b.st_size) + " should be equal to " + std::to_string(size));
+      LBANN_ERROR("b.st_size= ", std::to_string(b.st_size), " should be equal to ", std::to_string(size));
     }
   }
   close(shm_fd);

From 54c6be8190021932f6d01fa936262033030912a5 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Tue, 10 Sep 2019 13:50:45 -0700
Subject: [PATCH 290/634] Store weights in unique_ptrs

Model keeps weights inside a std::vector<std::unique_ptr<weights>>.
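
The ownership idiom this patch applies throughout the diff below: a layer
builds its weights with make_unique, keeps a raw pointer as a non-owning
observer, and moves ownership into the model, whose vector of unique_ptrs then
handles cleanup (the manual deletes in the model destructor go away). A
self-contained sketch of the pattern, using toy types rather than the real
weights/model/Layer classes:

    #include <memory>
    #include <string>
    #include <utility>
    #include <vector>

    // Toy stand-ins for lbann::weights, lbann::model, and a layer.
    struct toy_weights {
      std::string name;
      explicit toy_weights(std::string n) : name(std::move(n)) {}
    };

    struct toy_model {
      std::vector<std::unique_ptr<toy_weights>> m_weights;  // sole owner
      void add_weights(std::unique_ptr<toy_weights> w) {
        m_weights.emplace_back(std::move(w));
      }
    };

    struct toy_layer {
      std::vector<toy_weights *> m_weights;  // non-owning observers
      void setup_data(toy_model &m) {
        auto w = std::make_unique<toy_weights>("layer_weights");
        m_weights.push_back(w.get());  // grab the observer first...
        m.add_weights(std::move(w));   // ...then hand ownership to the model
      }
    };

    int main() {
      toy_model m;
      toy_layer l;
      l.setup_data(m);
      // m.m_weights[0] owns the object; l.m_weights[0] observes it, and both
      // go away cleanly when the model is destroyed -- no explicit delete.
      return 0;
    }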
--- .../layers/learning/base_convolution.hpp | 4 +- .../learning/channelwise_scale_bias.hpp | 25 +- .../layers/learning/entrywise_scale_bias.hpp | 25 +- .../lbann/layers/learning/fully_connected.hpp | 4 +- .../regularizers/batch_normalization.hpp | 40 +-- .../entrywise_batch_normalization.hpp | 18 +- include/lbann/layers/transform/weights.hpp | 45 ++-- include/lbann/models/model.hpp | 4 +- src/models/model.cpp | 234 +++++++++--------- src/proto/factories/model_factory.cpp | 4 +- 10 files changed, 198 insertions(+), 205 deletions(-) diff --git a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp index 6f210b7eccc..b32abf9fca5 100644 --- a/include/lbann/layers/learning/base_convolution.hpp +++ b/include/lbann/layers/learning/base_convolution.hpp @@ -358,7 +358,7 @@ class base_convolution_layer : public Layer { w->set_initializer(std::move(init)); w->set_optimizer(std::move(opt)); this->m_weights[0] = w.get(); - this->m_model->add_weights(w.release()); + this->m_model->add_weights(std::move(w)); } auto& kernel_weights = *this->m_weights[0]; @@ -385,7 +385,7 @@ class base_convolution_layer : public Layer { w->set_name(get_name() + "_bias"); w->set_optimizer(std::move(opt)); this->m_weights[1] = w.get(); - this->m_model->add_weights(w.release()); + this->m_model->add_weights(std::move(w)); } auto& bias_weights = *this->m_weights[1]; bias_weights.set_dims(output_dims[0]); diff --git a/include/lbann/layers/learning/channelwise_scale_bias.hpp b/include/lbann/layers/learning/channelwise_scale_bias.hpp index d1c776d68bc..fc93aa53ca9 100644 --- a/include/lbann/layers/learning/channelwise_scale_bias.hpp +++ b/include/lbann/layers/learning/channelwise_scale_bias.hpp @@ -93,25 +93,24 @@ class channelwise_scale_bias_layer : public Layer { const El::Int num_channels = get_output_dims()[0]; // Construct default weights if needed - if (this->m_weights.size() < 1) { - this->m_weights.push_back(new weights(get_comm())); + // Note: Scale is initialized to 1 and bias to 0 + if (this->m_weights.empty()) { + auto w = make_unique(get_comm()); std::vector vals(2*num_channels, DataType{0}); std::fill(vals.begin(), vals.begin()+num_channels, DataType{1}); auto init = make_unique(vals); std::unique_ptr opt(m_model->create_optimizer()); - this->m_weights[0]->set_name(get_name() + "_weights"); - this->m_weights[0]->set_initializer(std::move(init)); - this->m_weights[0]->set_optimizer(std::move(opt)); - this->m_model->add_weights(this->m_weights[0]); + w->set_name(get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->m_weights.push_back(w.get()); + this->m_model->add_weights(std::move(w)); } if (this->m_weights.size() != 1) { - std::ostringstream err; - err << "attempted to setup " - << this->get_type() << " layer \"" << this->get_name() << "\" " - << "with an invalid number of weights " - << "(expected 1, " - << "found " << this->m_weights.size() << ")"; - LBANN_ERROR(err.str()); + LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 1, found ",this->m_weights.size(),")"); } // Setup weights diff --git a/include/lbann/layers/learning/entrywise_scale_bias.hpp b/include/lbann/layers/learning/entrywise_scale_bias.hpp index fb10d83e44b..0b313e68bc9 100644 --- a/include/lbann/layers/learning/entrywise_scale_bias.hpp +++ b/include/lbann/layers/learning/entrywise_scale_bias.hpp @@ -92,25 +92,24 @@ class entrywise_scale_bias_layer : 
public Layer { const El::Int output_size = get_output_size(); // Construct default weights if needed - if (this->m_weights.size() < 1) { - this->m_weights.push_back(new weights(get_comm())); + // Note: Scale is initialized to 1 and bias to 0 + if (this->m_weights.empty()) { + auto w = make_unique(get_comm()); std::vector vals(2*output_size, DataType{0}); std::fill(vals.begin(), vals.begin()+output_size, DataType{1}); auto init = make_unique(vals); std::unique_ptr opt(m_model->create_optimizer()); - this->m_weights[0]->set_name(get_name() + "_weights"); - this->m_weights[0]->set_initializer(std::move(init)); - this->m_weights[0]->set_optimizer(std::move(opt)); - this->m_model->add_weights(this->m_weights[0]); + w->set_name(get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->m_weights.push_back(w.get()); + this->m_model->add_weights(std::move(w)); } if (this->m_weights.size() != 1) { - std::ostringstream err; - err << "attempted to setup " - << this->get_type() << " layer \"" << this->get_name() << "\" " - << "with an invalid number of weights " - << "(expected 1, " - << "found " << this->m_weights.size() << ")"; - LBANN_ERROR(err.str()); + LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 1, found ",this->m_weights.size(),")"); } // Setup weights diff --git a/include/lbann/layers/learning/fully_connected.hpp b/include/lbann/layers/learning/fully_connected.hpp index 3acc8062322..24445b240e4 100644 --- a/include/lbann/layers/learning/fully_connected.hpp +++ b/include/lbann/layers/learning/fully_connected.hpp @@ -134,7 +134,7 @@ class fully_connected_layer : public learning_layer { w->set_initializer(std::move(init)); w->set_optimizer(std::move(opt)); this->m_weights[0] = w.get(); - this->m_model->add_weights(w.release()); + this->m_model->add_weights(std::move(w)); } auto& linearity_weights = *this->m_weights[0]; @@ -168,7 +168,7 @@ class fully_connected_layer : public learning_layer { w->set_name(get_name() + "_bias_weights"); w->set_optimizer(std::move(opt)); this->m_weights[1] = w.get(); - this->m_model->add_weights(w.release()); + this->m_model->add_weights(std::move(w)); } auto& bias_weights = *this->m_weights[1]; // Setup bias weights diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index 79318781a01..030fd1359d6 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -250,36 +250,40 @@ class batch_normalization_layer : public regularizer_layer { } this->m_weights.resize(4, nullptr); if (this->m_weights[0] == nullptr) { - this->m_weights[0] = new weights(get_comm()); + auto w = make_unique(get_comm()); auto init = make_unique(DataType(1)); std::unique_ptr opt(m_model->create_optimizer()); - this->m_weights[0]->set_name(get_name() + "_scale"); - this->m_weights[0]->set_initializer(std::move(init)); - this->m_weights[0]->set_optimizer(std::move(opt)); - this->m_model->add_weights(this->m_weights[0]); + w->set_name(get_name() + "_scale"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->m_weights[0] = w.get(); + this->m_model->add_weights(std::move(w)); } if (this->m_weights[1] == nullptr) { - this->m_weights[1] = new weights(get_comm()); + auto w = make_unique(get_comm()); auto init = make_unique(DataType(0)); std::unique_ptr 
opt(m_model->create_optimizer()); - this->m_weights[1]->set_name(get_name() + "_bias"); - this->m_weights[1]->set_initializer(std::move(init)); - this->m_weights[1]->set_optimizer(std::move(opt)); - this->m_model->add_weights(this->m_weights[1]); + w->set_name(get_name() + "_bias"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->m_weights[1] = w.get(); + this->m_model->add_weights(std::move(w)); } if (this->m_weights[2] == nullptr) { - this->m_weights[2] = new weights(get_comm()); - this->m_weights[2]->set_name(get_name() + "_running_mean"); + auto w = make_unique(get_comm()); auto init = make_unique(DataType(0)); - this->m_weights[2]->set_initializer(std::move(init)); - this->m_model->add_weights(this->m_weights[2]); + w->set_name(get_name() + "_running_mean"); + w->set_initializer(std::move(init)); + this->m_weights[2] = w.get(); + this->m_model->add_weights(std::move(w)); } if (this->m_weights[3] == nullptr) { - this->m_weights[3] = new weights(get_comm()); - this->m_weights[3]->set_name(get_name() + "_running_variance"); + auto w = make_unique(get_comm()); auto init = make_unique(DataType(1)); - this->m_weights[3]->set_initializer(std::move(init)); - this->m_model->add_weights(this->m_weights[3]); + w->set_name(get_name() + "_running_variance"); + w->set_initializer(std::move(init)); + this->m_weights[3] = w.get(); + this->m_model->add_weights(std::move(w)); } // Setup weights diff --git a/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp index 81536a1f01d..0bc692b92b6 100644 --- a/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp +++ b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp @@ -118,18 +118,20 @@ class entrywise_batch_normalization_layer : public Layer { } this->m_weights.resize(2, nullptr); if (this->m_weights[0] == nullptr) { - this->m_weights[0] = new weights(get_comm()); - this->m_weights[0]->set_name(get_name() + "_running_mean"); + auto w = make_unique(get_comm()); auto init = make_unique(DataType{0}); - this->m_weights[0]->set_initializer(std::move(init)); - this->m_model->add_weights(this->m_weights[0]); + w->set_name(get_name() + "_running_mean"); + w->set_initializer(std::move(init)); + this->m_weights[0] = w.get(); + this->m_model->add_weights(std::move(w)); } if (this->m_weights[1] == nullptr) { - this->m_weights[1] = new weights(get_comm()); - this->m_weights[1]->set_name(get_name() + "_running_variance"); + auto w = make_unique(get_comm()); auto init = make_unique(DataType{1}); - this->m_weights[1]->set_initializer(std::move(init)); - this->m_model->add_weights(this->m_weights[1]); + w->set_name(get_name() + "_running_variance"); + w->set_initializer(std::move(init)); + this->m_weights[1] = w.get(); + this->m_model->add_weights(std::move(w)); } // Setup weights diff --git a/include/lbann/layers/transform/weights.hpp b/include/lbann/layers/transform/weights.hpp index 0b4aee13672..5f0091fed1f 100644 --- a/include/lbann/layers/transform/weights.hpp +++ b/include/lbann/layers/transform/weights.hpp @@ -113,43 +113,38 @@ class weights_layer : public transform_layer { transform_layer::setup_data(); // Initialize default weights if none are provided - if (this->m_weights.size() > 1) { - std::stringstream err; - err << "attempted to setup " - << get_type() << " layer \"" << get_name() << "\" " - << "with an invalid number of weights " - << "(expected at most 1, " - << "but found " << this->m_weights.size() << 
")"; - LBANN_ERROR(err.str()); - } - this->m_weights.resize(1, nullptr); - auto& w = this->m_weights[0]; - if (w == nullptr) { - w = new weights(get_comm()); + if (this->m_weights.empty()) { + auto w = make_unique(get_comm()); auto init = make_unique(DataType(0)); std::unique_ptr opt(m_model->create_optimizer()); w->set_name(get_name() + "_weights"); w->set_initializer(std::move(init)); w->set_optimizer(std::move(opt)); - this->m_model->add_weights(w); + this->m_weights.push_back(w.get()); + this->m_model->add_weights(std::move(w)); + } + if (this->m_weights.size() != 1) { + LBANN_ERROR("attempted to setup ", + get_type()," layer \"",get_name(),"\" ", + "with an invalid number of weights ", + "(expected at most 1, ", + "but found ",this->m_weights.size(),")"); } // Setup weights and weights gradient m_gradient->AlignWith(get_activations()); m_gradient->Resize(get_output_size(), 1); - w->set_dims(get_output_dims()); - w->set_matrix_distribution(m_gradient->DistData()); + m_weights[0]->set_dims(get_output_dims()); + m_weights[0]->set_matrix_distribution(m_gradient->DistData()); // Initialize freeze state - if (this->m_frozen) { w->freeze(); } - else { w->unfreeze(); } - if (w->is_frozen() != this->m_frozen) { - std::stringstream err; - err << (m_frozen ? "" : "un") << "frozen " - << "layer \"" << get_name() << "\" has " - << (w->is_frozen() ? "" : "un") << "frozen " - << "weights \"" << w->get_name() << "\""; - LBANN_ERROR(err.str()); + if (this->m_frozen) { m_weights[0]->freeze(); } + else { m_weights[0]->unfreeze(); } + if (m_weights[0]->is_frozen() != this->m_frozen) { + LBANN_ERROR((m_frozen ? "" : "un"),"frozen ", + "layer \"",get_name(),"\" has ", + (m_weights[0]->is_frozen() ? "" : "un"),"frozen ", + "weights \"",m_weights[0]->get_name(),"\""); } } diff --git a/include/lbann/models/model.hpp b/include/lbann/models/model.hpp index b0440a49ea9..8e41005bbdb 100644 --- a/include/lbann/models/model.hpp +++ b/include/lbann/models/model.hpp @@ -163,7 +163,7 @@ class model { virtual void add_layer(std::unique_ptr l); /** @brief Add weights to model. */ - void add_weights(weights *w); + void add_weights(std::unique_ptr w); /** @brief Register a new callback for the model. */ void add_callback(callback_base *cb); @@ -407,7 +407,7 @@ class model { std::vector> m_layers; /** @brief Trainable parameters. */ - std::vector m_weights; + std::vector> m_weights; /** @details Maximum possible minibatch size supported by layers in * this model. 
Note that this is local to the particular model, diff --git a/src/models/model.cpp b/src/models/model.cpp index 1eb014dc7c5..1735f35db54 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -96,22 +96,26 @@ model::model(const model& other) : // Copy layers std::unordered_map layer_map; m_layers.reserve(other.m_layers.size()); - for (const auto& ptr : other.m_layers) { - if (ptr == nullptr) { LBANN_ERROR("unexpected null pointer"); } - auto* old_layer = ptr.get(); - auto* new_layer = old_layer->copy(); - new_layer->set_model(this); - m_layers.emplace_back(new_layer); - layer_map[old_layer] = new_layer; + for (const auto& other_layer : other.m_layers) { + if (other_layer == nullptr) { + LBANN_ERROR("model \"",other.get_name(),"\" ", + "has a null pointer in its list of layers"); + } + m_layers.emplace_back(other_layer->copy()); + m_layers.back()->set_model(this); + layer_map[other_layer.get()] = m_layers.back().get(); } // Copy weights - m_weights = other.m_weights; std::unordered_map weights_map; - for (auto& w : m_weights) { - auto&& w_copy = w->copy(); - weights_map[w] = w_copy; - w = w_copy; + m_weights.reserve(other.m_weights.size()); + for (const auto& other_weights : other.m_weights) { + if (other_weights == nullptr) { + LBANN_ERROR("model \"",other.get_name(),"\" ", + "has a null pointer in its list of weights"); + } + m_weights.emplace_back(make_unique(*other_weights)); + weights_map[other_weights.get()] = m_weights.back().get(); } // Fix pointers @@ -126,7 +130,6 @@ model& model::operator=(const model& other) { if (m_objective_function != nullptr) { delete m_objective_function; } for (const auto& m : m_metrics) { delete m; } for (const auto& cb : m_callbacks) { delete cb; } - for (const auto& w : m_weights) { delete w; } // Shallow copies m_comm = other.m_comm; @@ -138,7 +141,6 @@ model& model::operator=(const model& other) { m_objective_function = other.m_objective_function; m_metrics = other.m_metrics; m_callbacks = other.m_callbacks; - m_weights = other.m_weights; if (m_objective_function != nullptr) { m_objective_function = m_objective_function->copy(); } @@ -148,21 +150,35 @@ model& model::operator=(const model& other) { for (auto& cb : m_callbacks) { cb = cb->copy(); } + + // Copy layers std::unordered_map layer_map; m_layers.clear(); m_layers.reserve(other.m_layers.size()); - for (const auto& ptr : other.m_layers) { - if (ptr == nullptr) { LBANN_ERROR("unexpected null pointer"); } - auto* old_layer = ptr.get(); - auto* new_layer = old_layer->copy(); - new_layer->set_model(this); - m_layers.emplace_back(new_layer); - layer_map[old_layer] = new_layer; + for (const auto& other_layer : other.m_layers) { + if (other_layer == nullptr) { + LBANN_ERROR("model \"",other.get_name(),"\" ", + "has a null pointer in its list of layers"); + } + m_layers.emplace_back(other_layer->copy()); + m_layers.back()->set_model(this); + layer_map[other_layer.get()] = m_layers.back().get(); } + + // Copy weights std::unordered_map weights_map; - for (auto& w : m_weights) { - w = weights_map[w] = w->copy(); + m_weights.clear(); + m_weights.reserve(other.m_weights.size()); + for (const auto& other_weights : other.m_weights) { + if (other_weights == nullptr) { + LBANN_ERROR("model \"",other.get_name(),"\" ", + "has a null pointer in its list of weights"); + } + m_weights.emplace_back(make_unique(*other_weights)); + weights_map[other_weights.get()] = m_weights.back().get(); } + + // Fix pointers remap_pointers(layer_map, weights_map); return *this; @@ -171,7 +187,6 @@ model& 
model::operator=(const model& other) { model::~model() { if (m_objective_function != nullptr) { delete m_objective_function; } if (m_default_optimizer != nullptr) { delete m_default_optimizer; } - for (const auto& w : m_weights) { delete w; } for (const auto& m : m_metrics) { delete m; } for (const auto& cb : m_callbacks) { delete cb; } } @@ -247,7 +262,7 @@ description model::get_description() const { // Weights description weights_desc("Weights:"); - for (const auto* w : m_weights) { + for (const auto& w : m_weights) { if (w == nullptr) { weights_desc.add("unknown weights"); } else { @@ -313,7 +328,7 @@ const std::vector model::get_layers() const { std::vector model::get_weights() { std::vector weights_list; for (const auto& w : m_weights) { - weights_list.push_back(w); + weights_list.push_back(w.get()); } return weights_list; } @@ -321,7 +336,7 @@ std::vector model::get_weights() { const std::vector model::get_weights() const { std::vector weights_list; for (const auto& w : m_weights) { - weights_list.push_back(w); + weights_list.push_back(w.get()); } return weights_list; } @@ -340,14 +355,12 @@ size_t model::get_num_iterations_per_epoch(execution_mode mode) const { // Model specification // ============================================= -void model::add_layer(std::unique_ptr l) { - std::stringstream err; +void model::add_layer(std::unique_ptr ptr) { // Check for null pointer - if (l == nullptr) { - err << "attempted to add a null pointer as a layer to " - << "model \"" << get_name() << "\""; - LBANN_ERROR(err.str()); + if (ptr == nullptr) { + LBANN_ERROR("attempted to add a null pointer as layer to ", + "model \"",get_name(),"\""); } // Check that the new layer name is unique @@ -355,30 +368,27 @@ void model::add_layer(std::unique_ptr l) { // bottleneck. If it is, consider maintaining a hash table // containing all layer names (and properly updating it during // copies and pointer remaps). - const auto& name = l->get_name(); - for (El::Int i = 0; i < get_num_layers(); ++i) { - if (get_layer(i).get_name() == name) { - err << "attempted to add layer \"" << name << "\" to " - << "model \"" << get_name() << "\", " - << "but the model already contains a layer with that name"; - LBANN_ERROR(err.str()); + const auto& name = ptr->get_name(); + for (const auto& l : m_layers) { + if (l->get_name() == name) { + LBANN_ERROR("attempted to add layer \"",name,"\" to ", + "model \"",get_name(),"\", ", + "but the model already contains a layer with that name"); } } // Add layer to model - m_layers.emplace_back(std::move(l)); + m_layers.emplace_back(std::move(ptr)); m_layers.back()->set_model(this); } -void model::add_weights(weights* w) { - std::stringstream err; +void model::add_weights(std::unique_ptr ptr) { // Check for null pointer - if (w == nullptr) { - err << "attempted to add a null pointer as weights to " - << "model \"" << get_name() << "\""; - LBANN_ERROR(err.str()); + if (ptr == nullptr) { + LBANN_ERROR("attempted to add a null pointer as weights to ", + "model \"",get_name(),"\""); } // Check that the new weights name is unique @@ -386,18 +396,17 @@ void model::add_weights(weights* w) { // bottleneck. If it is, consider maintaining a hash table // containing all weights names (and properly updating it during // copies and pointer remaps). 
- const auto& name = w->get_name(); - for (const auto& w2 : m_weights) { - if (w2->get_name() == name) { - err << "attempted to add weights \"" << name << "\" to " - << "model \"" << get_name() << "\", " - << "but the model already contains weights with that name"; - LBANN_ERROR(err.str()); + const auto& name = ptr->get_name(); + for (const auto& w : m_weights) { + if (w->get_name() == name) { + LBANN_ERROR("attempted to add weights \"",name,"\" to ", + "model \"",get_name(),"\", ", + "but the model already contains weights with that name"); } } // Add weights to model - m_weights.push_back(w); + m_weights.emplace_back(std::move(ptr)); } @@ -416,31 +425,26 @@ void model::add_metric(metric *m) { } void model::replace_weights(std::vector& new_weights) { + /// @todo tym (9/9/19): This function isn't used anywhere. It's + /// probably safe to delete? // Check that number of weights is valid if (new_weights.size() > m_weights.size()) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "attempted to replace weights with an invalid number of weights " - << "(expected at most " << m_weights.size() << ", found " << new_weights.size() << ")"; - throw lbann_exception(err.str()); + LBANN_ERROR("attempted to replace weights with ", + "an invalid number of weights ", + "(expected at most ",m_weights.size(),", ", + "found ",new_weights.size(),")"); } // Replace weights in list - std::vector old_weights(m_weights.begin(), - m_weights.begin() + new_weights.size()); std::unordered_map weights_map; std::unordered_map layer_map; for (size_t i = 0; i < new_weights.size(); ++i) { - m_weights[i] = weights_map[old_weights[i]] = new_weights[i]; + weights_map[m_weights[i].get()] = new_weights[i]; + m_weights[i].reset(new_weights[i]); } remap_pointers(layer_map, weights_map); - // Delete old weights - for (const auto& w : old_weights) { - delete w; - } - } void model::copy_trained_weights_from(std::vector& new_weights) { @@ -681,48 +685,16 @@ void model::setup_layers() { void model::setup_weights() { - // List of used and unused weights - std::unordered_set weights_set(m_weights.begin(), - m_weights.end()); - std::set unused_weights(m_weights.begin(), - m_weights.end()); - - // Find weights used by layers - for (El::Int i = 0; i < get_num_layers(); ++i) { - for (const auto& w : get_layer(i).get_weights()) { - if (weights_set.count(w) == 0) { - m_weights.push_back(w); - weights_set.insert(w); - } - unused_weights.erase(w); - } - } - - // Find weights used by objective function - for (const auto& w : m_objective_function->get_weights_pointers()) { - if (weights_set.count(w) == 0) { - m_weights.push_back(w); - weights_set.insert(w); - } - unused_weights.erase(w); - } - - // Delete unused weights - for (auto&& w : unused_weights) { - m_weights.erase(std::remove(m_weights.begin(), m_weights.end(), w), - m_weights.end()); - } - - // For run-to-run reproducibility, make sure the weights are - // initialized in the same order no matter how they are ordered in - // the prototext file. + // Sort weights by name + // Note: For run-to-run consistency. Names are assumed to be unique. 
std::sort(m_weights.begin(), m_weights.end(), - [](weights* const &x, weights* const &y) { + [] (const std::unique_ptr& x, + const std::unique_ptr& y) { return x->get_name().compare(y->get_name()) < 0; }); // Setup weights - for (auto* w : m_weights) { w->setup(); } + for (auto&& w : m_weights) { w->setup(); } } @@ -996,8 +968,8 @@ void model::evaluate_metrics(execution_mode mode, size_t current_mini_batch_size } void model::clear_gradients() { - for (const auto& w : m_weights) { - optimizer* opt = w->get_optimizer(); + for (auto&& w : m_weights) { + auto&& opt = w->get_optimizer(); if (opt != nullptr) { opt->clear_gradient(); } } } @@ -1040,15 +1012,24 @@ void model::backward_prop() { void model::update_weights() { do_model_optimize_begin_cbs(); - for (El::Int i = m_weights.size()-1; i >= 0; --i) { - auto& w = *m_weights[i]; - optimizer* opt = w.get_optimizer(); + + // Apply optimization step to weights + // Note: Heuristically, forward prop consumes weights in the same + // order as m_weights and backprop computes weights gradients in + // reverse order. Also, we often launch a non-blocking allreduce + // after a weights gradient has been computed. Thus, iterating in + // reverse order will use gradients that have already finished their + // allreduce, giving more time for more recent allreduces to finish. + for (auto rit = m_weights.rbegin(); rit != m_weights.rend(); ++rit) { + auto& w = **rit; + auto&& opt = w.get_optimizer(); if (opt != nullptr) { do_weight_optimize_begin_cbs(&w); opt->step(); do_weight_optimize_end_cbs(&w); } } + do_model_optimize_end_cbs(); } @@ -1061,11 +1042,24 @@ bool model::update_layers() { } void model::reconcile_weight_values() { - std::vector reqs(m_weights.size()); - for (El::Int i = m_weights.size()-1; i >= 0; --i) { - m_weights[i]->reconcile_values(reqs[i]); - } + + // Launch non-blocking communication to reconcile weights + // Note: Heuristically, forward prop consumes weights in the same + // order as m_weights. Also, weights tend to get larger as you get + // deeper into a neural network. Thus, iterating in reverse order + // means that we perform the expensive communication first, covering + // up the launch overheads for the subsequent cheap communication. + std::vector reqs; + reqs.reserve(m_weights.size()); + for (auto rit = m_weights.rbegin(); rit != m_weights.rend(); ++rit) { + auto& w = **rit; + reqs.emplace_back(); + w.reconcile_values(reqs.back()); + } + + // Wait for communication to finish for (auto& req : reqs) { m_comm->wait(req); } + } // ============================================= @@ -1276,7 +1270,7 @@ bool model::save_to_checkpoint_shared(persist& p) { p.write_uint32(persist_type::model, "persist_callback_type", (uint32_t) p.get_cb_type()); } - for (weights *w : m_weights) { + for (auto&& w : m_weights) { w->save_to_checkpoint_shared(p); } @@ -1310,7 +1304,7 @@ bool model::load_from_checkpoint_shared(persist& p) { // set state of persist object to know which type of ckpt we are returning from. 
p.set_cb_type((callback_type) header.callback_type); - for (weights *w : m_weights) { + for (auto&& w : m_weights) { w->load_from_checkpoint_shared(p); } @@ -1337,7 +1331,7 @@ bool model::save_to_checkpoint_distributed(persist& p){ p.write_uint32(persist_type::train, "persist_callback_type",(uint32_t) p.get_cb_type()); // for each execution context write out them out - for (weights *w : m_weights) { + for (auto&& w : m_weights) { w->save_to_checkpoint_distributed(p); } @@ -1364,7 +1358,7 @@ bool model::load_from_checkpoint_distributed(persist& p){ p.set_cb_type((callback_type) header.callback_type); load_rng_from_checkpoint(p, m_comm); - for (weights *w : m_weights) { + for (auto&& w : m_weights) { w->load_from_checkpoint_distributed(p); } @@ -1388,7 +1382,7 @@ void model::write_proto(lbann_data::Model* proto) { bool model::save_weights(persist& p) { // write out fields we need to save a model's weights - for (weights *w : m_weights) { + for (auto&& w : m_weights) { w->save_to_checkpoint_shared(p); } return true; @@ -1396,7 +1390,7 @@ bool model::save_weights(persist& p) { bool model::reload_weights(const std::string latest, const std::vector& weight_list) { // load weights that appear in weight list. - for(weights *w : m_weights) { + for(auto&& w : m_weights) { w->load_from_save(latest,weight_list); } return true; diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp index e56fc15eb5d..aa8b8ae62e0 100644 --- a/src/proto/factories/model_factory.cpp +++ b/src/proto/factories/model_factory.cpp @@ -284,8 +284,8 @@ std::unique_ptr construct_model( // Instantiate model auto m = instantiate_model(comm, std::move(obj), proto_opt, proto_model); - for (auto&& l : layer_list ) { m->add_layer(std::move(l)); } - for (auto&& w : weights_list ) { m->add_weights(w.release()); } + for (auto&& l : layer_list ) { m->add_layer(std::move(l)); } + for (auto&& w : weights_list ) { m->add_weights(std::move(w)); } for (auto&& met : metric_list ) { m->add_metric(met.release()); } for (auto&& cb : callback_list) { m->add_callback(cb.release()); } const auto& name = proto_model.name(); From c730a901836037575245e245186bc25634b175f6 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 11 Sep 2019 10:28:20 -0700 Subject: [PATCH 291/634] Return LBANN exit code in Python-generated batch script --- python/lbann/contrib/lc/launcher.py | 4 +++- python/lbann/launcher/__init__.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index b6055ba8dbe..b10d23d634d 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -60,9 +60,11 @@ def run(trainer, model, data_reader, optimizer, optimizer=optimizer) lbann_command.append('--prototext={}'.format(prototext_file)) script.add_parallel_command(lbann_command) + script.add_command('status=$?') - # Batch script prints finish time + # Batch script prints finish time and returns status script.add_command('date | sed "s/^/Finished at /"') + script.add_command('exit ${status}') # Write, run, or submit batch script status = 0 diff --git a/python/lbann/launcher/__init__.py b/python/lbann/launcher/__init__.py index 335d46508ad..521faac6ac9 100644 --- a/python/lbann/launcher/__init__.py +++ b/python/lbann/launcher/__init__.py @@ -104,9 +104,11 @@ def run(trainer, model, data_reader, optimizer, optimizer=optimizer) lbann_command.append('--prototext={}'.format(prototext_file)) script.add_parallel_command(lbann_command) + 
script.add_command('status=$?') - # Batch script prints finish time + # Batch script prints finish time and returns status script.add_command('date | sed "s/^/Finished at /"') + script.add_command('exit ${status}') # Write, run, or submit batch script status = 0 From 2950c86409312f19e278880a61f857dc8c9a027e Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Wed, 11 Sep 2019 11:09:29 -0700 Subject: [PATCH 292/634] bug fix; m_num_samples was not being initialized, hence the shuffled_indices vector had size = 0. --- src/data_readers/data_reader_numpy_npz_conduit.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp b/src/data_readers/data_reader_numpy_npz_conduit.cpp index 8eb72d4849b..afa6e4cf9d1 100644 --- a/src/data_readers/data_reader_numpy_npz_conduit.cpp +++ b/src/data_readers/data_reader_numpy_npz_conduit.cpp @@ -325,6 +325,10 @@ void numpy_npz_conduit_reader::fill_in_metadata() { LBANN_ERROR("failed to open " + m_filenames[my_file] + " for reading"); } in.close(); + m_num_samples = m_filenames.size(); + if (is_master()) { + std::cout << "num samples: " << m_num_samples << "\n"; + } int data_id = 0; //meaningless conduit::Node node; From f5a33878a271da038ac16210fe29dca79848cb7f Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 11 Sep 2019 13:22:49 -0700 Subject: [PATCH 293/634] Improve time reporting in Python-generated batch script --- python/lbann/contrib/lc/launcher.py | 4 ++-- python/lbann/launcher/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index b10d23d634d..ca36fcc35af 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -47,7 +47,7 @@ def run(trainer, model, data_reader, optimizer, has_allocation = 'LSB_JOBID' in os.environ # Batch script prints start time - script.add_command('date | sed "s/^/Started at /"') + script.add_command('echo "Started at $(date)"') # Batch script invokes LBANN lbann_command = [lbann.lbann_exe()] @@ -63,7 +63,7 @@ def run(trainer, model, data_reader, optimizer, script.add_command('status=$?') # Batch script prints finish time and returns status - script.add_command('date | sed "s/^/Finished at /"') + script.add_command('echo "Finished at $(date)"') script.add_command('exit ${status}') # Write, run, or submit batch script diff --git a/python/lbann/launcher/__init__.py b/python/lbann/launcher/__init__.py index 521faac6ac9..0cd3e3b5ad5 100644 --- a/python/lbann/launcher/__init__.py +++ b/python/lbann/launcher/__init__.py @@ -91,7 +91,7 @@ def run(trainer, model, data_reader, optimizer, has_allocation = 'LSB_JOBID' in os.environ # Batch script prints start time - script.add_command('date | sed "s/^/Started at /"') + script.add_command('echo "Started at $(date)"') # Batch script invokes LBANN lbann_command = [lbann.lbann_exe()] @@ -107,7 +107,7 @@ def run(trainer, model, data_reader, optimizer, script.add_command('status=$?') # Batch script prints finish time and returns status - script.add_command('date | sed "s/^/Finished at /"') + script.add_command('echo "Finished at $(date)"') script.add_command('exit ${status}') # Write, run, or submit batch script From 0c9391b20b74da99152a1ff95962054a20fc8249 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Wed, 11 Sep 2019 13:52:10 -0700 Subject: [PATCH 294/634] Fix a mismatched forward-declaration I opted to make the definition use `class` rather than changing the forward-declaration. 
The reason for this is that logically this type represents a more
complicated idea than a true POD struct. This has not been fleshed out
yet (see #1199), but it will be soon.
---
 include/lbann/execution_contexts/execution_context.hpp | 3 ++-
 include/lbann/execution_contexts/sgd_execution_context.hpp | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/lbann/execution_contexts/execution_context.hpp b/include/lbann/execution_contexts/execution_context.hpp
index 456cf52c795..d60729710de 100644
--- a/include/lbann/execution_contexts/execution_context.hpp
+++ b/include/lbann/execution_contexts/execution_context.hpp
@@ -38,7 +38,8 @@ namespace lbann {
 // Forward-declare this.
 class trainer;

-struct termination_criteria {
+class termination_criteria {
+public:
   size_t num_steps;
 };

diff --git a/include/lbann/execution_contexts/sgd_execution_context.hpp b/include/lbann/execution_contexts/sgd_execution_context.hpp
index d3a5409dddc..255baa80035 100644
--- a/include/lbann/execution_contexts/sgd_execution_context.hpp
+++ b/include/lbann/execution_contexts/sgd_execution_context.hpp
@@ -31,7 +31,8 @@
 #include

 namespace lbann {
-struct sgd_termination_criteria : public termination_criteria {
+class sgd_termination_criteria : public termination_criteria {
+public:
   size_t num_epochs;
 };

From 5baaf825ed453e87e3287f29cad8d1b2288e690d Mon Sep 17 00:00:00 2001
From: "David A. Hysom"
Date: Thu, 12 Sep 2019 07:54:09 -0700
Subject: [PATCH 295/634] changed whitespace

---
 model_zoo/CMakeLists.txt | 1 +
 .../gan/mnist/adversarial_data.prototext | 70 -------------
 .../gan/mnist/discriminator_data.prototext | 70 -------------
 .../models/jag/data_reader_jag.prototext | 46 ---------
 .../data_reader_jag_conduit_lustre.prototext | 56 -----------
 .../finetune-cub/data_reader_cub.prototext | 90 -----------------
 .../data_reader_multi_images.prototext | 98 -------------------
 .../triplet/data_reader_triplet.prototext | 98 -------------------
 8 files changed, 1 insertion(+), 528 deletions(-)
 delete mode 100644 model_zoo/models/gan/mnist/adversarial_data.prototext
 delete mode 100644 model_zoo/models/gan/mnist/discriminator_data.prototext
 delete mode 100644 model_zoo/models/jag/data_reader_jag.prototext
 delete mode 100644 model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext
 delete mode 100644 model_zoo/models/siamese/finetune-cub/data_reader_cub.prototext
 delete mode 100644 model_zoo/models/siamese/triplet/data_reader_multi_images.prototext
 delete mode 100644 model_zoo/models/siamese/triplet/data_reader_triplet.prototext

diff --git a/model_zoo/CMakeLists.txt b/model_zoo/CMakeLists.txt
index 6d63af9db54..1249ef6392b 100644
--- a/model_zoo/CMakeLists.txt
+++ b/model_zoo/CMakeLists.txt
@@ -1,3 +1,4 @@
+
 # Parallel Tests
 add_executable( lbann-bin lbann.cpp )
 target_link_libraries(lbann-bin lbann )

diff --git a/model_zoo/models/gan/mnist/adversarial_data.prototext b/model_zoo/models/gan/mnist/adversarial_data.prototext
deleted file mode 100644
index 64497bfbda7..00000000000
--- a/model_zoo/models/gan/mnist/adversarial_data.prototext
+++ /dev/null
@@ -1,70 +0,0 @@
-data_reader {
-  reader {
-    name: "mnist"
-    role: "train"
-    shuffle: true
-    data_filedir: "/p/lscratchh/brainusr/datasets/MNIST"
-    data_filename: "train-images-idx3-ubyte"
-    label_filename: "train-labels-idx1-ubyte"
-    validation_percent: 0.1
-    absolute_sample_count: 0
-    percent_of_data_to_use: 1.0
-    gan_labelling: true
-    gan_label_value: 1
-    image_preprocessor {
-      noise_factor: 0.0
-      normalizer {
-        scale: true
-        subtract_mean:
false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: false - factor: 0.0 - } - } - } - reader { - name: "mnist" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "t10k-images-idx3-ubyte" - label_filename: "t10k-labels-idx1-ubyte" - validation_percent: 1.0 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - gan_labelling: true - gan_label_value: 1 - image_preprocessor { - noise_factor: 0.0 - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: false - factor: 0.0 - } - } - } -} diff --git a/model_zoo/models/gan/mnist/discriminator_data.prototext b/model_zoo/models/gan/mnist/discriminator_data.prototext deleted file mode 100644 index 6f236834e0b..00000000000 --- a/model_zoo/models/gan/mnist/discriminator_data.prototext +++ /dev/null @@ -1,70 +0,0 @@ -data_reader { - reader { - name: "mnist" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "train-images-idx3-ubyte" - label_filename: "train-labels-idx1-ubyte" - validation_percent: 0.1 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - gan_labelling: true - gan_label_value: 0 - image_preprocessor { - noise_factor: 0.0 - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: false - factor: 0.0 - } - } - } - reader { - name: "mnist" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/MNIST" - data_filename: "t10k-images-idx3-ubyte" - label_filename: "t10k-labels-idx1-ubyte" - validation_percent: 1.0 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - gan_labelling: true - gan_label_value: 0 - image_preprocessor { - noise_factor: 0.0 - normalizer { - scale: true - subtract_mean: false - unit_variance: false - z_score: false - } - augmenter { - horizontal_flip: false - vertical_flip: false - rotation: 0 - horizontal_shift: 0 - vertical_shift: 0 - shear_range: 0 - } - noiser { - disable: false - factor: 0.0 - } - } - } -} diff --git a/model_zoo/models/jag/data_reader_jag.prototext b/model_zoo/models/jag/data_reader_jag.prototext deleted file mode 100644 index 34561db2d99..00000000000 --- a/model_zoo/models/jag/data_reader_jag.prototext +++ /dev/null @@ -1,46 +0,0 @@ -data_reader { - reader { - name: "jag" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/jjayaram/LDRD-SI/deep-latent-spaces/data" - data_filename: "outputs_polar_100k.npy:outputs_scalars_100k.npy:inputs_100k.npy" - validation_percent: 0.1 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - - # 1: JAG_Image, 2: JAG_Scalar, 3: JAG_Input - independent: [1] - dependent: [3] - - image_preprocessor { - raw_width: 50 - raw_height: 50 - - # 0: none, 1: dataset-wise, 2: sample-wise - early_normalization: 2 - } - } - - reader { - name: "jag" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/jjayaram/LDRD-SI/deep-latent-spaces/data" - data_filename: "outputs_polar_100k.npy:outputs_scalars_100k.npy:inputs_100k.npy" - absolute_sample_count: 0 - 
percent_of_data_to_use: 1.0 - - # 1: JAG_Image, 2: JAG_Scalar, 3: JAG_Input - independent: [1] - dependent: [3] - - image_preprocessor { - raw_width: 50 - raw_height: 50 - - # 0: none, 1: dataset-wise, 2: sample-wise - early_normalization: 2 - } - } -} diff --git a/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext b/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext deleted file mode 100644 index 4ae82349f50..00000000000 --- a/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext +++ /dev/null @@ -1,56 +0,0 @@ -######################################################################## -# The JAG normalization values were computed over the 10M + 1MA + 1MB random -# pulls from the 100M data set. They are valid for the directories: -# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B) -# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) -# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B -# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) -######################################################################## - -data_reader { - requires_data_set_metadata: true - - reader { - name: "jag_conduit" - role: "train" - shuffle: true - # change to a lustre path - #data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/" - #index_list: "index.txt" - data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K4trainers/" - index_list: "100Kindex.txt" - index_list_per_trainer: false - index_list_per_model: false - - validation_percent: 0 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - disable_responses: true - disable_labels: true - - num_labels: 5 - - } - - reader { - name: "jag_conduit" - role: "test" - shuffle: true - # change to a lustre path - data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_B/" - index_list: "index.txt" - #data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K16trainers/" - #index_list: "t1_sample_list.txt" - index_list_per_trainer: false - index_list_per_model: false - - validation_percent: 0 - absolute_sample_count: 0 - percent_of_data_to_use: 0.0005 - disable_responses: true - disable_labels: true - - num_labels: 5 - - } -} diff --git a/model_zoo/models/siamese/finetune-cub/data_reader_cub.prototext b/model_zoo/models/siamese/finetune-cub/data_reader_cub.prototext deleted file mode 100644 index 038f2dadc39..00000000000 --- a/model_zoo/models/siamese/finetune-cub/data_reader_cub.prototext +++ /dev/null @@ -1,90 +0,0 @@ -data_reader { - reader { - name: "imagenet" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/CUB_200_2011/basic_set_256x256/" - data_filename: "/p/lscratchh/brainusr/datasets/CUB_200_2011_list/train_list.txt" - label_filename: "" - validation_percent: 0.01 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - num_labels: 200 - - image_preprocessor { - raw_width: 256 - raw_height: 256 - raw_num_channels: 3 - - cropper { - disable: false - crop_width: 224 - crop_height: 224 - crop_randomly: true - resized_width: 256 - resized_height: 256 - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - - subtractor { - disable: false - channel_mean: [0.40625, 0.45703, 0.48047] - } - - normalizer { - disable: true - } - } - } - - reader { - name: "imagenet" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/CUB_200_2011/basic_set_256x256/" - data_filename: "/p/lscratchh/brainusr/datasets/CUB_200_2011_list/test_list.txt" - label_filename: "" - 
absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - num_labels: 200 - - image_preprocessor { - raw_width: 256 - raw_height: 256 - raw_num_channels: 3 - - cropper { - disable: false - crop_width: 224 - crop_height: 224 - crop_randomly: false - resized_width: 256 - resized_height: 256 - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - - subtractor { - disable: false - channel_mean: [0.40625, 0.45703, 0.48047] - } - - normalizer { - disable: true - } - } - } -} diff --git a/model_zoo/models/siamese/triplet/data_reader_multi_images.prototext b/model_zoo/models/siamese/triplet/data_reader_multi_images.prototext deleted file mode 100644 index 7d825c946a4..00000000000 --- a/model_zoo/models/siamese/triplet/data_reader_multi_images.prototext +++ /dev/null @@ -1,98 +0,0 @@ -data_reader { - reader { - name: "multi_images" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/" - data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/birds_and_cars/patches_84h_110x110_13x13-blur-ab_compact/train_list_8h_birds.txt" - label_filename: "" - validation_percent: 0.1 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - num_labels: 20 - num_image_srcs: 3 - - image_preprocessor { - raw_width: 110 - raw_height: 110 - - cropper { - disable: false - crop_width: 96 - crop_height: 96 - crop_randomly: false - resized_width: 96 - resized_height: 96 - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - - subtractor { - disable: false - channel_mean: [0.40625, 0.45703, 0.48047] - } - - normalizer { - disable: true - scale: true - subtract_mean: true - unit_variance: true - z_score: false - } - } - } - - reader { - name: "multi_images" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/" - data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/birds_and_cars/patches_84h_110x110_13x13-blur-ab_compact/val_list_8h.txt" - label_filename: "" - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - num_labels: 20 - num_image_srcs: 3 - - image_preprocessor { - raw_width: 110 - raw_height: 110 - - cropper { - disable: false - crop_width: 96 - crop_height: 96 - crop_randomly: false - resized_width: 96 - resized_height: 96 - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - - subtractor { - disable: false - channel_mean: [0.40625, 0.45703, 0.48047] - } - - normalizer { - disable: true - scale: true - subtract_mean: true - unit_variance: true - z_score: false - } - } - } -} diff --git a/model_zoo/models/siamese/triplet/data_reader_triplet.prototext b/model_zoo/models/siamese/triplet/data_reader_triplet.prototext deleted file mode 100644 index baa90c196bf..00000000000 --- a/model_zoo/models/siamese/triplet/data_reader_triplet.prototext +++ /dev/null @@ -1,98 +0,0 @@ -data_reader { - reader { - name: "triplet" - role: "train" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/" - data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/train/train_list_8h.nfl.npz" - label_filename: "" - validation_percent: 0.1 - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - num_labels: 20 - - image_preprocessor { - raw_width: 110 - raw_height: 110 - raw_num_channels: 3 - - cropper { - disable: false - crop_width: 96 - crop_height: 96 - crop_randomly: false - resized_width: 96 - 
resized_height: 96 - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - - subtractor { - disable: false - channel_mean: [0.40625, 0.45703, 0.48047] - } - - normalizer { - disable: true - scale: true - subtract_mean: true - unit_variance: true - z_score: false - } - } - } - - reader { - name: "triplet" - role: "test" - shuffle: true - data_filedir: "/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/" - data_filename: "/p/lscratchh/brainusr/datasets/ILSVRC2012/patches_84h_110x110_13x13-blur-ab_compact/val/val_list_8h.nfl.npz" - label_filename: "" - absolute_sample_count: 0 - percent_of_data_to_use: 1.0 - num_labels: 20 - - image_preprocessor { - raw_width: 110 - raw_height: 110 - raw_num_channels: 3 - - cropper { - disable: false - crop_width: 96 - crop_height: 96 - crop_randomly: false - resized_width: 96 - resized_height: 96 - } - - colorizer { - disable: true - } - - augmenter { - disable: true - } - - subtractor { - disable: false - channel_mean: [0.40625, 0.45703, 0.48047] - } - - normalizer { - disable: true - scale: true - subtract_mean: true - unit_variance: true - z_score: false - } - } - } -} From e78bc8eb9c110c96281912cb04c0b2768be99278 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Thu, 12 Sep 2019 08:14:50 -0700 Subject: [PATCH 296/634] restoring a file that should not have been deleted --- .../data_reader_jag_conduit_lustre.prototext | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext diff --git a/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext b/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext new file mode 100644 index 00000000000..4ae82349f50 --- /dev/null +++ b/model_zoo/models/jag/wae_cycle_gan/data_reader_jag_conduit_lustre.prototext @@ -0,0 +1,56 @@ +######################################################################## +# The JAG normalization values were computed over the 10M + 1MA + 1MB random +# pulls from the 100M data set. 
They are valid for the directories: +# /p/lustre2/brainusr/datasets/10MJAG/ (10M | 1M_A | 1M_B) +# /p/lustre2/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +# /p/gpfs1/brainusr/datasets/10MJAG/10M | 1M_A | 1M_B +# /p/gpfs1/brainusr/datasets/10MJAG_balanced_1K/ (1M_A | 1M_B) +######################################################################## + +data_reader { + requires_data_set_metadata: true + + reader { + name: "jag_conduit" + role: "train" + shuffle: true + # change to a lustre path + #data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/" + #index_list: "index.txt" + data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K4trainers/" + index_list: "100Kindex.txt" + index_list_per_trainer: false + index_list_per_model: false + + validation_percent: 0 + absolute_sample_count: 0 + percent_of_data_to_use: 1.0 + disable_responses: true + disable_labels: true + + num_labels: 5 + + } + + reader { + name: "jag_conduit" + role: "test" + shuffle: true + # change to a lustre path + data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_B/" + index_list: "index.txt" + #data_filedir: "/p/lustre2/brainusr/datasets/10MJAG/1M_A/100K16trainers/" + #index_list: "t1_sample_list.txt" + index_list_per_trainer: false + index_list_per_model: false + + validation_percent: 0 + absolute_sample_count: 0 + percent_of_data_to_use: 0.0005 + disable_responses: true + disable_labels: true + + num_labels: 5 + + } +} From 4f313e9542f0fdb960625b02d3957a60429bb041 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 5 Sep 2019 10:53:02 -0700 Subject: [PATCH 297/634] Test improvements --- bamboo/common_python/test_tools.py | 4 +- bamboo/common_python/tools.py | 6 +- .../test_integration_performance.py | 10 +- bamboo/local_test.cmd | 10 ++ bamboo/local_test.sh | 106 ++++++++++++++++++ bamboo/unit_tests/conftest.py | 12 +- bamboo/unit_tests/test_unit_checkpoint.py | 31 ++--- bamboo/unit_tests/test_unit_lbann2_reload.py | 12 +- model_zoo/vision/densenet.py | 13 ++- scripts/build_lbann_lc.sh | 8 +- 10 files changed, 176 insertions(+), 36 deletions(-) create mode 100644 bamboo/local_test.cmd create mode 100755 bamboo/local_test.sh diff --git a/bamboo/common_python/test_tools.py b/bamboo/common_python/test_tools.py index 4ac070e9e2f..c15bc5f36d5 100644 --- a/bamboo/common_python/test_tools.py +++ b/bamboo/common_python/test_tools.py @@ -1,5 +1,4 @@ import pytest -import subprocess import tools @@ -34,6 +33,7 @@ def test_command_catalyst(): expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' assert actual == expected + def test_command_corona(): actual = tools.get_command(cluster='corona', **d) expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' @@ -439,7 +439,7 @@ def test_bad_extra_lbann_flags_invalid_flag(): " 
'random_seed', 'objective_function', 'data_layout'," " 'print_affinity', 'use_data_store', 'preload_data_store'," " 'super_node', 'write_sample_list', 'ltfb_verbose'," - " 'index_list_train', 'index_list_test'," + " 'ckpt_dir', 'index_list_train', 'index_list_test'," " 'label_filename_train', 'label_filename_test'," " 'share_testing_data_readers', 'image_dir', 'no_im_comm']." ) diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 153e447968f..85c41fa9b58 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -397,6 +397,8 @@ def get_command(cluster, # this error to lbann_errors. if isinstance(extra_lbann_flags, dict): # See `lbann --help` or src/proto/proto_common.cpp + # Commented out flags already have their own parameters + # in this function. allowed_flags = [ # 'model', # 'optimizer', @@ -558,7 +560,9 @@ def get_error_line(error_file_name): for line in error_file: if ('ERROR' in line) or ('LBANN error' in line) or \ ('Error:' in line) or \ - ('Expired or invalid job' in line): + ('Expired or invalid job' in line) or \ + ('Segmentation fault (core dumped)' in line) or \ + ('Relinquishing job allocation' in line): error_line = line break elif ('Stack trace:' in line) or \ diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index fdb3e01df4a..c6e33775e9c 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -141,7 +141,7 @@ def skeleton_performance_full_alexnet(cluster, dir_name, executables, print('Skip - ' + e) pytest.skip(e) if not weekly: - e = 'skeleton_performance_full_alexnet: Non-local testing' + e = 'skeleton_performance_full_alexnet: Not doing weekly testing' print('Skip - ' + e) pytest.skip(e) if compiler_name not in executables: @@ -233,11 +233,13 @@ def test_integration_performance_alexnet_exe(cluster, dirname, exe): skeleton_performance_alexnet(cluster, dirname, exes, 'exe', True) -# Run with python3 -m pytest -s test_integration_performance.py -k 'test_integration_performance_full_alexnet_exe' --exe= -def test_integration_performance_full_alexnet_exe(cluster, dirname, exe): +# Run with python3 -m pytest -s test_integration_performance.py -k 'test_integration_performance_full_alexnet_exe' --weekly --run --exe= +def test_integration_performance_full_alexnet_exe(cluster, dirname, weekly, + run, exe): if exe is None: e = 'test_integration_performance_full_alexnet_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_performance_full_alexnet(cluster, dirname, exes, 'exe', True) + skeleton_performance_full_alexnet(cluster, dirname, exes, 'exe', weekly, + run) diff --git a/bamboo/local_test.cmd b/bamboo/local_test.cmd new file mode 100644 index 00000000000..0b3c18fd93f --- /dev/null +++ b/bamboo/local_test.cmd @@ -0,0 +1,10 @@ +#!/bin/bash +#SBATCH --nodes 16 +#SBATCH --partition pbatch +#SBATCH --time 960 + +# Update "--time" above to increase/decrease allocation time. +# Update "executable" with your executable. +# Use "--integration-tests" to only run integration tests. +# Use "--unit-tests" to only run unit tests. +./local_test.sh --executable "../build/gnu.Release.pascal.llnl.gov/install/bin/lbann" diff --git a/bamboo/local_test.sh b/bamboo/local_test.sh new file mode 100755 index 00000000000..09d98743c2c --- /dev/null +++ b/bamboo/local_test.sh @@ -0,0 +1,106 @@ +#!/bin/bash -l + +# Local testing (i.e. 
not with Bamboo) + +################################################################ +# Help message +################################################################ + +function help_message { + local SCRIPT=$(basename ${0}) + local N=$(tput sgr0) # Normal text + local C=$(tput setf 4) # Colored text + cat << EOF +Run integration and unit tests locally, outside Bamboo. +Usage: ./${SCRIPT} [options] +Options: + ${C}--help${N} Display this help message and exit. + ${C}--executable${N} Specify executable to be used. Required field. + ${C}--integration-tests${N} Specify that only integration tests should be run. + ${C}--unit-tests${N} Specify that only unit tests should be run. +EOF +} + +################################################################ +# Parse command-line arguments +################################################################ + +EXECUTABLE= +INTEGRATION_TESTS=1 +UNIT_TESTS=1 +while :; do + case ${1} in + -h|--help) + # Help message + help_message + exit 0 + ;; + -e|--executable) + # Set executable + # -n: check if string has non-zero length. + if [ -n "${2}" ]; then + EXECUTABLE=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + help_message + exit 1 + fi + ;; + -i|--integration-tests) + # Run only integration tests + UNIT_TESTS=0 + ;; + -u|--unit-tests) + # Run only unit tests + INTEGRATION_TESTS=0 + ;; + -?*) + # Unknown option + echo "Unknown option (${1})" >&2 + exit 1 + ;; + *) + # Break loop if there are no more options + break + esac + shift +done + +# -z: check if string has zero length. +if [ -z ${EXECUTABLE} ]; then + echo "Executable must be set." + help_message + exit 1 +fi + +################################################################ +# Run tests +################################################################ + +# Assume user already has an executable (i.e. no need for compiler tests). +# Assume user already has 16 nodes allocated on a cluster. + +echo "EXECUTABLE=${EXECUTABLE}" +echo "INTEGRATION_TESTS=${INTEGRATION_TESTS}" +echo "UNIT_TESTS=${UNIT_TESTS}" +PYTHON=python3 + +echo "Task: Cleaning" +./clean.sh + +echo "Task: Integration Tests" +cd integration_tests +if [ ${INTEGRATION_TESTS} -ne 0 ]; then + $PYTHON -m pytest -s -vv --durations=0 --exe=${EXECUTABLE} +fi +cd .. + +echo "Task: Unit Tests" +cd unit_tests +if [ ${UNIT_TESTS} -ne 0 ]; then + $PYTHON -m pytest -s -vv --durations=0 --exe=${EXECUTABLE} +fi +cd .. 
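+
+# Example invocation (illustrative only; the executable path matches the
+# one in local_test.cmd above and will differ on other systems):
+#   ./local_test.sh --executable ../build/gnu.Release.pascal.llnl.gov/install/bin/lbann --unit-tests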
+ +echo "Task: Finished" diff --git a/bamboo/unit_tests/conftest.py b/bamboo/unit_tests/conftest.py index 12c5bf457ec..a750292c101 100644 --- a/bamboo/unit_tests/conftest.py +++ b/bamboo/unit_tests/conftest.py @@ -3,6 +3,7 @@ import tools import pytest, re, subprocess + def pytest_addoption(parser): cluster = re.sub('[0-9]+', '', subprocess.check_output( 'hostname'.split()).decode('utf-8').strip()) @@ -17,7 +18,11 @@ def pytest_addoption(parser): parser.addoption('--exes', action='store', default=default_exes, help='--exes={compiler_name: path}') # For local testing only - parser.addoption('--exe', action='store', help='--exe=') + parser.addoption('--data-reader-percent', action='store', default=1.0, + help='--data-reader-percent= -def test_unit_checkpoint_lenet_shared_exe(cluster, dirname, exe): +def test_unit_checkpoint_lenet_shared_exe(cluster, dirname, exe, data_reader_percent): if exe is None: e = 'test_unit_checkpoint_lenet_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'exe') + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'exe', data_reader_percent) # Run with python3 -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_distributed_exe' --exe= -def test_unit_checkpoint_lenet_distributed_exe(cluster, dirname, exe): +def test_unit_checkpoint_lenet_distributed_exe(cluster, dirname, exe, data_reader_percent): if exe is None: e = 'test_unit_checkpoint_lenet_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'exe') + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'exe', data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index 554c1e8dce0..5c0f65998c3 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -5,7 +5,7 @@ import os, sys -def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): +def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name, data_reader_percent=1.0): if compiler_name not in executables: e = 'skeleton_lbann2_reload: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -22,7 +22,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): data_reader_name='mnist', data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', dir_name=dir_name, - data_reader_percent=1.0, + data_reader_percent=data_reader_percent, ckpt_dir=no_ckpt_dir, model_path=model_path, optimizer_name='sgd', @@ -41,7 +41,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', data_reader_percent=1.0, + data_reader_name='mnist', data_reader_percent=data_reader_percent, ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', output_file_name=output_file_name, @@ -57,7 +57,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name): dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', data_reader_name='mnist', - data_reader_percent=1.0, + data_reader_percent=data_reader_percent, ckpt_dir=ckpt_dir, model_path='../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext', num_epochs=2, 
optimizer_name='sgd', @@ -123,10 +123,10 @@ def test_unit_lbann2_reload_intel19(cluster, exes, dirname): # Run with python3 -m pytest -s test_unit_lbann2_reload.py -k 'test_unit_lbann2_reload_exe' --exe= -def test_unit_lbann2_reload_exe(cluster, dirname, exe): +def test_unit_lbann2_reload_exe(cluster, dirname, exe, data_reader_percent): if exe is None: e = 'test_unit_lbann2_reload_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_lbann2_reload(cluster, exes, dirname, 'exe') + skeleton_lbann2_reload(cluster, exes, dirname, 'exe', data_reader_percent) diff --git a/model_zoo/vision/densenet.py b/model_zoo/vision/densenet.py index 52064c9baa8..b1b19c6b44a 100755 --- a/model_zoo/vision/densenet.py +++ b/model_zoo/vision/densenet.py @@ -490,17 +490,22 @@ def set_up_experiment(args, else: optimizer = lbann.contrib.args.create_optimizer(args) + # Setup trainer + trainer = lbann.Trainer() + # Save prototext to args.prototext if args.prototext: lbann.proto.save_prototext(args.prototext, + trainer=trainer, model=model, optimizer=optimizer, data_reader=data_reader_proto) - return model, data_reader_proto, optimizer + return trainer, model, data_reader_proto, optimizer def run_experiment(args, + trainer, model, data_reader_proto, optimizer): @@ -530,7 +535,7 @@ def run_experiment(args, imagenet_dir(data_set='val', num_classes=classes), imagenet_labels(data_set='val', num_classes=classes))) - lbann.contrib.lc.launcher.run(model, + lbann.contrib.lc.launcher.run(trainer, model, data_reader_proto, optimizer, job_name='lbann_densenet', @@ -565,7 +570,7 @@ def main(): # Setup experiment # ---------------------------------- - (model, data_reader_proto, optimizer) = set_up_experiment( + (trainer, model, data_reader_proto, optimizer) = set_up_experiment( args, input_node, probs, labels) # ---------------------------------- @@ -573,7 +578,7 @@ def main(): # ---------------------------------- # Note: Use `lbann.run` instead for non-LC systems. 
- run_experiment(args, model, data_reader_proto, optimizer) + run_experiment(args, trainer, model, data_reader_proto, optimizer) if __name__ == '__main__': diff --git a/scripts/build_lbann_lc.sh b/scripts/build_lbann_lc.sh index ee7c61ae4d1..397c2361851 100755 --- a/scripts/build_lbann_lc.sh +++ b/scripts/build_lbann_lc.sh @@ -408,8 +408,8 @@ fi # Add compiler optimization flags if [ "${BUILD_TYPE}" == "Release" ]; then if [ "${COMPILER}" == "gnu" ]; then - C_FLAGS="${C_FLAGS} -O3 ${INSTRUMENT}" - CXX_FLAGS="${CXX_FLAGS} -O3 ${INSTRUMENT}" + C_FLAGS="${C_FLAGS} -O3 ${INSTRUMENT} -fno-omit-frame-pointer" + CXX_FLAGS="${CXX_FLAGS} -O3 ${INSTRUMENT} -fno-omit-frame-pointer" Fortran_FLAGS="${Fortran_FLAGS} -O3" if [ "${CLUSTER}" == "catalyst" ]; then C_FLAGS="${C_FLAGS} -march=ivybridge -mtune=ivybridge" @@ -435,8 +435,8 @@ if [ "${BUILD_TYPE}" == "Release" ]; then fi else if [ "${COMPILER}" == "gnu" ]; then - C_FLAGS="${C_FLAGS} -g ${INSTRUMENT}" - CXX_FLAGS="${CXX_FLAGS} -g ${INSTRUMENT}" + C_FLAGS="${C_FLAGS} -g ${INSTRUMENT} -fno-omit-frame-pointer" + CXX_FLAGS="${CXX_FLAGS} -g ${INSTRUMENT} -fno-omit-frame-pointer" Fortran_FLAGS="${Fortran_FLAGS} -g" fi fi From 4988efa76ff151f990598c9554b44586ddd40ba0 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Wed, 26 Jun 2019 16:26:50 -0700 Subject: [PATCH 298/634] Update unit tests to use Python front end --- bamboo/allocate_and_run.sh | 5 + bamboo/common_python/tools.py | 13 ++ bamboo/run.sh | 1 + bamboo/unit_tests/experiments/README.md | 1 + .../test_unit_mnist_ridge_regression.py | 124 +++++++++++++++--- 5 files changed, 127 insertions(+), 17 deletions(-) create mode 100644 bamboo/unit_tests/experiments/README.md diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index 6ada77c39bd..3876955f9e7 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -5,6 +5,8 @@ CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') echo "allocate_and_run.sh CLUSTER=" echo $CLUSTER +export PYTHONPATH=${HOME}/.local/lib/python3.7/site-packages:${PYTHONPATH} + WEEKLY=0 while :; do case ${1} in @@ -53,4 +55,7 @@ elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTE else timeout -k 5 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh fi +else + echo "allocate_and_run.sh. Unsupported cluster CLUSTER=" + echo $CLUSTER fi diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 85c41fa9b58..a908cf05120 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -496,12 +496,25 @@ def process_executable_existence(executable, skip_no_exe=True): if not executable_exists: error_string = 'Executable does not exist: %s' % executable if skip_no_exe: + print('Skip - ' + error_string) import pytest pytest.skip(error_string) else: raise Exception(error_string) +def process_executable(name, compiler_name, executables): + if compiler_name not in executables: + e = '{n}: default_exes[{c}] does not exist'.format( + n=name, c=compiler_name) + print('Skip - ' + e) + import pytest + pytest.skip(e) + executable_path = executables[compiler_name] + print('{n}: executable_path={e}'.format(n=name, e=executable_path)) + process_executable_existence(executable_path) + + def get_spack_exes(default_dirname, cluster): exes = {} diff --git a/bamboo/run.sh b/bamboo/run.sh index 7ce3597a0bd..c90d256bb5c 100755 --- a/bamboo/run.sh +++ b/bamboo/run.sh @@ -51,6 +51,7 @@ cd .. 
echo "Task: Unit Tests" cd unit_tests +module load python/3.6.4 $PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml cd .. diff --git a/bamboo/unit_tests/experiments/README.md b/bamboo/unit_tests/experiments/README.md new file mode 100644 index 00000000000..0c210a7e6e8 --- /dev/null +++ b/bamboo/unit_tests/experiments/README.md @@ -0,0 +1 @@ +Subdirectory for test experiments diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py index d89158deb52..4321d0f0cdd 100644 --- a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py +++ b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py @@ -1,26 +1,116 @@ import sys sys.path.insert(0, '../common_python') import tools -import pytest + import os +import pytest + + +def skeleton_mnist_ridge_regression(cluster, executables, dir_name, + compiler_name): + tools.process_executable( + 'skeleton_mnist_ridge_regression', compiler_name, executables) + + if compiler_name == 'exe': + exe = executables[compiler_name] + bin_dir = os.path.dirname(exe) + install_dir = os.path.dirname(bin_dir) + build_path = '{i}/lib/python3.7/site-packages'.format(i=install_dir) + else: + if compiler_name == 'clang6': + path = 'clang.Release' + elif compiler_name == 'clang6_debug': + path = 'clang.Debug' + elif compiler_name == 'gcc7': + path = 'gnu.Release' + elif compiler_name == 'clang6_debug': + path = 'gnu.Debug' + elif compiler_name == 'intel19': + path = 'intel.Release' + elif compiler_name == 'intel19_debug': + path = 'intel.Debug' + path = '{p}.{c}.llnl.gov'.format(p=path, c=cluster) + build_path = '{d}/build/{p}/install/lib/python3.7/site-packages'.format( + d=dir_name, p=path) + print('build_path={b}'.format(b=build_path)) + sys.path.append(build_path) + + # Model + # Converted from lbann/model_zoo/tests/model_mnist_ridge_regression.prototext. + # Equivalent to prototext's "Layers" section. + import lbann + input_node = lbann.Input() + images_node = lbann.Identity(input_node) + image_labels_node = lbann.Identity(input_node) + fc_node = lbann.FullyConnected(images_node, num_neurons=10, has_bias=True) + mse = lbann.MeanSquaredError([fc_node, image_labels_node]) + # Equivalent to prototext's "Objective function" section. + layers = list(lbann.traverse_layer_graph(input_node)) + weights = set() + for l in layers: + weights.update(l.weights) + # scale == weight decay + l2_reg = lbann.L2WeightRegularization(weights=weights, scale=0.01) + objective_function = lbann.ObjectiveFunction([mse, l2_reg]) + # Equivalent to prototext's "Metrics" section. + metrics = [lbann.Metric(mse, name='mean squared error')] + # Equivalent to prototext's "Callbacks" section. + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackCheckGradients( + verbose=False, error_on_failure=True)] + # Equivalent to prototext's model-level parameters. + model = lbann.Model(mini_batch_size=131, + epochs=4, + layers=layers, + objective_function=objective_function, + metrics=metrics, + callbacks=callbacks) + + # Data Reader + # TODO: Do we also want to programatically construct the data reader, not just the model? 
+ data_reader_prototext_file = os.path.join(dir_name, + 'model_zoo', + 'data_readers', + 'data_reader_mnist.prototext') + data_reader_proto = lbann.lbann_pb2.LbannPB() + with open(data_reader_prototext_file, 'r') as f: + import google.protobuf.text_format as txtf + txtf.Merge(f.read(), data_reader_proto) + data_reader_proto = data_reader_proto.data_reader + + # Optimizer + # Learning rate from model_zoo/optimizers/opt_adam.prototext + optimizer = lbann.optimizer.Adam(learn_rate=0.001, beta1=0.9, beta2=0.99, eps=1e-8) + + # kwargs + kwargs = { + 'account': 'guests', + 'nodes': 1, + 'partition': 'pbatch', + 'procs_per_node': 1 + } + + if cluster == 'lassen': + kwargs['lbann_args'] = '--data_filedir_train=/p/gpfs1/brainusr/datasets/MNIST --data_filedir_test=/p/gpfs1/brainusr/datasets/MNIST' + # Run + experiment_dir = '{d}/bamboo/unit_tests/experiments/mnist_ridge_regression_{c}'.format( + d=dir_name, c=compiler_name) + # Setup trainer + trainer = lbann.Trainer() + import lbann.contrib.lc.launcher + return_code = lbann.contrib.lc.launcher.run( + trainer=trainer, + experiment_dir=experiment_dir, + model=model, + data_reader=data_reader_proto, + optimizer=optimizer, + job_name='lbann_ridge_regression', + **kwargs) -def skeleton_mnist_ridge_regression(cluster, executables, dir_name, compiler_name): - if compiler_name not in executables: - e = 'skeleton_mnist_ridge_regression: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/mnist_ridge_regression_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/mnist_ridge_regression_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=1, dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', - model_folder='tests', model_name='mnist_ridge_regression', - optimizer_name='adam', - output_file_name=output_file_name, error_file_name=error_file_name) - return_code = os.system(command) + error_file_name = '{e}/err.log'.format( + e=experiment_dir, c=compiler_name) tools.assert_success(return_code, error_file_name) From 716237c394f82266698ed37a81c9fed9a13fb165 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Thu, 12 Sep 2019 18:21:48 -0700 Subject: [PATCH 299/634] Clean up LTFB callback code Use namespaces instead of double underscores. Closes #1220. --- src/callbacks/ltfb.cpp | 62 ++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/src/callbacks/ltfb.cpp b/src/callbacks/ltfb.cpp index 96a3386de48..f82a331370e 100644 --- a/src/callbacks/ltfb.cpp +++ b/src/callbacks/ltfb.cpp @@ -42,7 +42,7 @@ namespace callback { namespace { -/** Generate partner trainer assignments. +/** @brief Generate partner trainer assignments. * * Requires a scatter from the world master process. If there are an * odd number of trainers, one of them is partnered with itself. @@ -93,19 +93,20 @@ El::Int get_partner_trainer(lbann_comm& comm, } } -/** Exchange weights values with partner trainer. - * - * @param weights_names Names of weights to exchange. If empty, +/// See @c lbann::callbacks::ltfb::communication_algorithm::sendrecv_weights +namespace sendrecv_weights { + +/** @param weights_names Names of weights to exchange. If empty, * then all weights are exchanged. * @param send_weights Weights values sent to partner. 
* @param recv_weights Weights values recieved from partner. */ -void exchange_models__sendrecv_weights(lbann_comm& comm, - El::Int partner_trainer, - const std::set& weights_names, - const std::vector& send_weights, - std::vector& recv_weights, - bool exchange_hyperparameters) { +void exchange_models(lbann_comm& comm, + El::Int partner_trainer, + const std::set& weights_names, + const std::vector& send_weights, + std::vector& recv_weights, + bool exchange_hyperparameters) { // Get partner process const El::Int rank_in_trainer = comm.get_rank_in_trainer(); @@ -197,11 +198,21 @@ void exchange_models__sendrecv_weights(lbann_comm& comm, } -void exchange_models__checkpoint_file(lbann_comm& comm, - El::Int partner_trainer, - model& m, - const std::set& weights_names, - const std::vector& local_weights) { +} // namespace sendrecv_weights + +/// See @c lbann::callbacks::ltfb::communication_algorithm::checkpoint_file +namespace checkpoint_file { + +/** @param weights_names Names of weights to exchange. If empty, + * then all weights are exchanged. + * @param local_weight Copies of weights. Used to restore weights + * that we don't want to exchange. + */ +void exchange_models(lbann_comm& comm, + El::Int partner_trainer, + model& m, + const std::set& weights_names, + const std::vector& local_weights) { // Checkpoint directories const auto& c = m.get_execution_context(); @@ -261,7 +272,7 @@ void exchange_models__checkpoint_file(lbann_comm& comm, } -void restore_local_model__checkpoint_file(lbann_comm& comm, model& m) { +void restore_local_model(lbann_comm& comm, model& m) { // Checkpoint directories const auto& c = m.get_execution_context(); @@ -285,6 +296,7 @@ void restore_local_model__checkpoint_file(lbann_comm& comm, model& m) { } } +} // namespace checkpoint_file /** Get mean metric value with validation set. 
*/ EvalType evaluate(model& m, const std::string& metric_name) { @@ -311,10 +323,8 @@ EvalType evaluate(model& m, const std::string& metric_name) { } } if (!found_metric) { - std::stringstream err; - err << "could not find metric \"" << metric_name << "\"" - << "in model \"" << m.get_name() << "\""; - LBANN_ERROR(err.str()); + LBANN_ERROR("could not find metric \"",metric_name,"\" ", + "in model \"",m.get_name(),"\""); } // Mark the data store as loaded - Note that this is a temporary fix @@ -327,7 +337,7 @@ EvalType evaluate(model& m, const std::string& metric_name) { } -} // namespace +} // namespace ltfb::ltfb(El::Int batch_interval, std::string metric_name, @@ -456,7 +466,7 @@ void ltfb::on_batch_begin(model *m) { } switch (m_comm_algo) { case communication_algorithm::sendrecv_weights: - exchange_models__sendrecv_weights(comm, + sendrecv_weights::exchange_models(comm, partner_trainer, m_weights_names, local_weights, @@ -464,7 +474,7 @@ void ltfb::on_batch_begin(model *m) { m_exchange_hyperparameters); break; case communication_algorithm::checkpoint_file: - exchange_models__checkpoint_file(comm, + checkpoint_file::exchange_models(comm, partner_trainer, *m, m_weights_names, @@ -493,7 +503,7 @@ void ltfb::on_batch_begin(model *m) { } break; case communication_algorithm::checkpoint_file: - restore_local_model__checkpoint_file(comm, *m); + checkpoint_file::restore_local_model(comm, *m); break; default: LBANN_ERROR("invalid LTFB communication algorithm"); @@ -525,9 +535,7 @@ ltfb::string_to_comm_algo(const std::string& str) { } // Invalid LTFB communication algorithm - std::stringstream err; - err << "invalid LTFB communication algorithm (" << str << ")"; - LBANN_ERROR(err.str()); + LBANN_ERROR("invalid LTFB communication algorithm (",str,")"); return communication_algorithm::sendrecv_weights; } From f88f61e05b2a1c37f4692abd39c9320b6c03c4a0 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Fri, 13 Sep 2019 16:27:59 -0700 Subject: [PATCH 300/634] Implementation of argmax and tovec layers (#1237) * Implementation of argmax and tovec layers * Add argmin layer Parallelized CPU argmax layer with OpenMP. Moved argmax layer to misc layers. * Rename tovec_layer to one_hot_layer Parallelized CPU one-hot layer with OpenMP. Moved one-hot layer to misc layers. 
* Implement one-hot layer on GPU --- include/lbann/layers/misc/CMakeLists.txt | 3 + include/lbann/layers/misc/argmax.hpp | 77 +++++++++++++++++++++ include/lbann/layers/misc/argmin.hpp | 77 +++++++++++++++++++++ include/lbann/layers/misc/one_hot.hpp | 81 ++++++++++++++++++++++ include/lbann/lbann.hpp | 3 + src/layers/misc/CMakeLists.txt | 4 ++ src/layers/misc/argmax.cpp | 49 ++++++++++++++ src/layers/misc/argmin.cpp | 49 ++++++++++++++ src/layers/misc/one_hot.cpp | 54 +++++++++++++++ src/layers/misc/one_hot.cu | 86 ++++++++++++++++++++++++ src/proto/factories/layer_factory.cpp | 28 ++++++++ src/proto/layers.proto | 29 ++++++++ 12 files changed, 540 insertions(+) create mode 100644 include/lbann/layers/misc/argmax.hpp create mode 100644 include/lbann/layers/misc/argmin.hpp create mode 100644 include/lbann/layers/misc/one_hot.hpp create mode 100644 src/layers/misc/argmax.cpp create mode 100644 src/layers/misc/argmin.cpp create mode 100644 src/layers/misc/one_hot.cpp create mode 100644 src/layers/misc/one_hot.cu diff --git a/include/lbann/layers/misc/CMakeLists.txt b/include/lbann/layers/misc/CMakeLists.txt index 2b5808fdfa7..06c9e2acfb7 100644 --- a/include/lbann/layers/misc/CMakeLists.txt +++ b/include/lbann/layers/misc/CMakeLists.txt @@ -5,6 +5,9 @@ set_full_path(THIS_DIR_HEADERS channelwise_mean.hpp mini_batch_index.hpp mini_batch_size.hpp + argmax.hpp + argmin.hpp + one_hot.hpp ) # Propagate the files up the tree diff --git a/include/lbann/layers/misc/argmax.hpp b/include/lbann/layers/misc/argmax.hpp new file mode 100644 index 00000000000..2627396a038 --- /dev/null +++ b/include/lbann/layers/misc/argmax.hpp @@ -0,0 +1,77 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED +#define LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +/** @brief Get index of maximum-value tensor entry + * + * Expects a 1-D input tensor. If multiple entries have the same + * maximum value, outputs the index of the first one. 
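+ *
+ * For example (values chosen purely for illustration), the input
+ * (1,3,3,2) produces the output 1, since the maximum value 3 first
+ * occurs at index 1.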
+ */ +template +class argmax_layer : public Layer { +public: + + argmax_layer(lbann_comm* comm) : Layer(comm) { + static_assert(Layout == data_layout::DATA_PARALLEL, + "argmax layer only supports data parallel layout"); + static_assert(Device == El::Device::CPU, + "argmax layer only supports CPU"); + } + argmax_layer* copy() const override { return new argmax_layer(*this); } + std::string get_type() const override { return "argmax"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_dims() override { + Layer::setup_dims(); + set_output_dims({1}); + + // Make sure input tensor is 1-D + const auto input_dims = get_input_dims(); + if (input_dims.size() != 1) { + LBANN_ERROR(get_type()," layer \"",get_name(),"\" ", + "expects a 1-D input tensor, ", + "but parent layer \"",m_parent_layers[0]->get_name(),"\" ", + "outputs a ",input_dims.size(),"-D tensor"); + } + + } + + void fp_compute() override; + +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED diff --git a/include/lbann/layers/misc/argmin.hpp b/include/lbann/layers/misc/argmin.hpp new file mode 100644 index 00000000000..844e3637b35 --- /dev/null +++ b/include/lbann/layers/misc/argmin.hpp @@ -0,0 +1,77 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED +#define LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +/** @brief Get index of minimum-value tensor entry + * + * Expects a 1-D input tensor. If multiple entries have the same + * minimum value, outputs the index of the first one. 
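+ *
+ * For example (values chosen purely for illustration), the input
+ * (3,1,1,2) produces the output 1, since the minimum value 1 first
+ * occurs at index 1.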
+ */ +template +class argmin_layer : public Layer { +public: + + argmin_layer(lbann_comm* comm) : Layer(comm) { + static_assert(Layout == data_layout::DATA_PARALLEL, + "argmin layer only supports data parallel layout"); + static_assert(Device == El::Device::CPU, + "argmin layer only supports CPU"); + } + argmin_layer* copy() const override { return new argmin_layer(*this); } + std::string get_type() const override { return "argmin"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_dims() override { + Layer::setup_dims(); + set_output_dims({1}); + + // Make sure input tensor is 1-D + const auto input_dims = get_input_dims(); + if (input_dims.size() != 1) { + LBANN_ERROR(get_type()," layer \"",get_name(),"\" ", + "expects a 1-D input tensor, ", + "but parent layer \"",m_parent_layers[0]->get_name(),"\" ", + "outputs a ",input_dims.size(),"-D tensor"); + } + + } + + void fp_compute() override; + +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED diff --git a/include/lbann/layers/misc/one_hot.hpp b/include/lbann/layers/misc/one_hot.hpp new file mode 100644 index 00000000000..aeaca0ab975 --- /dev/null +++ b/include/lbann/layers/misc/one_hot.hpp @@ -0,0 +1,81 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED +#define LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +/** @brief Convert index to a one-hot vector + * + * Expects a scalar input tensor and outputs a 1-D output tensor with + * @c size entries. The input is interpreted as an index, and output + * entries are one if they correspond to that index and zero + * otherwise. If the input is outside @f$[0,\text{size})@f$, then the + * output is all zeros. 
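+ *
+ * For example (an illustration of the rule above), with @c size = 4
+ * an input of 2 produces the vector (0,0,1,0), while an out-of-range
+ * input such as 7 produces (0,0,0,0).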
+ */ +template +class one_hot_layer : public Layer { +public: + + one_hot_layer(lbann_comm* comm, size_t size) : Layer(comm) { + set_output_dims({static_cast(size)}); + static_assert(Layout == data_layout::DATA_PARALLEL, + "one-hot layer only supports data-parallel layout"); + } + one_hot_layer* copy() const override { return new one_hot_layer(*this); } + std::string get_type() const override { return "one-hot"; } + data_layout get_data_layout() const override { return Layout; } + El::Device get_device_allocation() const override { return Device; } + +protected: + + void setup_dims() override { + Layer::setup_dims(); + + // Make sure input tensor is scalar + if (get_input_size() != 1) { + const auto input_dims = get_input_dims(); + std::ostringstream dim_ss; + for (size_t i = 0; i < input_dims.size(); ++i) { + dim_ss << (i > 0 ? "x" : "") << input_dims[i]; + } + LBANN_ERROR(get_type()," layer \"",get_name(),"\" ", + "received an input tensor with invalid dimensions ", + "(expected 1, got ",dim_ss.str(),")"); + } + + } + + void fp_compute() override; + +}; + +} // namespace lbann + +#endif // LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp index 094eab8e2b8..d4cc34730a0 100644 --- a/include/lbann/lbann.hpp +++ b/include/lbann/lbann.hpp @@ -112,6 +112,9 @@ #include "lbann/layers/misc/channelwise_mean.hpp" #include "lbann/layers/misc/mini_batch_index.hpp" #include "lbann/layers/misc/mini_batch_size.hpp" +#include "lbann/layers/misc/argmax.hpp" +#include "lbann/layers/misc/argmin.hpp" +#include "lbann/layers/misc/one_hot.hpp" /// Data readers #include "lbann/data_readers/data_reader_imagenet.hpp" diff --git a/src/layers/misc/CMakeLists.txt b/src/layers/misc/CMakeLists.txt index 69fe4933e5f..f0b66a776b7 100644 --- a/src/layers/misc/CMakeLists.txt +++ b/src/layers/misc/CMakeLists.txt @@ -3,6 +3,9 @@ set_full_path(THIS_DIR_SOURCES covariance.cpp variance.cpp channelwise_mean.cpp + argmax.cpp + argmin.cpp + one_hot.cpp ) if (LBANN_HAS_CUDA) @@ -11,6 +14,7 @@ if (LBANN_HAS_CUDA) covariance.cu variance.cu channelwise_mean.cu + one_hot.cu ) endif () diff --git a/src/layers/misc/argmax.cpp b/src/layers/misc/argmax.cpp new file mode 100644 index 00000000000..c038075b5d0 --- /dev/null +++ b/src/layers/misc/argmax.cpp @@ -0,0 +1,49 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/layers/misc/argmax.hpp"
+#include <algorithm>
+
+namespace lbann {
+
+template <>
+void argmax_layer<data_layout::DATA_PARALLEL, El::Device::CPU>
+  ::fp_compute() {
+  const auto& local_input = dynamic_cast<const CPUMat&>(get_local_prev_activations());
+  auto& local_output = dynamic_cast<CPUMat&>(get_local_activations());
+  const El::Int local_height = local_input.Height();
+  const El::Int local_width = local_input.Width();
+  LBANN_OMP_PARALLEL_FOR
+  for (El::Int col = 0; col < local_width; ++col) {
+    const auto buf_start = local_input.LockedBuffer(0, col);
+    const auto buf_max = std::max_element(buf_start,
+                                          buf_start+local_height);
+    const auto max_ind = std::distance(buf_start, buf_max);
+    local_output(0, col) = static_cast<DataType>(max_ind);
+  }
+}
+
+} // namespace lbann
diff --git a/src/layers/misc/argmin.cpp b/src/layers/misc/argmin.cpp
new file mode 100644
index 00000000000..c0e1d17435a
--- /dev/null
+++ b/src/layers/misc/argmin.cpp
@@ -0,0 +1,49 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/layers/misc/argmin.hpp"
+#include <algorithm>
+
+namespace lbann {
+
+template <>
+void argmin_layer<data_layout::DATA_PARALLEL, El::Device::CPU>
+  ::fp_compute() {
+  const auto& local_input = dynamic_cast<const CPUMat&>(get_local_prev_activations());
+  auto& local_output = dynamic_cast<CPUMat&>(get_local_activations());
+  const El::Int local_height = local_input.Height();
+  const El::Int local_width = local_input.Width();
+  LBANN_OMP_PARALLEL_FOR
+  for (El::Int col = 0; col < local_width; ++col) {
+    const auto buf_start = local_input.LockedBuffer(0, col);
+    const auto buf_min = std::min_element(buf_start,
+                                          buf_start+local_height);
+    const auto min_ind = std::distance(buf_start, buf_min);
+    local_output(0, col) = static_cast<DataType>(min_ind);
+  }
+}
+
+} // namespace lbann
diff --git a/src/layers/misc/one_hot.cpp b/src/layers/misc/one_hot.cpp
new file mode 100644
index 00000000000..c3531211393
--- /dev/null
+++ b/src/layers/misc/one_hot.cpp
@@ -0,0 +1,54 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/layers/misc/one_hot.hpp"
+
+namespace lbann {
+
+template <>
+void one_hot_layer<data_layout::DATA_PARALLEL, El::Device::CPU>
+  ::fp_compute() {
+
+  // Local matrices
+  const auto& local_input = dynamic_cast<const CPUMat&>(get_local_prev_activations());
+  auto& local_output = dynamic_cast<CPUMat&>(get_local_activations());
+  const El::Int local_height = local_output.Height();
+  const El::Int local_width = local_output.Width();
+
+  // Populate one-hot vectors
+  El::Zero(local_output);
+  LBANN_OMP_PARALLEL_FOR
+  for (El::Int col = 0; col < local_width; ++col) {
+    const auto& ind = local_input(0, col);
+    if (DataType{0} <= ind && ind < DataType(local_height)) {
+      const El::Int row = static_cast<El::Int>(ind);
+      local_output(row, col) = DataType{1};
+    }
+  }
+
+}
+
+} // namespace lbann
diff --git a/src/layers/misc/one_hot.cu b/src/layers/misc/one_hot.cu
new file mode 100644
index 00000000000..171cba8ba39
--- /dev/null
+++ b/src/layers/misc/one_hot.cu
@@ -0,0 +1,86 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/layers/misc/one_hot.hpp"
+
+namespace lbann {
+
+namespace {
+
+/**
+ * On input, output is assumed to be filled with zeros.
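+ *
+ * Each thread grid-strides across the matrix columns: it reads the
+ * index stored for a column and, when that index lies in [0,height),
+ * writes a single one into the corresponding row of that column.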
+ *
+ * Block dimensions: bsize x 1 x 1
+ *
+ * Grid dimensions: (width / bsize) x 1 x 1
+ */
+__global__ void fp_kernel(size_t height,
+                          size_t width,
+                          const DataType* __restrict__ indices,
+                          size_t indices_stride,
+                          DataType* __restrict__ output,
+                          size_t output_ldim) {
+  const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
+  const size_t nthreads = blockDim.x * gridDim.x;
+  for (size_t col = gid; col < width; col += nthreads) {
+    const auto& ind = indices[col*indices_stride];
+    if (DataType{0} <= ind && ind < DataType(height)) {
+      const size_t row = static_cast<size_t>(ind);
+      output[row+col*output_ldim] = DataType{1};
+    }
+  }
+}
+
+} // namespace
+
+template <>
+void one_hot_layer<data_layout::DATA_PARALLEL, El::Device::GPU>
+  ::fp_compute() {
+
+  // Local matrices
+  const auto& local_input = dynamic_cast<const GPUMat&>(get_local_prev_activations());
+  auto& local_output = dynamic_cast<GPUMat&>(get_local_activations());
+
+  // Populate one-hot vectors
+  El::Zero(local_output);
+  if (!local_output.IsEmpty()) {
+    const size_t local_height = local_output.Height();
+    const size_t local_width = local_output.Width();
+    constexpr size_t block_size = 64;
+    const size_t grid_size = (local_width + block_size - 1) / block_size;
+    fp_kernel
+      <<<grid_size, block_size, 0, El::GPUManager::Stream()>>>(
+        local_height,
+        local_width,
+        local_input.LockedBuffer(),
+        local_input.LDim(),
+        local_output.Buffer(),
+        local_output.LDim());
+  }
+
+}
+
+} // namespace lbann
diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp
index 74dc45a8488..8ba40ae4df3 100644
--- a/src/proto/factories/layer_factory.cpp
+++ b/src/proto/factories/layer_factory.cpp
@@ -61,6 +61,9 @@
 #include "lbann/layers/misc/mini_batch_index.hpp"
 #include "lbann/layers/misc/mini_batch_size.hpp"
 #include "lbann/layers/misc/variance.hpp"
+#include "lbann/layers/misc/argmax.hpp"
+#include "lbann/layers/misc/argmin.hpp"
+#include "lbann/layers/misc/one_hot.hpp"
 #include "lbann/layers/regularizers/batch_normalization.hpp"
 #include "lbann/layers/regularizers/dropout.hpp"
 #include "lbann/layers/regularizers/local_response_normalization.hpp"
@@ -707,6 +710,31 @@ std::unique_ptr<Layer> construct_layer(
   }
   CONSTRUCT_LAYER(mini_batch_index);
   CONSTRUCT_LAYER(mini_batch_size);
+  if (proto_layer.has_argmax()) {
+    if (Layout == data_layout::DATA_PARALLEL && Device == El::Device::CPU) {
+      return lbann::make_unique<argmax_layer<data_layout::DATA_PARALLEL, El::Device::CPU>>(comm);
+    } else {
+      LBANN_ERROR("argmax layer is only supported with "
+                  "a data-parallel layout and on CPU");
+    }
+  }
+  if (proto_layer.has_argmin()) {
+    if (Layout == data_layout::DATA_PARALLEL && Device == El::Device::CPU) {
+      return lbann::make_unique<argmin_layer<data_layout::DATA_PARALLEL, El::Device::CPU>>(comm);
+    } else {
+      LBANN_ERROR("argmin layer is only supported with "
+                  "a data-parallel layout and on CPU");
+    }
+  }
+  if (proto_layer.has_one_hot()) {
+    if (Layout == data_layout::DATA_PARALLEL) {
+      const auto& params = proto_layer.one_hot();
+      return lbann::make_unique<one_hot_layer<data_layout::DATA_PARALLEL, Device>>(comm, params.size());
+    } else {
+      LBANN_ERROR("one-hot layer is only supported with "
+                  "a data-parallel layout");
+    }
+  }
 
   // Throw exception if layer has not been constructed
   err << "could not construct layer " << proto_layer.name();
diff --git a/src/proto/layers.proto b/src/proto/layers.proto
index 6d4a28b16df..bd1b684c0e1 100644
--- a/src/proto/layers.proto
+++ b/src/proto/layers.proto
@@ -175,6 +175,10 @@ message Layer {
     ChannelwiseMean channelwise_mean = 602;
     MiniBatchIndex mini_batch_index = 603;
     MiniBatchSize mini_batch_size = 604;
+    Argmax argmax = 605;
+    Argmin argmin = 606;
+    OneHot one_hot = 607;
+
   }
 
   ///////////////////////
@@ -527,6 +531,7 @@ message Layer {
 
   //////////////////////////
  // Miscellaneous
layers // ////////////////////////// + message Covariance { bool biased = 1; //Whether to use a biased covariance estimate } @@ -536,6 +541,30 @@ message Layer { message ChannelwiseMean {} message MiniBatchIndex {} message MiniBatchSize {} + + // Get index of maximum-value tensor entry + // + // Expects a 1-D input tensor. If multiple entries have the same + // maximum value, outputs the index of the first one. + message Argmax {} + + // Get index of minimum-value tensor entry + // + // Expects a 1-D input tensor. If multiple entries have the same + // minimum value, outputs the index of the first one. + message Argmin {} + + // Convert index to a one-hot vector + // + // Expects a scalar input tensor and outputs a 1-D output tensor. + // The input is interpreted as an index, and output entries are one + // if they correspond to that index and zero otherwise. If the input + // is outside [0,size), then the output is all zeros. + message OneHot { + // Size of one-hot vector + int64 size = 1; + } + }// message Layer //note: I'd like to put this enum inside of Layer, but if I do the enum values From d03b9492c79517ce2291b0f61bd492fe9eded379 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 16 Sep 2019 15:19:25 -0700 Subject: [PATCH 301/634] Fix bug in CIFAR autoencoder integration test --- .../autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext | 1 - 1 file changed, 1 deletion(-) diff --git a/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext b/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext index b829a4af5f3..463b9f50484 100644 --- a/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext +++ b/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext @@ -7,7 +7,6 @@ model { data_layout: "data_parallel" mini_batch_size: 128 num_epochs: 10 - procs_per_trainer: 0 disable_cuda: true ################################################### From 0266b2fc385b453f75fb6406be61bd73c1573e77 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 17 Sep 2019 14:24:46 -0700 Subject: [PATCH 302/634] Add unit test for Python data reader (#1254) * Add unit test for Python data reader * Autogenerate test name in unit test for Python data reader --- .../unit_tests/test_unit_datareader_python.py | 211 ++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 bamboo/unit_tests/test_unit_datareader_python.py diff --git a/bamboo/unit_tests/test_unit_datareader_python.py b/bamboo/unit_tests/test_unit_datareader_python.py new file mode 100644 index 00000000000..be83d06fe8a --- /dev/null +++ b/bamboo/unit_tests/test_unit_datareader_python.py @@ -0,0 +1,211 @@ +import os +import os.path +import sys +import numpy as np +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file and calls the +# functions below to ingest data. This is the only part of the script +# that should be executed when the script is imported, or else the +# Python data reader might misbehave. 
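+#
+# A quick sanity check of the contract the sample access functions
+# below must satisfy (illustrative only; assumes this file is
+# importable as the module test_unit_datareader_python):
+#
+#   import test_unit_datareader_python as m
+#   assert m.get_sample(0).shape == m.sample_dims()
+#   assert m.num_samples() == 23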
+ +# Data +np.random.seed(20190708) +_num_samples = 23 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Layer graph + x = lbann.Input() + obj = lbann.L2Norm2(x) + layers = list(lbann.traverse_layer_graph(x)) + metric = lbann.Metric(obj, name='obj') + callbacks = [] + + # Compute expected value with NumPy + vals = [] + for i in range(num_samples()): + x = get_sample(i) + obj = np.inner(x, x) + vals.append(obj) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Construct model + mini_batch_size = 5 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=layers, + metrics=[metric], + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + module_name = os.path.splitext(os.path.basename(current_file))[0] + + # Base data reader message + message = lbann.reader_pb2.DataReader() + + # Training set data reader + # TODO: This can be removed once + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
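+    # Roughly the prototext equivalent of the reader entry assembled
+    # below (illustrative only):
+    #
+    #   reader {
+    #     name: "python"
+    #     role: "train"
+    #     percent_of_data_to_use: 1.0
+    #     python {
+    #       module: "test_unit_datareader_python"
+    #       sample_function: "get_sample"
+    #       num_samples_function: "num_samples"
+    #       sample_dims_function: "sample_dims"
+    #     }
+    #   }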
+    data_reader = message.reader.add()
+    data_reader.name = 'python'
+    data_reader.role = 'train'
+    data_reader.percent_of_data_to_use = 1.0
+    data_reader.python.module = module_name
+    data_reader.python.module_dir = current_dir
+    data_reader.python.sample_function = 'get_sample'
+    data_reader.python.num_samples_function = 'num_samples'
+    data_reader.python.sample_dims_function = 'sample_dims'
+
+    # Test set data reader
+    data_reader = message.reader.add()
+    data_reader.name = 'python'
+    data_reader.role = 'test'
+    data_reader.percent_of_data_to_use = 1.0
+    data_reader.python.module = module_name
+    data_reader.python.module_dir = current_dir
+    data_reader.python.sample_function = 'get_sample'
+    data_reader.python.num_samples_function = 'num_samples'
+    data_reader.python.sample_dims_function = 'sample_dims'
+
+    return message
+
+# ==============================================
+# Setup PyTest
+# ==============================================
+
+# Generate test name based on file name
+_test_name = os.path.splitext(os.path.basename(current_file))[0]
+
+# Primary test method
+def _test(cluster, executables, dir_name, compiler_name):
+    tools.process_executable(_test_name, compiler_name, executables)
+
+    # Import LBANN Python frontend
+    if compiler_name == 'exe':
+        exe = executables[compiler_name]
+        bin_dir = os.path.dirname(exe)
+        install_dir = os.path.dirname(bin_dir)
+        build_path = '{i}/lib/python3.7/site-packages'.format(i=install_dir)
+    else:
+        if compiler_name == 'clang6':
+            path = 'clang.Release'
+        elif compiler_name == 'clang6_debug':
+            path = 'clang.Debug'
+        elif compiler_name == 'gcc7':
+            path = 'gnu.Release'
+        elif compiler_name == 'gcc7_debug':
+            path = 'gnu.Debug'
+        elif compiler_name == 'intel19':
+            path = 'intel.Release'
+        elif compiler_name == 'intel19_debug':
+            path = 'intel.Debug'
+        path = '{p}.{c}.llnl.gov'.format(p=path, c=cluster)
+        build_path = '{d}/build/{p}/install/lib/python3.7/site-packages'.format(
+            d=dir_name, p=path)
+    print('build_path={b}'.format(b=build_path))
+    sys.path.append(build_path)
+    import lbann
+    import lbann.contrib.lc.launcher
+
+    # Setup LBANN experiment
+    trainer, model, data_reader, optimizer = setup_experiment(lbann)
+
+    # Run LBANN experiment
+    kwargs = {
+        'account': 'guests',
+        'nodes': 1,
+        'partition': 'pbatch'
+    }
+    experiment_dir = '{d}/bamboo/unit_tests/experiments/{t}_{c}'.format(
+        d=dir_name, t=_test_name, c=compiler_name)
+    error_file_name = '{e}/err.log'.format(
+        e=experiment_dir, c=compiler_name)
+    return_code = lbann.contrib.lc.launcher.run(
+        trainer=trainer,
+        model=model,
+        data_reader=data_reader,
+        optimizer=optimizer,
+        experiment_dir=experiment_dir,
+        job_name='lbann_{}'.format(_test_name),
+        **kwargs)
+    tools.assert_success(return_code, error_file_name)
+
+# Construct methods that will be detected by PyTest
+def _test_clang6(cluster, exes, dirname):
+    _test(cluster, exes, dirname, 'clang6')
+def _test_gcc7(cluster, exes, dirname):
+    _test(cluster, exes, dirname, 'gcc7')
+def _test_intel19(cluster, exes, dirname):
+    _test(cluster, exes, dirname, 'intel19')
+def _test_exe(cluster, dirname, exe):
+    if exe is None:
+        e = 'test_{}_exe: Non-local testing'.format(_test_name)
+        print('Skip - ' + e)
+        pytest.skip(e)
+    exes = {'exe': exe}
+    _test(cluster, exes, dirname, 'exe')
+globals()['{}_clang6'.format(_test_name)] = _test_clang6
+globals()['{}_gcc7'.format(_test_name)] = _test_gcc7
+globals()['{}_intel19'.format(_test_name)] = _test_intel19
+globals()['{}_exe'.format(_test_name)] = _test_exe

From
8cc520625f689eb8cb958e2872ca1e60cbeb9fa8 Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Wed, 18 Sep 2019 04:13:39 +0200 Subject: [PATCH 303/634] Refactor weights and objective fns to get comm at setup (#1247) * Created a private default constructor for the weight class that does not take the comm object. This allows the deserialization to create new instances of the weight class that can then complete their initialization in the restart logic. This will make it easier to restore weight classes from archive files. * Moved the initialization of the weights comm object and default matrix distribution into a separate function and out of the constructor. * Updated the optimizer classes to get the comm object during setup rather than construction. This is necessary for using Cereal for checkpoint and restart of the classes. * Changed get_comm in the weights class to return a reference. --- include/lbann/optimizers/adagrad.hpp | 4 +- include/lbann/optimizers/adam.hpp | 5 +-- .../lbann/optimizers/hypergradient_adam.hpp | 5 +-- include/lbann/optimizers/optimizer.hpp | 2 +- include/lbann/optimizers/rmsprop.hpp | 5 +-- include/lbann/optimizers/sgd.hpp | 5 +-- include/lbann/proto/factories.hpp | 1 - include/lbann/weights/weights.hpp | 12 ++++++ src/optimizers/adagrad.cpp | 8 ++-- src/optimizers/adam.cpp | 10 ++--- src/optimizers/hypergradient_adam.cpp | 10 ++--- src/optimizers/optimizer.cpp | 9 ++-- src/optimizers/rmsprop.cpp | 10 ++--- src/optimizers/sgd.cpp | 10 ++--- src/proto/factories/model_factory.cpp | 2 +- src/proto/factories/optimizer_factory.cpp | 8 ++-- src/proto/factories/weights_factory.cpp | 2 +- src/weights/weights.cpp | 42 ++++++++++++------- 18 files changed, 79 insertions(+), 71 deletions(-) diff --git a/include/lbann/optimizers/adagrad.hpp b/include/lbann/optimizers/adagrad.hpp index d4fdc8f6a6d..53bce3ab69d 100644 --- a/include/lbann/optimizers/adagrad.hpp +++ b/include/lbann/optimizers/adagrad.hpp @@ -42,7 +42,7 @@ namespace lbann { class adagrad : public optimizer { public: - adagrad(lbann_comm* comm, DataType learning_rate, DataType eps = 1e-8); + adagrad(DataType learning_rate, DataType eps = 1e-8); adagrad(const adagrad& other); adagrad& operator=(const adagrad& other); ~adagrad() override = default; @@ -87,7 +87,7 @@ class adagrad : public optimizer { std::unique_ptr build_adagrad_optimizer_from_pbuf( - google::protobuf::Message const&, lbann_comm*); + google::protobuf::Message const&); } // namespace lbann diff --git a/include/lbann/optimizers/adam.hpp b/include/lbann/optimizers/adam.hpp index a1e5b109742..b2c4884df74 100644 --- a/include/lbann/optimizers/adam.hpp +++ b/include/lbann/optimizers/adam.hpp @@ -47,8 +47,7 @@ class adam : public optimizer { /** @name Life cycle functions */ ///@{ - adam(lbann_comm* comm, - DataType learning_rate, + adam(DataType learning_rate, DataType beta1 = 0.9, DataType beta2 = 0.99, DataType eps = 1e-8); @@ -210,7 +209,7 @@ class adam : public optimizer { std::unique_ptr build_adam_optimizer_from_pbuf( - google::protobuf::Message const&, lbann_comm*); + google::protobuf::Message const&); } // namespace lbann diff --git a/include/lbann/optimizers/hypergradient_adam.hpp b/include/lbann/optimizers/hypergradient_adam.hpp index c0a14a2a412..0936548cdf8 100644 --- a/include/lbann/optimizers/hypergradient_adam.hpp +++ b/include/lbann/optimizers/hypergradient_adam.hpp @@ -55,8 +55,7 @@ class hypergradient_adam : public optimizer { * @param eps Small factor to avoid division by * zero. 
*/ - hypergradient_adam(lbann_comm *comm, - DataType init_learning_rate = 1e-3, + hypergradient_adam(DataType init_learning_rate = 1e-3, DataType hyper_learning_rate = 1e-7, DataType beta1 = 0.9, DataType beta2 = 0.99, @@ -163,7 +162,7 @@ class hypergradient_adam : public optimizer { std::unique_ptr build_hypergradient_adam_optimizer_from_pbuf( - google::protobuf::Message const&, lbann_comm*); + google::protobuf::Message const&); } // namespace lbann diff --git a/include/lbann/optimizers/optimizer.hpp b/include/lbann/optimizers/optimizer.hpp index 6e0e9ee6712..46be0fe0e85 100644 --- a/include/lbann/optimizers/optimizer.hpp +++ b/include/lbann/optimizers/optimizer.hpp @@ -77,7 +77,7 @@ class persist; class optimizer { public: - optimizer(lbann_comm* comm, DataType learning_rate = 0); + optimizer(DataType learning_rate = 0); optimizer(const optimizer& other); optimizer& operator=(const optimizer& other); virtual ~optimizer() = default; diff --git a/include/lbann/optimizers/rmsprop.hpp b/include/lbann/optimizers/rmsprop.hpp index efc6c0db0d0..2737c33c9d6 100644 --- a/include/lbann/optimizers/rmsprop.hpp +++ b/include/lbann/optimizers/rmsprop.hpp @@ -40,8 +40,7 @@ namespace lbann { class rmsprop : public optimizer { public: - rmsprop(lbann_comm* comm, - DataType learning_rate, + rmsprop(DataType learning_rate, DataType decay_rate, DataType eps = 1e-8); rmsprop(const rmsprop& other); @@ -114,7 +113,7 @@ class rmsprop : public optimizer { std::unique_ptr build_rmsprop_optimizer_from_pbuf( - google::protobuf::Message const&, lbann_comm*); + google::protobuf::Message const&); } // namespace lbann diff --git a/include/lbann/optimizers/sgd.hpp b/include/lbann/optimizers/sgd.hpp index 95e0e35cd2b..8d6960f4e34 100644 --- a/include/lbann/optimizers/sgd.hpp +++ b/include/lbann/optimizers/sgd.hpp @@ -42,8 +42,7 @@ class sgd : public optimizer { /** @name Life cycle functions */ ///@{ - sgd(lbann_comm *comm, - DataType learning_rate, + sgd(DataType learning_rate, DataType momentum = 0, bool nesterov = false); sgd(const sgd& other); @@ -156,7 +155,7 @@ class sgd : public optimizer { std::unique_ptr build_sgd_optimizer_from_pbuf( - google::protobuf::Message const&, lbann_comm*); + google::protobuf::Message const&); } // namespace lbann diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index 94e7bb6309a..41836a51de2 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -97,7 +97,6 @@ std::unique_ptr construct_summarizer(lbann_comm* comm, /** Construct an optimizer specified with prototext. */ std::unique_ptr construct_optimizer( - lbann_comm* comm, const lbann_data::Optimizer& proto_opt); /** Construct an objective function specified with prototext. */ diff --git a/include/lbann/weights/weights.hpp b/include/lbann/weights/weights.hpp index 24ace95c8a4..59b3b065e5c 100644 --- a/include/lbann/weights/weights.hpp +++ b/include/lbann/weights/weights.hpp @@ -63,6 +63,13 @@ class optimizer; */ class weights { friend class optimizer; +private: + weights(); + // ----------------------------------------------- + // Internal method for setting the comm pointer + // ----------------------------------------------- + void set_comm(lbann_comm& comm); + void setup_default_matrix_distribution(); public: weights(lbann_comm* comm); @@ -77,6 +84,11 @@ class weights { /** Get weights name. 
*/ std::string get_name() const { return m_name; } + lbann_comm& get_comm() const { + if(m_comm == nullptr) { LBANN_ERROR("weights class has null comm pointer"); } + return *m_comm; + } + /** Create a copy of the weights. * This function dynamically allocates memory for a weights * instance and instantiates a copy. The caller is responsible for diff --git a/src/optimizers/adagrad.cpp b/src/optimizers/adagrad.cpp index 499a1d14c01..889dc567348 100644 --- a/src/optimizers/adagrad.cpp +++ b/src/optimizers/adagrad.cpp @@ -32,8 +32,8 @@ namespace lbann { -adagrad::adagrad(lbann_comm *comm, DataType learning_rate, DataType eps) - : optimizer(comm, learning_rate), m_eps(eps) {} +adagrad::adagrad(DataType learning_rate, DataType eps) + : optimizer(learning_rate), m_eps(eps) {} adagrad::adagrad(const adagrad& other) : optimizer(other), @@ -147,10 +147,10 @@ bool adagrad::load_from_checkpoint_distributed(persist& p, std::string name_pref std::unique_ptr build_adagrad_optimizer_from_pbuf( - google::protobuf::Message const& msg, lbann_comm* comm) { + google::protobuf::Message const& msg) { const auto& params = dynamic_cast(msg); - return make_unique(comm, params.learn_rate(), params.eps()); + return make_unique(params.learn_rate(), params.eps()); } } // namespace lbann diff --git a/src/optimizers/adam.cpp b/src/optimizers/adam.cpp index 89bcb4ece69..959aa9fd439 100644 --- a/src/optimizers/adam.cpp +++ b/src/optimizers/adam.cpp @@ -32,12 +32,11 @@ namespace lbann { -adam::adam(lbann_comm* comm, - DataType learning_rate, +adam::adam(DataType learning_rate, DataType beta1, DataType beta2, DataType eps) - : optimizer(comm, learning_rate), + : optimizer(learning_rate), m_beta1(beta1), m_beta2(beta2), m_eps(eps) {} adam::adam(const adam& other) @@ -250,11 +249,10 @@ bool adam::load_from_checkpoint_distributed(persist& p, std::string name_prefix) std::unique_ptr build_adam_optimizer_from_pbuf( - google::protobuf::Message const& msg, lbann_comm* comm) { + google::protobuf::Message const& msg) { const auto& params = dynamic_cast(msg); - return make_unique(comm, - params.learn_rate(), + return make_unique(params.learn_rate(), params.beta1(), params.beta2(), params.eps()); diff --git a/src/optimizers/hypergradient_adam.cpp b/src/optimizers/hypergradient_adam.cpp index 4db7150bfff..f4dfcbca09d 100644 --- a/src/optimizers/hypergradient_adam.cpp +++ b/src/optimizers/hypergradient_adam.cpp @@ -32,13 +32,12 @@ namespace lbann { -hypergradient_adam::hypergradient_adam(lbann_comm *comm, - DataType init_learning_rate, +hypergradient_adam::hypergradient_adam(DataType init_learning_rate, DataType hyper_learning_rate, DataType beta1, DataType beta2, DataType eps) - : optimizer(comm, init_learning_rate), + : optimizer(init_learning_rate), m_hyper_learning_rate(hyper_learning_rate), m_beta1(beta1), m_beta2(beta2), @@ -222,11 +221,10 @@ bool hypergradient_adam::load_from_checkpoint_distributed(persist& p, std::strin std::unique_ptr build_hypergradient_adam_optimizer_from_pbuf( - google::protobuf::Message const& msg, lbann_comm* comm) { + google::protobuf::Message const& msg) { const auto& params = dynamic_cast(msg); - return make_unique(comm, - params.init_learning_rate(), + return make_unique(params.init_learning_rate(), params.hyper_learning_rate(), params.beta1(), params.beta2(), diff --git a/src/optimizers/optimizer.cpp b/src/optimizers/optimizer.cpp index 7320ddbe696..94b7a656605 100644 --- a/src/optimizers/optimizer.cpp +++ b/src/optimizers/optimizer.cpp @@ -44,12 +44,8 @@ std::string to_string(optimizer_gradient_status 
status) { } } -optimizer::optimizer(lbann_comm* comm, DataType learning_rate) - : m_comm(comm), m_learning_rate(learning_rate) { - if (m_comm == nullptr) { - LBANN_ERROR("got null pointer for lbann_comm"); - } -} +optimizer::optimizer(DataType learning_rate) + : m_comm(nullptr), m_learning_rate(learning_rate) {} optimizer::optimizer(const optimizer& other) : m_comm(other.m_comm), @@ -266,6 +262,7 @@ void optimizer::remove_gradient_source(const void* source) { } void optimizer::setup(weights* w) { + m_comm = &w->get_comm(); clear_gradient(); // Set weights being optimized diff --git a/src/optimizers/rmsprop.cpp b/src/optimizers/rmsprop.cpp index cc8f92ca7e3..e98c728392a 100644 --- a/src/optimizers/rmsprop.cpp +++ b/src/optimizers/rmsprop.cpp @@ -32,11 +32,10 @@ namespace lbann { -rmsprop::rmsprop(lbann_comm *comm, - DataType learning_rate, +rmsprop::rmsprop(DataType learning_rate, DataType decay_rate, DataType eps) - : optimizer(comm, learning_rate), + : optimizer(learning_rate), m_decay_rate(decay_rate), m_eps(eps) {} @@ -155,11 +154,10 @@ bool rmsprop::load_from_checkpoint_shared(persist& p, std::string name_prefix) { std::unique_ptr build_rmsprop_optimizer_from_pbuf( - google::protobuf::Message const& msg, lbann_comm* comm) { + google::protobuf::Message const& msg) { const auto& params = dynamic_cast(msg); - return make_unique(comm, - params.learn_rate(), + return make_unique(params.learn_rate(), params.decay_rate(), params.eps()); } diff --git a/src/optimizers/sgd.cpp b/src/optimizers/sgd.cpp index f4df3e8baba..9a0521ec8b6 100644 --- a/src/optimizers/sgd.cpp +++ b/src/optimizers/sgd.cpp @@ -32,11 +32,10 @@ namespace lbann { -sgd::sgd(lbann_comm* comm, - DataType learning_rate, +sgd::sgd(DataType learning_rate, DataType momentum, bool nesterov) - : optimizer(comm, learning_rate), + : optimizer(learning_rate), m_momentum(momentum), m_nesterov(nesterov) {} @@ -223,10 +222,9 @@ bool sgd::load_from_checkpoint_distributed(persist& p, std::string name_prefix) std::unique_ptr build_sgd_optimizer_from_pbuf( - google::protobuf::Message const& msg, lbann_comm* comm) { + google::protobuf::Message const& msg) { const auto& params = dynamic_cast(msg); - return make_unique(comm, - params.learn_rate(), + return make_unique(params.learn_rate(), params.momentum(), params.nesterov()); } diff --git a/src/proto/factories/model_factory.cpp b/src/proto/factories/model_factory.cpp index aa8b8ae62e0..1fcc8779451 100644 --- a/src/proto/factories/model_factory.cpp +++ b/src/proto/factories/model_factory.cpp @@ -57,7 +57,7 @@ instantiate_model(lbann_comm* comm, const lbann_data::Model& proto_model) { // Default optimizer - auto opt = construct_optimizer(comm, proto_opt); + auto opt = construct_optimizer(proto_opt); // Construct model const auto& type = proto_model.type(); diff --git a/src/proto/factories/optimizer_factory.cpp b/src/proto/factories/optimizer_factory.cpp index b7d548f4c50..d9488d0d0a2 100644 --- a/src/proto/factories/optimizer_factory.cpp +++ b/src/proto/factories/optimizer_factory.cpp @@ -45,7 +45,7 @@ namespace { std::unique_ptr build_no_optimizer_from_pbuf( - google::protobuf::Message const& msg, lbann_comm* comm) { + google::protobuf::Message const& msg) { return nullptr; } @@ -53,8 +53,7 @@ using factory_type = lbann::generic_factory< lbann::optimizer, std::string, generate_builder_type, + google::protobuf::Message const&>, default_key_error_policy>; void register_default_builders(factory_type& factory) { @@ -84,12 +83,11 @@ factory_type const& get_optimizer_factory() noexcept { }// namespace 
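 // A minimal usage sketch of the refactored flow (hypothetical caller
 // code, using only signatures introduced by this patch): optimizers
 // are now constructed without a comm pointer and acquire one from the
 // weights object inside optimizer::setup().
 //
 //   auto opt = make_unique<sgd>(/*learning_rate=*/0.01,
 //                               /*momentum=*/0.9,
 //                               /*nesterov=*/false);
 //   weights w(&comm);  // weights still receives the lbann_comm pointer
 //   opt->setup(&w);    // setup() calls w.get_comm() to set m_comm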
std::unique_ptr construct_optimizer( - lbann_comm* comm, const lbann_data::Optimizer& proto_opt) { auto const& factory = get_optimizer_factory(); auto const& msg = helpers::get_oneof_message(proto_opt, "optimizer_type"); - return factory.create_object(msg.GetDescriptor()->name(), msg, comm); + return factory.create_object(msg.GetDescriptor()->name(), msg); } } // namespace proto diff --git a/src/proto/factories/weights_factory.cpp b/src/proto/factories/weights_factory.cpp index b6b6dd1fcf7..b54052ff106 100644 --- a/src/proto/factories/weights_factory.cpp +++ b/src/proto/factories/weights_factory.cpp @@ -134,7 +134,7 @@ std::unique_ptr construct_weights( std::unique_ptr opt = (helpers::has_oneof(opt_msg, "optimizer_type") - ? construct_optimizer(comm, opt_msg) + ? construct_optimizer(opt_msg) : nullptr); w->set_initializer(std::move(init)); w->set_optimizer(std::move(opt)); diff --git a/src/weights/weights.cpp b/src/weights/weights.cpp index 4723b35b249..ff459c05891 100644 --- a/src/weights/weights.cpp +++ b/src/weights/weights.cpp @@ -62,28 +62,23 @@ std::string get_dims_string(const std::vector& matrix_height_dims, } // namespace -weights::weights(lbann_comm* comm) - : m_comm(comm), +weights::weights() + : m_comm(nullptr), m_frozen(false) { // Initialize weights name static int num_weights = 0; m_name = "weights" + std::to_string(num_weights); num_weights++; +} - // Default matrix distribution - m_matrix_dist.colDist = El::STAR; - m_matrix_dist.rowDist = El::STAR; - m_matrix_dist.blockHeight = 1; - m_matrix_dist.blockWidth = 1; - m_matrix_dist.colAlign = 0; - m_matrix_dist.rowAlign = 0; - m_matrix_dist.colCut = 0; - m_matrix_dist.rowCut = 0; - m_matrix_dist.root = 0; - m_matrix_dist.grid = &(comm->get_trainer_grid()); - m_matrix_dist.device = El::Device::CPU; +weights::weights(lbann_comm* comm) + : weights() { + + m_comm = comm; + if(comm == nullptr) { LBANN_ERROR("Unable to construct weights with null comm ptr"); } + setup_default_matrix_distribution(); } weights::weights(const weights& other) @@ -254,6 +249,25 @@ void weights::set_matrix_distribution(El::DistData dist) { m_matrix_dist = dist; } +void weights::set_comm(lbann_comm& comm) { + m_comm = &comm; +} + +void weights::setup_default_matrix_distribution() { + // Default matrix distribution + m_matrix_dist.colDist = El::STAR; + m_matrix_dist.rowDist = El::STAR; + m_matrix_dist.blockHeight = 1; + m_matrix_dist.blockWidth = 1; + m_matrix_dist.colAlign = 0; + m_matrix_dist.rowAlign = 0; + m_matrix_dist.colCut = 0; + m_matrix_dist.rowCut = 0; + m_matrix_dist.root = 0; + m_matrix_dist.grid = &(m_comm->get_trainer_grid()); + m_matrix_dist.device = El::Device::CPU; +} + // ----------------------------------------------- // Setup // ----------------------------------------------- From b8c55865be488d9ad18e17105a7cb537bff3fb0b Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Thu, 19 Sep 2019 09:28:07 -0700 Subject: [PATCH 304/634] Fixes for several bugs Fixes include, but are not limited to, the following: - Preloading was broken unless 100% of the data set was used - When the data_store was not in debug mode, messages were written to an unope ned std::ofstream - When a validation reader was carved out of a train reader, the data_store's point to the reader was not set - For the validation readers, operating in non-preload mode, data_store_condui t::set_conduit_node() was being called during epoch 1 instead of epoch 0 --- .../lbann/data_store/data_store_conduit.hpp | 31 +- src/data_readers/data_reader.cpp | 17 +- src/data_readers/data_reader_image.cpp | 7 +- src/data_readers/data_reader_imagenet.cpp | 1 - src/data_readers/data_reader_jag_conduit.cpp | 22 +- .../data_reader_numpy_npz_conduit.cpp | 6 + src/data_store/data_store_conduit.cpp | 282 ++++++++++-------- src/proto/proto_common.cpp | 7 +- 8 files changed, 203 insertions(+), 170 deletions(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 67ce8b33abc..d8bdc563bbb 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -68,9 +68,8 @@ class data_store_conduit { //! dtor ~data_store_conduit(); - /// normally not needed, since reader is passed to ctor. But may - /// be useful in some cases - void set_data_reader_ptr(generic_data_reader *reader) { m_reader = reader; } + /// required when the copy ctor is used to construct a validation set + void set_data_reader_ptr(generic_data_reader *reader); //! convenience handle void set_shuffled_indices(const std::vector *indices); @@ -124,22 +123,9 @@ class data_store_conduit { /// with the index int get_index_owner(int idx); - /// for use during development and debugging - void set_role(const std::string role); - bool is_local_cache() const { return m_is_local_cache; } - void exchange_mini_batch_data(size_t current_pos, size_t mb_size) { - if (is_local_cache()) { - return; - } - if (m_super_node) { - exchange_data_by_super_node(current_pos, mb_size); - } else { - exchange_data_by_sample(current_pos, mb_size); - } - ++m_n; - } + void exchange_mini_batch_data(size_t current_pos, size_t mb_size); void set_super_node_mode() { m_super_node = true; @@ -152,7 +138,7 @@ class data_store_conduit { /// only used for debugging; pass --debug on cmd line to get /// each data store to print to a different file. 
This is made /// public so data readers can also print to the file - mutable std::ofstream m_output; + mutable std::ofstream *m_output = nullptr; /// for use during development and debugging int get_data_size() { return m_data.size(); } @@ -160,10 +146,11 @@ class data_store_conduit { /// made public for debugging during development void copy_members(const data_store_conduit& rhs, const std::vector& = std::vector()); + void flush_debug_file(); + protected : - /// records the number of times exchange_mini_batch_data has been called - int m_n = 0; + int m_cur_epoch = 0; bool m_is_setup = false; @@ -250,7 +237,7 @@ protected : void build_node_for_sending(const conduit::Node &node_in, conduit::Node &node_out); /// fills in m_owner, which maps index -> owning processor - void build_owner_map(int mini_batch_size); + void exchange_owner_maps(); /// for use when conduit Nodes have non-uniform size, e.g, imagenet, /// and when running in non-super_node mode @@ -310,6 +297,8 @@ protected : char *m_mem_seg = 0; size_t m_mem_seg_length = 0; std::string m_seg_name; + + std::string m_debug_filename; }; } // namespace lbann diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index f2492ec2124..5c6e6dd2b7c 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -719,12 +719,8 @@ void generic_data_reader::instantiate_data_store(const std::vector& local_l std::cout << "generic_data_reader::instantiate_data_store - Starting the preload" << std::endl; } if (local_list_sizes.size() != 0) { - if (is_master()) std::cout << "XX local_list_sizes.size() != 0\n"; m_data_store->build_preloaded_owner_map(local_list_sizes); } -else { - if (is_master()) std::cout << "XX local_list_sizes.size() == 0\n"; -} preload_data_store(); if(is_master()) { std::cout << "preload complete" << std::endl; @@ -745,17 +741,18 @@ void generic_data_reader::setup_data_store(int mini_batch_size) { } bool generic_data_reader::data_store_active() const { - const auto& c = static_cast(m_model->get_execution_context()); if (m_data_store != nullptr && m_data_store->is_preloaded()) { return true; } + + const auto& c = static_cast(m_model->get_execution_context()); /// Use the data store for all modes except testing /// i.e. training, validation, tournament return (m_data_store != nullptr && (((c.get_execution_mode() == execution_mode::training) && c.get_epoch() > 0) || ((c.get_execution_mode() == execution_mode::validation) - && c.get_epoch() > 1))); + && c.get_epoch() > 0))); } bool generic_data_reader::priming_data_store() const { @@ -763,13 +760,14 @@ bool generic_data_reader::priming_data_store() const { if (m_data_store != nullptr && m_data_store->is_preloaded()) { return false; } + /// Use the data store for all modes except testing /// i.e. 
training, validation, tournament return (m_data_store != nullptr && (((c.get_execution_mode() == execution_mode::training) && c.get_epoch() == 0) || ((c.get_execution_mode() == execution_mode::validation) - && c.get_epoch() == 1) + && c.get_epoch() == 0) || m_data_store->is_explicitly_loading())); } @@ -804,12 +802,9 @@ void generic_data_reader::set_role(std::string role) { && get_role() == "train") { m_jag_partitioned = true; if (is_master()) { - std::cerr << "USING JAG DATA PARTITIONING\n"; + std::cout << "USING JAG DATA PARTITIONING\n"; } } - if (m_data_store != nullptr) { - m_data_store->set_role(role); - } } } // namespace lbann diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index 940cd46792d..6fd051eecdc 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -212,11 +212,12 @@ void image_data_reader::preload_data_store() { if (is_master()) std::cerr << "Starting image_data_reader::preload_data_store; num indices: " << m_shuffled_indices.size() << std::endl; int rank = m_comm->get_rank_in_trainer(); for (size_t data_id=0; data_idget_index_owner(data_id) != rank) { + int index = m_shuffled_indices[data_id]; + if (m_data_store->get_index_owner(index) != rank) { continue; } - load_conduit_node_from_file(data_id, node); - m_data_store->set_preloaded_conduit_node(data_id, node); + load_conduit_node_from_file(index, node); + m_data_store->set_preloaded_conduit_node(index, node); } if (is_master()) { diff --git a/src/data_readers/data_reader_imagenet.cpp b/src/data_readers/data_reader_imagenet.cpp index 0e3e9e55c73..8acad24ab9c 100644 --- a/src/data_readers/data_reader_imagenet.cpp +++ b/src/data_readers/data_reader_imagenet.cpp @@ -61,7 +61,6 @@ bool imagenet_reader::fetch_datum(CPUMat& X, int data_id, int mb_idx) { El::Matrix image; std::vector dims; const std::string image_path = get_file_dir() + m_image_list[data_id].first; - if (m_data_store != nullptr) { bool have_node = true; conduit::Node node; diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index 866ba62ed28..41e93072bf6 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -881,28 +881,30 @@ void data_reader_jag_conduit::preload_data_store() { } for (size_t idx=0; idx < m_shuffled_indices.size(); idx++) { - if(m_data_store->get_index_owner(idx) != m_rank_in_model) { + int index = m_shuffled_indices[idx]; + if(m_data_store->get_index_owner(index) != m_rank_in_model) { continue; } try { work.reset(); - m_sample_list.open_samples_file_handle(idx, true); - load_conduit_node(idx, key, work); - conduit::Node & node = m_data_store->get_empty_node(idx); - const std::string padded_idx = '/' + LBANN_DATA_ID_STR(idx); + m_sample_list.open_samples_file_handle(index, true); + load_conduit_node(index, key, work); + conduit::Node & node = m_data_store->get_empty_node(index); + const std::string padded_idx = '/' + LBANN_DATA_ID_STR(index); node[padded_idx] = work; - m_data_store->set_preloaded_conduit_node(idx, node); - }catch (conduit::Error const& e) { - LBANN_ERROR(" :: trying to load the node " + std::to_string(idx) + " with key " + key + " and got " + e.what()); + m_data_store->set_preloaded_conduit_node(index, node); + } catch (conduit::Error const& e) { + LBANN_ERROR(" :: trying to load the node " + std::to_string(index) + " with key " + key + " and got " + e.what()); } } /// Once all of the data has been preloaded, close all of the file 
handles for (size_t idx=0; idx < m_shuffled_indices.size(); idx++) { - if(m_data_store->get_index_owner(idx) != m_rank_in_model) { + int index = m_shuffled_indices[idx]; + if(m_data_store->get_index_owner(index) != m_rank_in_model) { continue; } - m_sample_list.close_if_done_samples_file_handle(idx); + m_sample_list.close_if_done_samples_file_handle(index); } if (get_comm()->am_world_master() || (opts->get_bool("ltfb_verbose") && get_comm()->am_trainer_master())) { diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp b/src/data_readers/data_reader_numpy_npz_conduit.cpp index afa6e4cf9d1..24a98e4e4b2 100644 --- a/src/data_readers/data_reader_numpy_npz_conduit.cpp +++ b/src/data_readers/data_reader_numpy_npz_conduit.cpp @@ -132,6 +132,12 @@ void numpy_npz_conduit_reader::load() { } void numpy_npz_conduit_reader::preload_data_store() { + size_t count = get_absolute_sample_count(); + double use_percent = get_use_percent(); + if (count != 0 || use_percent != 1) { + LBANN_ERROR("numpy_npz_conduit_reader currently assumes you are using 100% of the data set; you specified get_absolute_sample_count() = ", count, " and get_use_percent() = ", use_percent, "; please ask Dave Hysom to modify the code, if you want to use less than 100%"); + } + double tm1 = get_time(); m_data_store->set_preload(); int rank = m_comm->get_rank_in_trainer(); diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index a40c7d3364c..58a7decf393 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -43,32 +43,6 @@ namespace lbann { -// Macro to throw an LBANN exception -#if 0 -#undef LBANN_ERROR -#define LBANN_ERROR(message) \ - do { \ - std::stringstream ss_LBANN_ERROR; \ - ss_LBANN_ERROR << "LBANN error "; \ - const int rank_LBANN_ERROR = lbann::get_rank_in_world(); \ - if (rank_LBANN_ERROR >= 0) { \ - ss_LBANN_ERROR << "on rank " << rank_LBANN_ERROR << " "; \ - } \ - ss_LBANN_ERROR << "(" << __FILE__ << ":" << __LINE__ << ")" \ - << ": " << (message); \ - if (errno) { \ - ss_LBANN_ERROR << "\nerrno: " << errno << " msg: " \ - << strerror(errno); \ - } \ - if (m_output) { \ - m_output << "ERROR: " << ss_LBANN_ERROR.str() \ - << std::endl; \ - m_output.close(); \ - } \ - throw lbann::exception(ss_LBANN_ERROR.str()); \ - } while (0) -#endif - data_store_conduit::data_store_conduit( generic_data_reader *reader) : m_reader(reader) { @@ -89,7 +63,8 @@ data_store_conduit::data_store_conduit( if (opts->get_bool("debug")) { std::stringstream ss; ss << "debug_" << m_reader->get_role() << "." << m_comm->get_rank_in_world(); - m_output.open(ss.str().c_str()); + m_output = new std::ofstream(ss.str().c_str()); + m_debug_filename = ss.str(); if (m_world_master) { std::cerr << "opened " << ss.str() << " for writing\n"; } @@ -114,7 +89,7 @@ data_store_conduit::data_store_conduit( data_store_conduit::~data_store_conduit() { if (m_output) { - m_output.close(); + m_output->close(); } if (m_is_local_cache && m_mem_seg) { int sanity = shm_unlink(m_seg_name.c_str()); @@ -145,16 +120,20 @@ data_store_conduit& data_store_conduit::operator=(const data_store_conduit& rhs) return (*this); } -void data_store_conduit::set_role(const std::string role) { +void data_store_conduit::set_data_reader_ptr(generic_data_reader *reader) { + m_reader = reader; if (options::get()->get_bool("debug")) { std::stringstream ss; ss << "debug_" << m_reader->get_role() << "." 
<< m_comm->get_rank_in_world(); - m_output.open(ss.str().c_str()); + m_output = new std::ofstream(ss.str().c_str()); + m_debug_filename = ss.str(); + if (m_world_master) { + std::cerr << "data_store_conduit::set_data_reader_ptr; opened " << ss.str() << " for writing\n"; + } } } void data_store_conduit::copy_members(const data_store_conduit& rhs, const std::vector& ds_sample_move_list) { - m_n = rhs.m_n; m_is_setup = rhs.m_is_setup; m_preload = rhs.m_preload; m_explicit_loading = rhs.m_explicit_loading; @@ -177,13 +156,12 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: m_mem_seg_length = rhs.m_mem_seg_length; m_seg_name = rhs.m_seg_name; m_image_offsets = rhs.m_image_offsets; - - /// This block needed when carving a validation set from the training set - if (options::get()->get_bool("debug") && !m_output) { - std::stringstream ss; - ss << "debug_" << m_reader->get_role() << "." << m_comm->get_rank_in_world(); + if (m_output) { + LBANN_ERROR("m_output should be nullptr"); } + /// This block needed when carving a validation set from the training set + //if (options::get()->get_bool("debug") && !m_output) { if(ds_sample_move_list.size() == 0) { m_data = rhs.m_data; } else { @@ -191,10 +169,6 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: for(auto&& i : ds_sample_move_list) { if(rhs.m_data.find(i) != rhs.m_data.end()){ - if (m_output) { - rhs.m_output << "moving index: " << i << " from other to myself\n"; - } - if (!m_super_node) { /// Repack the nodes because they don't seem to copy correctly // @@ -256,13 +230,7 @@ void data_store_conduit::setup(int mini_batch_size) { } double tm1 = get_time(); - if (!m_preload) { - if (m_world_master) std::cout << "calling build_owner_map\n"; - build_owner_map(mini_batch_size); - if (m_world_master) std::cout << " build_owner_map time: " << (get_time()-tm1) << "\n"; - } else { - m_owner_map_mb_size = mini_batch_size; - } + m_owner_map_mb_size = mini_batch_size; m_is_setup = true; @@ -297,10 +265,10 @@ void data_store_conduit::exchange_data_by_super_node(size_t current_pos, size_t } if (m_output) { - m_output << "starting data_store_conduit::exchange_data_by_super_node; mb_size: " << mb_size << std::endl; + (*m_output) << "starting data_store_conduit::exchange_data_by_super_node; mb_size: " << mb_size << std::endl; } - if (m_n == 0) { + if (m_cur_epoch == 0) { setup_data_store_buffers(); } @@ -361,7 +329,7 @@ void data_store_conduit::exchange_data_by_super_node(size_t current_pos, size_t m_minibatch_data.clear(); for (int p=0; pget_role()); } if (!nd.is_contiguous()) { - LBANN_ERROR("m_data[", std::to_string(data_id), "] does not have a contiguous layout"); + LBANN_ERROR("m_data[", data_id, "] does not have a contiguous layout"); } if (nd.data_ptr() == nullptr) { - LBANN_ERROR("m_data[", std::to_string(data_id), "] does not have a valid data pointer"); + LBANN_ERROR("m_data[", data_id, "] does not have a valid data pointer"); } if (nd.contiguous_data_ptr() == nullptr) { - LBANN_ERROR("m_data[", std::to_string(data_id), "] does not have a valid contiguous data pointer"); + LBANN_ERROR("m_data[", data_id, "] does not have a valid contiguous data pointer"); } } void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool already_have) { + + if (m_output) { + (*m_output) << "set_conduit_node: " << data_id << std::endl; + } + if (m_is_local_cache && m_preload) { LBANN_ERROR("you called data_store_conduit::set_conduit_node, but you're running in local cache mode with 
preloading; something is broken; please contact Dave Hysom"); } - m_mutex.lock(); + //m_mutex.lock(); if (already_have == false && m_data.find(data_id) != m_data.end()) { - LBANN_ERROR("duplicate data_id: ", std::to_string(data_id), " in data_store_conduit::set_conduit_node"); + LBANN_ERROR("duplicate data_id: ", data_id, " in data_store_conduit::set_conduit_node; role: ", m_reader->get_role()); } - if (m_output) { - m_output << "set_conduit_node: " << data_id << std::endl; - } if (already_have && is_local_cache()) { if (m_data.find(data_id) == m_data.end()) { - LBANN_ERROR("you claim the passed node was obtained from this data_store, but the data_id (", std::to_string(data_id), ") doesn't exist in m_data"); + LBANN_ERROR("you claim the passed node was obtained from this data_store, but the data_id (", data_id, ") doesn't exist in m_data"); } - m_mutex.unlock(); + //m_mutex.unlock(); return; } if (is_local_cache()) { + m_mutex.lock(); m_data[data_id] = node; m_mutex.unlock(); } + #if 0 else if (m_owner[data_id] != m_rank_in_trainer) { - std::stringstream s; - s << "set_conduit_node error for data id: "<get_role() << "\n"; - LBANN_ERROR(s.str()); + LBANN_ERROR("set_conduit_node error for data id: ", data_id, " m_owner: ", + m_owner[data_id], " me: ", m_rank_in_trainer, + "; data reader role: ", m_reader->get_role()); } + #endif else if (! m_super_node) { + m_mutex.lock(); + m_owner[data_id] = m_rank_in_trainer; build_node_for_sending(node, m_data[data_id]); error_check_compacted_node(m_data[data_id], data_id); m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); @@ -490,18 +464,16 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool } else { + m_mutex.lock(); + m_owner[data_id] = m_rank_in_trainer; m_data[data_id] = node; m_mutex.unlock(); - // @TODO would like to do: m_data[data_id].set_external(node); but since - // (as of now) 'node' is a local variable in a data_reader+jag_conduit, - // we need to do a deep copy. 
If the data_store furnishes a node to the - // data_reader during the first epoch, this copy can be avoided } } const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { if (m_output) { - m_output << "get_conduit_node: " << data_id << std::endl; + (*m_output) << "get_conduit_node: " << data_id << std::endl; } /** * dah: commenting this out since it gives a false positive for test @@ -521,7 +493,7 @@ const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { if (is_local_cache()) { std::unordered_map::const_iterator t3 = m_data.find(data_id); if (t3 == m_data.end()) { - LBANN_ERROR("(local cache) failed to find data_id: ", std::to_string(data_id), " in m_data; m_data.size: ", std::to_string(m_data.size())); + LBANN_ERROR("(local cache) failed to find data_id: ", data_id, " in m_data; m_data.size: ", m_data.size()); } return t3->second; } @@ -534,13 +506,13 @@ const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { if (t3 != m_data.end()) { return t3->second["data"]; } - LBANN_ERROR("failed to find data_id: ", std::to_string(data_id), " in m_minibatch_data; m_minibatch_data.size: ", std::to_string(m_minibatch_data.size()), " and also failed to find it in m_data; m_data.size: ", std::to_string(m_data.size()), "; role: ", m_reader->get_role()); + LBANN_ERROR("failed to find data_id: ", data_id, " in m_minibatch_data; m_minibatch_data.size: ", m_minibatch_data.size(), " and also failed to find it in m_data; m_data.size: ", m_data.size(), "; role: ", m_reader->get_role()); if (m_output) { - m_output << "failed to find data_id: " << data_id << " in m_minibatch_data; my m_minibatch_data indices: "; + (*m_output) << "failed to find data_id: " << data_id << " in m_minibatch_data; my m_minibatch_data indices: "; for (auto t : m_minibatch_data) { - m_output << t.first << " "; + (*m_output) << t.first << " "; } - m_output << std::endl; + (*m_output) << std::endl; } } @@ -550,9 +522,6 @@ const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { // code in the following method is a modification of code from // conduit/src/libs/relay/conduit_relay_mpi.cpp void data_store_conduit::build_node_for_sending(const conduit::Node &node_in, conduit::Node &node_out) { - if (m_output) { - m_output << "starting build_node_for_sending\n"; - } node_out.reset(); conduit::Schema s_data_compact; @@ -602,7 +571,7 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s } if (m_output) { - m_output << "starting data_store_conduit::exchange_data_by_sample; mb_size: " << mb_size << std::endl; + (*m_output) << "starting data_store_conduit::exchange_data_by_sample; mb_size: " << mb_size << std::endl; } int num_send_req = build_indices_i_will_send(current_pos, mb_size); @@ -622,31 +591,31 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s const std::unordered_set &indices = m_indices_to_send[p]; for (auto index : indices) { if (m_data.find(index) == m_data.end()) { - LBANN_ERROR("failed to find data_id: ", std::to_string(index), " to be sent to ", std::to_string(p), " in m_data"); + LBANN_ERROR("failed to find data_id: ", index, " to be sent to ", p, " in m_data"); } const conduit::Node& n = m_data[index]; const El::byte *s = reinterpret_cast(n.data_ptr()); if(!n.is_contiguous()) { - LBANN_ERROR("data_id: ", std::to_string(index), " does not have a contiguous layout"); + LBANN_ERROR("data_id: ", index, " does not have a contiguous layout"); } if(n.data_ptr() == nullptr) { - 
LBANN_ERROR("data_id: ", std::to_string(index), " does not have a valid data pointer"); + LBANN_ERROR("data_id: ", index, " does not have a valid data pointer"); } if(n.contiguous_data_ptr() == nullptr) { - LBANN_ERROR("data_id: ", std::to_string(index), " does not have a valid contiguous data pointer"); + LBANN_ERROR("data_id: ", index, " does not have a valid contiguous data pointer"); } size_t sz = m_compacted_sample_size; if (m_node_sizes_vary) { if (m_sample_sizes.find(index) == m_sample_sizes.end()) { - LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: ", std::to_string(index), "; m_sample_sizes.size: ", std::to_string(m_sample_sizes.size())); + LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: ", index, "; m_sample_sizes.size: ", m_sample_sizes.size()); } sz = m_sample_sizes[index]; } if (m_output) { - m_output << "sending " << index << " size: " << sz << " to " << p << std::endl; + (*m_output) << "sending " << index << " size: " << sz << " to " << p << std::endl; } m_comm->nb_tagged_send(s, sz, p, index, m_send_requests[ss++], m_comm->get_trainer_comm()); @@ -655,7 +624,7 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s // sanity checks if (ss != m_send_requests.size()) { - LBANN_ERROR("ss != m_send_requests.size; ss: ", std::to_string(ss), " m_send_requests.size: ", std::to_string(m_send_requests.size())); + LBANN_ERROR("ss != m_send_requests.size; ss: ", ss, " m_send_requests.size: ", m_send_requests.size()); } // start recvs for incoming data @@ -669,7 +638,7 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s int sz = m_compacted_sample_size; if (m_node_sizes_vary) { if (m_sample_sizes.find(index) == m_sample_sizes.end()) { - LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: ", std::to_string(index), "; m_sample_sizes.size(): ", std::to_string(m_sample_sizes.size()), " role: ", m_reader->get_role(), " for index: ", std::to_string(sanity), " of ", std::to_string(indices.size())); + LBANN_ERROR("m_sample_sizes.find(index) == m_sample_sizes.end() for index: ", index, "; m_sample_sizes.size(): ", m_sample_sizes.size(), " role: ", m_reader->get_role(), " for index: ", sanity, " of ", indices.size()); } sz = m_sample_sizes[index]; } @@ -684,10 +653,10 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s // sanity checks if (ss != m_recv_buffer.size()) { - LBANN_ERROR("ss != m_recv_buffer.size; ss: ", std::to_string(ss), " m_recv_buffer.size: ", std::to_string(m_recv_buffer.size())); + LBANN_ERROR("ss != m_recv_buffer.size; ss: ", ss, " m_recv_buffer.size: ", m_recv_buffer.size()); } if (m_recv_requests.size() != m_recv_buffer.size()) { - LBANN_ERROR("m_recv_requests.size != m_recv_buffer.size; m_recv_requests: ", std::to_string(m_recv_requests.size()), " m_recv_buffer.size: ", std::to_string(m_recv_buffer.size())); + LBANN_ERROR("m_recv_requests.size != m_recv_buffer.size; m_recv_requests: ", m_recv_requests.size(), " m_recv_buffer.size: ", m_recv_buffer.size()); } // wait for all msgs to complete @@ -736,7 +705,7 @@ int data_store_conduit::build_indices_i_will_send(int current_pos, int mb_size) m_indices_to_send.resize(m_np_in_trainer); int k = 0; if (m_output) { - m_output << "build_indices_i_will_send; cur pos: " << current_pos << " mb_size: " << mb_size << " m_data.size: " << m_data.size() << "\n"; + (*m_output) << "build_indices_i_will_send; cur pos: " << current_pos << " mb_size: " << mb_size << " 
m_data.size: " << m_data.size() << "\n"; } for (int i = current_pos; i < current_pos + mb_size; i++) { auto index = (*m_shuffled_indices)[i]; @@ -766,10 +735,11 @@ void data_store_conduit::build_preloaded_owner_map(const std::vector& per_r ++owning_rank; per_rank_list_range_start += per_rank_list_size; } - m_owner[i] = owning_rank; + m_owner[(*m_shuffled_indices)[i]] = owning_rank; } } +#if 0 void data_store_conduit::build_owner_map(int mini_batch_size) { if (m_world_master) std::cerr << "starting data_store_conduit::build_owner_map for role: " << m_reader->get_role() << " with mini_batch_size: " << mini_batch_size << " num indices: " << m_shuffled_indices->size() << "\n"; if (mini_batch_size == 0) { @@ -785,6 +755,7 @@ void data_store_conduit::build_owner_map(int mini_batch_size) { m_owner[index] = (i % m_owner_map_mb_size) % m_np_in_trainer; } } +#endif const conduit::Node & data_store_conduit::get_random_node() const { size_t sz = m_data.size(); @@ -807,14 +778,14 @@ const conduit::Node & data_store_conduit::get_random_node(const std::string &fie conduit::Node & data_store_conduit::get_empty_node(int data_id) { if (m_data.find(data_id) != m_data.end()) { - LBANN_ERROR("we already have a node with data_id= ", std::to_string(data_id)); + LBANN_ERROR("we already have a node with data_id= ", data_id); } return m_data[data_id]; } void data_store_conduit::purge_unused_samples(const std::vector& indices) { if (m_output) { - m_output << " starting purge_unused_samples; indices.size(): " << indices.size() << " data.size(): " << m_data.size() << std::endl; + (*m_output) << " starting purge_unused_samples; indices.size(): " << indices.size() << " data.size(): " << m_data.size() << std::endl; } /// Remove unused indices from the data and owner maps for(auto&& i : indices) { @@ -826,7 +797,7 @@ void data_store_conduit::purge_unused_samples(const std::vector& indices) { } } if (m_output) { - m_output << " leaving purge_unused_samples; indices.size(): " << indices.size() << " data.size(): " << m_data.size() << std::endl; + (*m_output) << " leaving purge_unused_samples; indices.size(): " << indices.size() << " data.size(): " << m_data.size() << std::endl; } } @@ -1007,7 +978,7 @@ void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string bool data_store_conduit::has_conduit_node(int data_id) const { std::unordered_map::const_iterator t = m_data.find(data_id); if (m_output) { - m_output << "has_conduit_node( " << data_id << " ) = " << (t == m_data.end()) << std::endl; + (*m_output) << "has_conduit_node( " << data_id << " ) = " << (t == m_data.end()) << std::endl; } return t != m_data.end(); } @@ -1018,7 +989,7 @@ void data_store_conduit::set_shuffled_indices(const std::vector *indices) { void data_store_conduit::exchange_sample_sizes() { if (m_output) { - m_output << "starting data_store_conduit::exchange_sample_sizes" << std::endl; + (*m_output) << "starting data_store_conduit::exchange_sample_sizes" << std::endl; } int my_count = m_sample_sizes.size(); @@ -1027,7 +998,7 @@ void data_store_conduit::exchange_sample_sizes() { if (m_output) { for (size_t h=0; hbroadcast(k, other_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); for (size_t i=0; i &s for (size_t p=0; p << " available mem: " << avail_mem << "\n" << " required size is " << percent << " percent of available\n"; if (m_world_master) { - std::cout << "\nShared memory segment statistics:\n" + std::cerr << "\nShared memory segment statistics:\n" << msg.str() << "\n"; } if (m_mem_seg_length >= avail_mem) { @@ -1183,7 
@@ -1183,7 +1154,7 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map
   }
   int v = ftruncate(shm_fd, size);
   if (v != 0) {
-    LBANN_ERROR("ftruncate failed for size: ", std::to_string(size));
+    LBANN_ERROR("ftruncate failed for size: ", size);
   }
   void *m = mmap(0, size, PROT_WRITE | PROT_READ, MAP_SHARED, shm_fd, 0);
   if (m == MAP_FAILED) {
@@ -1216,7 +1187,7 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map
       LBANN_ERROR("fstat failed");
     }
     if (b.st_size != size) {
-      LBANN_ERROR("b.st_size= ", std::to_string(b.st_size), " should be equal to ", std::to_string(size));
+      LBANN_ERROR("b.st_size= ", b.st_size, " should be equal to ", size);
     }
   }
   close(shm_fd);
@@ -1227,34 +1198,34 @@ void data_store_conduit::preload_local_cache() {
   std::vector<std::vector<int>> indices;
 
   double tm1 = get_time();
-  if (m_world_master) std::cout << "calling get_image_sizes" << std::endl;
+  if (m_world_master) std::cerr << "calling get_image_sizes" << std::endl;
   get_image_sizes(file_sizes, indices);
-  if (m_world_master) std::cout << " get_image_sizes time: " << (get_time()-tm1) << std::endl;
+  if (m_world_master) std::cerr << " get_image_sizes time: " << (get_time()-tm1) << std::endl;
   tm1 = get_time();
 
   //indices[j] contains the indices (wrt m_reader->get_image_list())
   //that P_j will read from disk, and subsequently bcast to all others
   //
   //file_sizes maps an index to its file size
 
-  if (m_world_master) std::cout << "calling allocate_shared_segment" << std::endl;
+  if (m_world_master) std::cerr << "calling allocate_shared_segment" << std::endl;
   allocate_shared_segment(file_sizes, indices);
-  if (m_world_master) std::cout << " allocate_shared_segment time: " << (get_time()-tm1) << std::endl;
+  if (m_world_master) std::cerr << " allocate_shared_segment time: " << (get_time()-tm1) << std::endl;
   tm1 = get_time();
 
-  if (m_world_master) std::cout << "calling read_files" << std::endl;
+  if (m_world_master) std::cerr << "calling read_files" << std::endl;
   std::vector<char> work;
   read_files(work, file_sizes, indices[m_rank_in_trainer]);
-  if (m_world_master) std::cout << " read_files time: " << (get_time()- tm1) << std::endl;
+  if (m_world_master) std::cerr << " read_files time: " << (get_time()- tm1) << std::endl;
   tm1 = get_time();
 
-  if (m_world_master) std::cout << "calling compute_image_offsets" << std::endl;
+  if (m_world_master) std::cerr << "calling compute_image_offsets" << std::endl;
   compute_image_offsets(file_sizes, indices);
-  if (m_world_master) std::cout << " compute_image_offsets time: " << (get_time()-tm1) << std::endl;
+  if (m_world_master) std::cerr << " compute_image_offsets time: " << (get_time()-tm1) << std::endl;
   tm1 = get_time();
 
-  if (m_world_master) std::cout << "calling exchange_images" << std::endl;
+  if (m_world_master) std::cerr << "calling exchange_images" << std::endl;
   exchange_images(work, file_sizes, indices);
-  if (m_world_master) std::cout << " exchange_images time: " << (get_time()-tm1) << std::endl;
+  if (m_world_master) std::cerr << " exchange_images time: " << (get_time()-tm1) << std::endl;
   tm1 = get_time();
 
   if (m_world_master) std::cerr << "calling build_conduit_nodes" << std::endl;
@@ -1272,7 +1243,7 @@ void data_store_conduit::read_files(std::vector<char> &work, std::unordered_map<
   work.resize(n);
 
   if (m_output) {
-    m_output << "data_store_conduit::read_files; requested work size: " << n << std::endl;
+    (*m_output) << "data_store_conduit::read_files; requested work size: " << n << std::endl;
   }
 
   //get the list of images from the data reader
@@ -1291,7 +1262,7 @@ void data_store_conduit::read_files(std::vector<char> &work, std::unordered_map<
   in.close();
   offset += s;
 }
-  if (m_world_master) std::cout << " finished reading files\n";
+  if (m_world_master) std::cerr << " finished reading files\n";
 }
 
 void data_store_conduit::build_conduit_nodes(std::unordered_map<int, size_t> &sizes) {
@@ -1343,5 +1314,72 @@ void data_store_conduit::exchange_images(std::vector<char> &work, std::unordered
   m_comm->barrier(m_comm->get_node_comm());
 }
 
+void data_store_conduit::exchange_owner_maps() {
+  if (m_output) {
+    (*m_output) << "\nstarting data_store_conduit::exchange_owner_maps\n\n";
+  }
+  int my_count = m_owner.size();
+  std::vector<int> all_counts(m_np_in_trainer);
+  m_comm->all_gather(&my_count, 1, all_counts.data(), 1, m_comm->get_trainer_comm());
+
+  std::vector<int> my_sizes(m_owner.size());
+  size_t j = 0;
+  for (auto t : m_owner) {
+    my_sizes[j++] = t.first;
+  }
+
+  std::vector<int> other_sizes;
+  for (int k=0; k<m_np_in_trainer; k++) {
+    other_sizes.resize(all_counts[k]);
+    if (m_rank_in_trainer == k) {
+      m_comm->broadcast(k, my_sizes.data(), all_counts[k], m_comm->get_trainer_comm());
+    } else {
+      m_comm->broadcast(k, other_sizes.data(), all_counts[k], m_comm->get_trainer_comm());
+      for (size_t i=0; i<other_sizes.size(); i++) {
+        if (m_owner.find(other_sizes[i]) != m_owner.end()) {
+          LBANN_ERROR("duplicate data_id: ", other_sizes[i], "; role: ", m_reader->get_role(), "; m_owner[",other_sizes[i],"] = ", m_owner[other_sizes[i]]);
+        }
+        m_owner[other_sizes[i]] = k;
+      }
+    }
+  }
+}
+
+void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_size) {
+  if (is_local_cache()) {
+    return;
+  }
+  if (m_reader->at_new_epoch()) {
+    ++m_cur_epoch;
+  }
+
+  if (m_reader->at_new_epoch() && !m_preload && !m_is_local_cache && m_cur_epoch == 1) {
+    exchange_owner_maps();
+  }
+
+  if (m_super_node) {
+    exchange_data_by_super_node(current_pos, mb_size);
+  } else {
+    exchange_data_by_sample(current_pos, mb_size);
+  }
+}
+
+void data_store_conduit::flush_debug_file() {
+  if (!m_output) {
+    return;
+  }
+  m_output->close();
+  m_output->open(m_debug_filename.c_str(), std::ios::app);
+}
 } // namespace lbann
diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp
index 57da047532b..d5abf284347 100644
--- a/src/proto/proto_common.cpp
+++ b/src/proto/proto_common.cpp
@@ -496,9 +496,12 @@ void init_data_readers(
     reader_validation->set_role("validate");
     reader_validation->use_unused_index_set();
 
-    if(reader_validation->get_data_store_ptr() != nullptr) {
+    data_store_conduit *store = reader_validation->get_data_store_ptr();
+    if (store != nullptr) {
+      store->set_data_reader_ptr(reader_validation);
       reader_validation->get_data_store_ptr()->compact_nodes();
-    }
+    }
+
     /// At this point clean up any unused samples from the main data store
     if(reader->get_data_store_ptr() != nullptr) {
       auto&& data_store = reader->get_data_store_ptr();

From 0d0f9411ecd48f4a070c7e78bd43bf26b167dabb Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Thu, 19 Sep 2019 14:48:32 -0700
Subject: [PATCH 305/634] Synchronize IO thread pool in input layer destructor
 (#1256)

* Synchronize IO thread pool in trainer destructor

Makes sure the thread pool is not running while data readers are being
destroyed.

* Synchronize IO thread pool in input layer destructor

Moved from synchronizing in trainer destructor.
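The hazard is the general one for an object that owns both worker
threads and the data those threads touch: the workers must be joined
(here, via reap_threads()) before any member they reference is torn
down. A minimal, self-contained sketch of the pattern (illustrative
names only, not LBANN code):

    #include <thread>
    #include <vector>

    struct reader { int value = 0; };

    class owner {
    public:
      void start() {
        // The worker captures `this` and touches m_reader asynchronously.
        m_workers.emplace_back([this] { ++m_reader.value; });
      }
      ~owner() {
        // Join first: after this loop no worker can touch m_reader,
        // which is destroyed later in the teardown sequence.
        for (auto& t : m_workers) {
          if (t.joinable()) { t.join(); }
        }
      }
    private:
      reader m_reader;
      std::vector<std::thread> m_workers;
    };

    int main() { owner o; o.start(); }

Doing the join in the input layer destructor rather than the trainer
destructor keeps the synchronization next to the object whose members
the I/O threads actually use.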
--- include/lbann/layers/io/input/generic_input_layer.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index 74beeb6482f..6f8b05b382e 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -93,6 +93,16 @@ class generic_input_layer : public io_layer { } ~generic_input_layer() override { + + // Synchronize the I/O thread pool + // Note: The thread pool may still be running asynchronously if the + // trainer is destroyed in the middle of an epoch. The thread pool + // needs to interact with data readers, etc., so it needs to be + // synchronized before any of them are destroyed. + if (this->m_model != nullptr) { + this->m_model->get_execution_context().get_io_thread_pool().reap_threads(); + } + for (auto& io_buffer : m_io_buffers) { delete io_buffer; } From e4c698f8258f41fc4919dac485d3e883a575b9d5 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Thu, 12 Sep 2019 16:02:12 -0700 Subject: [PATCH 306/634] Test length modes --- bamboo/common_python/tools.py | 54 ++++++++--- bamboo/compiler_tests/conftest.py | 2 +- bamboo/integration_tests/common_code.py | 69 +++++++++----- bamboo/integration_tests/conftest.py | 7 ++ .../test_integration_autoencoders.py | 27 +++--- .../test_integration_debug.py | 89 +++++++++++++------ .../test_integration_performance.py | 82 +++++++++++------ bamboo/local_test.cmd | 5 +- bamboo/local_test.sh | 26 ++++-- bamboo/unit_tests/conftest.py | 14 ++- .../test_unit_check_proto_models.py | 22 ++--- bamboo/unit_tests/test_unit_checkpoint.py | 62 ++++++++----- bamboo/unit_tests/test_unit_layer_clamp.py | 22 ++--- .../unit_tests/test_unit_layer_covariance.py | 29 +++--- bamboo/unit_tests/test_unit_layer_elu.py | 22 ++--- bamboo/unit_tests/test_unit_layer_identity.py | 26 +++--- bamboo/unit_tests/test_unit_layer_l1_norm.py | 25 +++--- bamboo/unit_tests/test_unit_layer_l2_norm2.py | 26 +++--- .../unit_tests/test_unit_layer_leaky_relu.py | 28 +++--- .../unit_tests/test_unit_layer_log_sigmoid.py | 29 +++--- .../unit_tests/test_unit_layer_log_softmax.py | 29 +++--- .../test_unit_layer_mean_absolute_error.py | 30 ++++--- bamboo/unit_tests/test_unit_layer_relu.py | 22 ++--- bamboo/unit_tests/test_unit_layer_selu.py | 22 ++--- bamboo/unit_tests/test_unit_layer_sigmoid.py | 25 +++--- bamboo/unit_tests/test_unit_layer_softmax.py | 25 +++--- bamboo/unit_tests/test_unit_layer_softplus.py | 26 +++--- bamboo/unit_tests/test_unit_layer_softsign.py | 31 ++++--- .../test_unit_layer_squared_difference.py | 30 ++++--- .../unit_tests/test_unit_layer_tessellate.py | 28 +++--- bamboo/unit_tests/test_unit_layer_variance.py | 26 +++--- bamboo/unit_tests/test_unit_lbann2_reload.py | 33 ++++--- .../unit_tests/test_unit_mnist_conv_graph.py | 31 ++++--- .../test_unit_mnist_ridge_regression.py | 36 +++++--- .../test_unit_mnist_softmax_classifier.py | 30 ++++--- .../test_unit_reconstruction_loss.py | 30 ++++--- docs/continuous_integration.rst | 31 +++---- 37 files changed, 740 insertions(+), 411 deletions(-) diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index a908cf05120..98b2450d2e7 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -12,10 +12,13 @@ def check_list(substrings, strings): def get_command(cluster, executable, + # Allocation/Run Parameters num_nodes=None, + num_processes=None, partition=None, time_limit=None, - num_processes=None, + # 
LBANN Parameters + ckpt_dir=None, dir_name=None, data_filedir_default=None, data_filedir_train_default=None, @@ -36,24 +39,32 @@ def get_command(cluster, optimizer_path=None, processes_per_model=None, extra_lbann_flags=None, - ckpt_dir=None, - output_file_name=None, + # Error/Output Redirect error_file_name=None, - return_tuple=False, + output_file_name=None, + # Misc. Parameters check_executable_existence=True, - skip_no_exe=True): + return_tuple=False, + skip_no_exe=True, + weekly=False): # Check parameters for black-listed characters like semi-colons that # would terminate the command and allow for an extra command blacklist = [';', '--'] strings = [ - cluster, executable, num_nodes, partition, time_limit, num_processes, - dir_name, data_filedir_default, data_filedir_train_default, + cluster, executable, + # Allocation/Run Parameters + num_nodes, num_processes, partition, time_limit, + # LBANN Parameters + ckpt_dir, dir_name, data_filedir_default, data_filedir_train_default, data_filename_train_default, data_filedir_test_default, data_filename_test_default, data_reader_name, data_reader_path, data_reader_percent, exit_after_setup, metadata, mini_batch_size, model_folder, model_name, model_path, num_epochs, optimizer_name, - optimizer_path, processes_per_model, ckpt_dir, output_file_name, - error_file_name, return_tuple, check_executable_existence, skip_no_exe + optimizer_path, processes_per_model, + # Error/Output Redirect + error_file_name, output_file_name, + # Misc. Parameters + check_executable_existence, return_tuple, skip_no_exe, weekly ] lbann_errors = [] if extra_lbann_flags is not None: @@ -70,8 +81,14 @@ def get_command(cluster, raise Exception('Invalid character(s): %s' % ' , '.join( invalid_character_errors)) + DEFAULT_TIME = 35 MAX_TIME = 360 # 6 hours. - if (time_limit is None) or (time_limit > MAX_TIME): + if time_limit is None: + if weekly: + time_limit = MAX_TIME + else: + time_limit = DEFAULT_TIME + if time_limit > MAX_TIME: time_limit = MAX_TIME # Check executable existence @@ -378,7 +395,22 @@ def get_command(cluster, ' data_reader_path are.')) # else: no conflicts if data_reader_percent is not None: - option_data_reader_percent = ' --data_reader_percent=%f' % data_reader_percent + # If data_reader_percent is not None, then it will override `weekly`. + # If it is None however, we choose its value based on `weekly`. + try: + data_reader_percent = float(data_reader_percent) + + except ValueError: + lbann_errors.append( + 'data_reader_percent={d} is not a float.'.format( + d=data_reader_percent)) + elif weekly: + data_reader_percent = 1.00 + else: + # Nightly + data_reader_percent = 0.10 + option_data_reader_percent = ' --data_reader_percent={d}'.format( + d=data_reader_percent) if exit_after_setup: option_exit_after_setup = ' --exit_after_setup' if metadata is not None: diff --git a/bamboo/compiler_tests/conftest.py b/bamboo/compiler_tests/conftest.py index 9f137a01527..ccffb182a73 100644 --- a/bamboo/compiler_tests/conftest.py +++ b/bamboo/compiler_tests/conftest.py @@ -10,7 +10,7 @@ def pytest_addoption(parser): parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. 
Default the current cluster') parser.addoption('--dirname', action='store', default=default_dirname, - help='--dirname specifies the top-level directory') + help='--dirname= specifies the top-level directory') @pytest.fixture diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py index ee53649d757..98107f545a7 100644 --- a/bamboo/integration_tests/common_code.py +++ b/bamboo/integration_tests/common_code.py @@ -6,40 +6,55 @@ # Set up the command ########################################################## def get_command(cluster, dir_name, model_folder, model_name, executable, - output_file_name, error_file_name, compiler_name, weekly=False): + output_file_name, error_file_name, compiler_name, weekly=False, + data_reader_percent=None): if model_name in ['alexnet', 'conv_autoencoder_imagenet']: if weekly: - data_reader_percent = 0.10 - time_limit = 600 + time_limit = 360 else: - data_reader_percent = 0.01 time_limit = 60 if cluster == 'lassen': command = tools.get_command( - cluster=cluster, executable=executable, num_nodes=16, - partition='pbatch', time_limit=time_limit, num_processes=32, + cluster=cluster, executable=executable, + # Allocation/Run Parameters + num_nodes=16, num_processes=32, partition='pbatch', + time_limit=time_limit, + # LBANN Parameters dir_name=dir_name, data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/', data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt', data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/', data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt', - data_reader_name='imagenet_lassen', data_reader_percent=data_reader_percent, + data_reader_name='imagenet_lassen', + data_reader_percent=data_reader_percent, model_folder=model_folder, model_name=model_name, num_epochs=20, - optimizer_name='adagrad', output_file_name=output_file_name, - error_file_name=error_file_name) + optimizer_name='adagrad', + # Error/Output Redirect + error_file_name=error_file_name, + output_file_name=output_file_name, + # Misc. Parameters + weekly=weekly) else: command = tools.get_command( - cluster=cluster, executable=executable, num_nodes=16, - partition='pbatch', time_limit=time_limit, num_processes=32, + cluster=cluster, executable=executable, + # Allocation/Run Parameters + num_nodes=16, num_processes=32, partition='pbatch', + time_limit=time_limit, + # LBANN Parameters dir_name=dir_name, data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/', data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt', data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/', data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt', - data_reader_name='imagenet', data_reader_percent=data_reader_percent, + data_reader_name='imagenet', + data_reader_percent=data_reader_percent, model_folder=model_folder, model_name=model_name, num_epochs=20, - optimizer_name='adagrad', output_file_name=output_file_name, - error_file_name=error_file_name) + optimizer_name='adagrad', + # Error/Output Redirect + error_file_name=error_file_name, + output_file_name=output_file_name, + # Misc. 
Parameters + weekly=weekly) elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']: if (model_name == 'lenet_mnist') and \ (compiler_name in ['clang6', 'intel19']): @@ -53,13 +68,21 @@ def get_command(cluster, dir_name, model_folder, model_name, executable, else: num_processes = 2 command = tools.get_command( - cluster=cluster, executable=executable, num_nodes=1, - partition=partition, time_limit=time_limit, - num_processes=num_processes, dir_name=dir_name, + cluster=cluster, executable=executable, + # Allocation/Run Parameters + num_nodes=1, num_processes=num_processes, partition=partition, + time_limit=time_limit, + # LBANN Parameters + dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder=model_folder, - model_name=model_name, num_epochs=5, optimizer_name='adagrad', - output_file_name=output_file_name, error_file_name=error_file_name) + data_reader_name='mnist', data_reader_percent=data_reader_percent, + model_folder=model_folder, model_name=model_name, num_epochs=5, + optimizer_name='adagrad', + # Error/Output Redirect + error_file_name=error_file_name, + output_file_name=output_file_name, + # Misc. Parameters + weekly=weekly) else: raise Exception('Invalid model: %s' % model_name) return command @@ -214,7 +237,8 @@ def extract_data(output_file_name, data_fields, should_log): def skeleton(cluster, dir_name, executable, model_folder, model_name, - data_fields, should_log, compiler_name=None, weekly=False): + data_fields, should_log, compiler_name=None, weekly=False, + data_reader_percent=None): if compiler_name is None: output_file_name = '%s/bamboo/integration_tests/output/%s_output.txt' % (dir_name, model_name) error_file_name = '%s/bamboo/integration_tests/error/%s_error.txt' % (dir_name, model_name) @@ -223,7 +247,8 @@ def skeleton(cluster, dir_name, executable, model_folder, model_name, error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (dir_name, model_name, compiler_name) command = get_command( cluster, dir_name, model_folder, model_name, executable, - output_file_name, error_file_name, compiler_name, weekly=weekly) + output_file_name, error_file_name, compiler_name, weekly=weekly, + data_reader_percent=data_reader_percent) run_lbann(command, model_name, output_file_name, error_file_name, should_log) return extract_data(output_file_name, data_fields, should_log) diff --git a/bamboo/integration_tests/conftest.py b/bamboo/integration_tests/conftest.py index 09c52c5119e..a318e7537ed 100644 --- a/bamboo/integration_tests/conftest.py +++ b/bamboo/integration_tests/conftest.py @@ -24,6 +24,8 @@ def pytest_addoption(parser): parser.addoption('--weekly', action='store_true', default=False, help='--weekly specifies that the test should ONLY be run weekly, not nightly. Default False') # For local testing only + parser.addoption('--data-reader-percent', action='store', default=None, + help='--data-reader-percent=. Default None. 
Note that 1.0 is 100%.') parser.addoption('--exe', action='store', help='--exe=') @@ -57,6 +59,11 @@ def weekly(request): return request.config.getoption('--weekly') +@pytest.fixture +def data_reader_percent(request): + return request.config.getoption('--data-reader-percent') + + @pytest.fixture def exe(request): return request.config.getoption('--exe') diff --git a/bamboo/integration_tests/test_integration_autoencoders.py b/bamboo/integration_tests/test_integration_autoencoders.py index c4e4f7bf9e4..1c89520dc74 100644 --- a/bamboo/integration_tests/test_integration_autoencoders.py +++ b/bamboo/integration_tests/test_integration_autoencoders.py @@ -49,7 +49,7 @@ def run_tests(actual_objective_functions, model_name, dir_name, cluster, def skeleton_autoencoder_imagenet(cluster, dir_name, executables, compiler_name, - weekly): + weekly, data_reader_percent): if cluster in ['lassen', 'pascal']: e = 'skeleton_autoencoder_imagenet: does not run on GPU' print('Skip - ' + e) @@ -63,7 +63,8 @@ def skeleton_autoencoder_imagenet(cluster, dir_name, executables, compiler_name, should_log = False actual_objective_functions = common_code.skeleton( cluster, dir_name, executables[compiler_name], model_folder, model_name, - DATA_FIELDS, should_log, compiler_name=compiler_name, weekly=weekly) + DATA_FIELDS, should_log, compiler_name=compiler_name, weekly=weekly, + data_reader_percent=data_reader_percent) frequency_str = '_nightly' if weekly: frequency_str = '_weekly' @@ -72,24 +73,30 @@ def skeleton_autoencoder_imagenet(cluster, dir_name, executables, compiler_name, def test_integration_autoencoder_imagenet_clang6(cluster, dirname, exes, - weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'clang6', weekly) + weekly, data_reader_percent): + skeleton_autoencoder_imagenet(cluster, dirname, exes, 'clang6', weekly, + data_reader_percent) -def test_integration_autoencoder_imagenet_gcc7(cluster, dirname, exes, weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc7', weekly) +def test_integration_autoencoder_imagenet_gcc7(cluster, dirname, exes, weekly, + data_reader_percent): + skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc7', weekly, + data_reader_percent) def test_integration_autoencoder_imagenet_intel19(cluster, dirname, exes, - weekly): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'intel19', weekly) + weekly, data_reader_percent): + skeleton_autoencoder_imagenet(cluster, dirname, exes, 'intel19', weekly, + data_reader_percent) # Run with python3 -m pytest -s test_integration_autoencoder.py -k 'test_integration_autoencoder_imagenet_exe' --exe= -def test_integration_autoencoder_imagenet_exe(cluster, dirname, exe): +def test_integration_autoencoder_imagenet_exe(cluster, dirname, exe, weekly, + data_reader_percent): if exe is None: e = 'test_integration_autoencoder_imagenet_exe: Non-local testing' print('Skip - ' + e) pytest.skip() exes = {'exe': exe} - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'exe', True) + skeleton_autoencoder_imagenet(cluster, dirname, exes, 'exe', weekly, + data_reader_percent) diff --git a/bamboo/integration_tests/test_integration_debug.py b/bamboo/integration_tests/test_integration_debug.py index 274172be72b..8edf10eb3cc 100644 --- a/bamboo/integration_tests/test_integration_debug.py +++ b/bamboo/integration_tests/test_integration_debug.py @@ -6,7 +6,8 @@ def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, - debug_build, should_log=False): + debug_build, should_log=False, + 
data_reader_percent=None): # If weekly or debug_build are true, then run the test. if not (weekly or debug_build): e = 'skeleton_mnist_debug: Not doing weekly or debug_build testing' @@ -20,18 +21,28 @@ def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly, output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - partition='pbatch', time_limit=100, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], + # Allocation/Run Parameters + num_nodes=1, partition='pbatch', time_limit=100, + # LBANN Parameters + dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='models/' + model_name, + data_reader_name='mnist', + data_reader_percent=data_reader_percent, + model_folder='models/' + model_name, model_name=model_name, num_epochs=5, optimizer_name='adagrad', - output_file_name=output_file_name, error_file_name=error_file_name) + # Error/Output Redirect + error_file_name=error_file_name, + output_file_name=output_file_name, + # Misc. Parameters + weekly=weekly) common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, - debug_build, should_log=False): + debug_build, should_log=False, + data_reader_percent=None): # If weekly or debug_build are true, then run the test. if not (weekly or debug_build): e = 'skeleton_cifar_debug: Not doing weekly or debug_build testing' @@ -49,56 +60,82 @@ def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - partition='pbatch', time_limit=100, dir_name=dir_name, + cluster=cluster, executable=executables[compiler_name], + # Allocation/Run Parameters + num_nodes=1, partition='pbatch', time_limit=100, + # LBANN Parameters + dir_name=dir_name, data_filename_train_default='/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin', data_filename_test_default='/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin', - data_reader_name='cifar10', data_reader_percent=0.01, model_folder='models/' + model_name, + data_reader_name='cifar10', data_reader_percent=data_reader_percent, + model_folder='models/' + model_name, model_name='conv_' + model_name, num_epochs=5, optimizer_name='adagrad', - output_file_name=output_file_name, error_file_name=error_file_name) + # Error/Output Redirect + error_file_name=error_file_name, + output_file_name=output_file_name, + # Misc. 
Parameters + weekly=weekly) common_code.run_lbann(command, model_name, output_file_name, error_file_name, should_log) -def test_integration_mnist_clang6_debug(cluster, dirname, exes, weekly, debug_build): - skeleton_mnist_debug(cluster, dirname, exes, 'clang6_debug', weekly, debug_build) +def test_integration_mnist_clang6_debug(cluster, dirname, exes, weekly, + debug_build, data_reader_percent): + skeleton_mnist_debug(cluster, dirname, exes, 'clang6_debug', weekly, + debug_build, data_reader_percent) -def test_integration_cifar_clang6_debug(cluster, dirname, exes, weekly, debug_build): - skeleton_cifar_debug(cluster, dirname, exes, 'clang6_debug', weekly, debug_build) +def test_integration_cifar_clang6_debug(cluster, dirname, exes, weekly, + debug_build, data_reader_percent): + skeleton_cifar_debug(cluster, dirname, exes, 'clang6_debug', weekly, + debug_build, data_reader_percent) -def test_integration_mnist_gcc7_debug(cluster, dirname, exes, weekly, debug_build): - skeleton_mnist_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug_build) +def test_integration_mnist_gcc7_debug(cluster, dirname, exes, weekly, + debug_build, data_reader_percent): + skeleton_mnist_debug(cluster, dirname, exes, 'gcc7_debug', weekly, + debug_build, data_reader_percent) -def test_integration_cifar_gcc7_debug(cluster, dirname, exes, weekly, debug_build): - skeleton_cifar_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug_build) +def test_integration_cifar_gcc7_debug(cluster, dirname, exes, weekly, + debug_build, data_reader_percent): + skeleton_cifar_debug(cluster, dirname, exes, 'gcc7_debug', weekly, + debug_build, data_reader_percent) -def test_integration_mnist_intel19_debug(cluster, dirname, exes, weekly, debug_build): - skeleton_mnist_debug(cluster, dirname, exes, 'intel19_debug', weekly, debug_build) +def test_integration_mnist_intel19_debug(cluster, dirname, exes, weekly, + debug_build, data_reader_percent): + skeleton_mnist_debug(cluster, dirname, exes, 'intel19_debug', weekly, + debug_build, data_reader_percent) -def test_integration_cifar_intel19_debug(cluster, dirname, exes, weekly, debug_build): - skeleton_cifar_debug(cluster, dirname, exes, 'intel19_debug', weekly, debug_build) +def test_integration_cifar_intel19_debug(cluster, dirname, exes, weekly, + debug_build, data_reader_percent): + skeleton_cifar_debug(cluster, dirname, exes, 'intel19_debug', weekly, + debug_build, data_reader_percent) # Run with python3 -m pytest -s test_integration_debug.py -k 'test_integration_mnist_exe' --exe= -def test_integration_mnist_exe(cluster, dirname, exe): +def test_integration_mnist_exe(cluster, dirname, exe, weekly, + data_reader_percent): if exe is None: e = 'test_integration_mnist_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_mnist_debug(cluster, dirname, exes, 'exe', True, True) + debug_build = True + skeleton_mnist_debug(cluster, dirname, exes, 'exe', weekly, debug_build, + data_reader_percent=data_reader_percent) # Run with python3 -m pytest -s test_integration_debug.py -k 'test_integration_cifar_exe' --exe= -def test_integration_cifar_exe(cluster, dirname, exe): +def test_integration_cifar_exe(cluster, dirname, exe, weekly, + data_reader_percent): if exe == None: e = 'test_integration_cifar_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_cifar_debug(cluster, dirname, exes, 'exe', True, True) + debug_build=True + skeleton_cifar_debug(cluster, dirname, exes, 'exe', weekly, debug_build, + 
data_reader_percent=data_reader_percent) diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index c6e33775e9c..5370a6e81fa 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -98,7 +98,8 @@ def run_tests(actual_performance, model_name, dir_name, should_log, def skeleton_performance_lenet_mnist(cluster, dir_name, executables, - compiler_name): + compiler_name, weekly, + data_reader_percent): if compiler_name not in executables: e = 'skeleton_performance_lenet_mnist: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -115,7 +116,7 @@ def skeleton_performance_lenet_mnist(cluster, dir_name, executables, def skeleton_performance_alexnet(cluster, dir_name, executables, compiler_name, - weekly): + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_performance_alexnet: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -135,7 +136,10 @@ def skeleton_performance_alexnet(cluster, dir_name, executables, compiler_name, def skeleton_performance_full_alexnet(cluster, dir_name, executables, - compiler_name, weekly, run): + compiler_name, weekly, run, + data_reader_percent): + # `run` is False for calls to run.sh. + # `run` is True, in allocate_and_run.sh, if this is a Weekly test on Catalyst. if not run: e = 'skeleton_performance_full_alexnet: Ignored' print('Skip - ' + e) @@ -155,9 +159,10 @@ def skeleton_performance_full_alexnet(cluster, dir_name, executables, should_log = True output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) + # No use for data_reader_percent here. 
if cluster in ['catalyst']: command = 'salloc --nodes 128 %s/bamboo/integration_tests/%s.sh > %s 2> %s' % (dir_name, model_name, output_file_name, error_file_name) - elif cluster in ['pascal', 'ray']: + elif cluster in ['lassen', 'pascal', 'ray']: e = 'skeleton_performance_full_alexnet: Pascal, Ray are unsupported for skeleton_performance_full_alexnet' print('Skip - ' + e) pytest.skip(e) @@ -171,75 +176,96 @@ def skeleton_performance_full_alexnet(cluster, dir_name, executables, cluster) -def test_integration_performance_lenet_mnist_clang6(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'clang6') +def test_integration_performance_lenet_mnist_clang6(cluster, dirname, exes, + weekly, data_reader_percent): + skeleton_performance_lenet_mnist(cluster, dirname, exes, 'clang6', weekly, + data_reader_percent) -def test_integration_performance_alexnet_clang6(cluster, dirname, exes, weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'clang6', weekly) +def test_integration_performance_alexnet_clang6(cluster, dirname, exes, weekly, + data_reader_percent): + skeleton_performance_alexnet(cluster, dirname, exes, 'clang6', weekly, + data_reader_percent) def test_integration_performance_full_alexnet_clang6(cluster, dirname, exes, - weekly, run): + weekly, run, + data_reader_percent): skeleton_performance_full_alexnet(cluster, dirname, exes, 'clang6', weekly, - run) + run, data_reader_percent) -def test_integration_performance_lenet_mnist_gcc7(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'gcc7') +def test_integration_performance_lenet_mnist_gcc7(cluster, dirname, exes, + weekly, data_reader_percent): + skeleton_performance_lenet_mnist(cluster, dirname, exes, 'gcc7', weekly, + data_reader_percent) -def test_integration_performance_alexnet_gcc7(cluster, dirname, exes, weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'gcc7', weekly) +def test_integration_performance_alexnet_gcc7(cluster, dirname, exes, weekly, + data_reader_percent): + skeleton_performance_alexnet(cluster, dirname, exes, 'gcc7', weekly, + data_reader_percent) def test_integration_performance_full_alexnet_gcc7(cluster, dirname, exes, - weekly, run): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc7', weekly, run) + weekly, run, + data_reader_percent): + skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc7', weekly, run, + data_reader_percent) -def test_integration_performance_lenet_mnist_intel19(cluster, dirname, exes): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'intel19') +def test_integration_performance_lenet_mnist_intel19(cluster, dirname, exes, + weekly, + data_reader_percent): + skeleton_performance_lenet_mnist(cluster, dirname, exes, 'intel19', weekly, + data_reader_percent) def test_integration_performance_alexnet_intel19(cluster, dirname, exes, - weekly): - skeleton_performance_alexnet(cluster, dirname, exes, 'intel19', weekly) + weekly, data_reader_percent): + skeleton_performance_alexnet(cluster, dirname, exes, 'intel19', weekly, + data_reader_percent) def test_integration_performance_full_alexnet_intel19(cluster, dirname, exes, - weekly, run): + weekly, run, + data_reader_percent): skeleton_performance_full_alexnet(cluster, dirname, exes, 'intel19', weekly, - run) + run, data_reader_percent) # Run with python3 -m pytest -s test_integration_performance.py -k 'test_integration_performance_lenet_mnist_exe' --exe= -def test_integration_performance_lenet_mnist_exe(cluster, dirname, exe): +def 
test_integration_performance_lenet_mnist_exe(cluster, dirname, exe, weekly,
+                                                 data_reader_percent):
     if exe is None:
         e = 'test_integration_performance_lenet_mnist_exe: Non-local testing'
         print('Skip - ' + e)
         pytest.skip(e)
     exes = {'exe': exe}
-    skeleton_performance_lenet_mnist(cluster, dirname, exes, 'exe')
+    skeleton_performance_lenet_mnist(cluster, dirname, exes, 'exe', weekly,
+                                     data_reader_percent)
 
 
 # Run with python3 -m pytest -s test_integration_performance.py -k 'test_integration_performance_alexnet_exe' --exe=
-def test_integration_performance_alexnet_exe(cluster, dirname, exe):
+def test_integration_performance_alexnet_exe(cluster, dirname, exe, weekly,
+                                             data_reader_percent):
     if exe is None:
         e = 'test_integration_performance_alexnet_exe: Non-local testing'
         print('Skip - ' + e)
         pytest.skip(e)
     exes = {'exe': exe}
-    skeleton_performance_alexnet(cluster, dirname, exes, 'exe', True)
+    skeleton_performance_alexnet(cluster, dirname, exes, 'exe', weekly,
+                                 data_reader_percent)
 
 
 # Run with python3 -m pytest -s test_integration_performance.py -k 'test_integration_performance_full_alexnet_exe' --weekly --run --exe=
 def test_integration_performance_full_alexnet_exe(cluster, dirname, weekly,
-                                                  run, exe):
+                                                  run, exe,
+                                                  data_reader_percent):
     if exe is None:
         e = 'test_integration_performance_full_alexnet_exe: Non-local testing'
         print('Skip - ' + e)
         pytest.skip(e)
     exes = {'exe': exe}
     skeleton_performance_full_alexnet(cluster, dirname, exes, 'exe', weekly,
-                                      run)
+                                      run, data_reader_percent)
diff --git a/bamboo/local_test.cmd b/bamboo/local_test.cmd
index 0b3c18fd93f..aa17ec3101b 100644
--- a/bamboo/local_test.cmd
+++ b/bamboo/local_test.cmd
@@ -1,10 +1,11 @@
 #!/bin/bash
 #SBATCH --nodes 16
 #SBATCH --partition pbatch
-#SBATCH --time 960
+#SBATCH --time 1440
 
 # Update "--time" above to increase/decrease allocation time.
 # Update "executable" with your executable.
+# Use "data-reader-percent" to specify data reader percent. Note that `data-reader-percent=1.0` means 100%, not 1%.
 # Use "--integration-tests" to only run integration tests.
 # Use "--unit-tests" to only run unit tests.
 
-./local_test.sh --executable "../build/gnu.Release.pascal.llnl.gov/install/bin/lbann"
+./local_test.sh --executable "../build/gnu.Release.pascal.llnl.gov/install/bin/lbann" --data-reader-percent 0.001 --unit-tests
diff --git a/bamboo/local_test.sh b/bamboo/local_test.sh
index 09d98743c2c..051c1931c8c 100755
--- a/bamboo/local_test.sh
+++ b/bamboo/local_test.sh
@@ -14,10 +14,11 @@ function help_message {
 Run integration and unit tests locally, outside Bamboo.
 Usage: ./${SCRIPT} [options]
 Options:
-  ${C}--help${N}               Display this help message and exit.
-  ${C}--executable${N}         Specify executable to be used. Required field.
-  ${C}--integration-tests${N}  Specify that only integration tests should be run.
-  ${C}--unit-tests${N}         Specify that only unit tests should be run.
+  ${C}--help${N}                 Display this help message and exit.
+  ${C}--data-reader-percent${N}  Specify data reader percent. Note that 'data-reader-percent=1.0' means 100%, not 1%.
+  ${C}--executable${N}           Specify executable to be used. Required field.
+  ${C}--integration-tests${N}    Specify that only integration tests should be run.
+  ${C}--unit-tests${N}           Specify that only unit tests should be run.
EOF } @@ -25,6 +26,7 @@ EOF # Parse command-line arguments ################################################################ +DATA_READER_PERCENT=0.001 EXECUTABLE= INTEGRATION_TESTS=1 UNIT_TESTS=1 @@ -35,8 +37,20 @@ while :; do help_message exit 0 ;; + -d|--data-reader-percent) + # Set data reader percent. + # -n: check if string has non-zero length. + if [ -n "${2}" ]; then + DATA_READER_PERCENT=${2} + shift + else + echo "\"${1}\" option requires a non-empty option argument" >&2 + help_message + exit 1 + fi + ;; -e|--executable) - # Set executable + # Set executable. # -n: check if string has non-zero length. if [ -n "${2}" ]; then EXECUTABLE=${2} @@ -99,7 +113,7 @@ cd .. echo "Task: Unit Tests" cd unit_tests if [ ${UNIT_TESTS} -ne 0 ]; then - $PYTHON -m pytest -s -vv --durations=0 --exe=${EXECUTABLE} + $PYTHON -m pytest -s -vv --durations=0 --exe=${EXECUTABLE} --data-reader-percent=${DATA_READER_PERCENT} fi cd .. diff --git a/bamboo/unit_tests/conftest.py b/bamboo/unit_tests/conftest.py index a750292c101..cf646ad1e04 100644 --- a/bamboo/unit_tests/conftest.py +++ b/bamboo/unit_tests/conftest.py @@ -14,13 +14,14 @@ def pytest_addoption(parser): parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. Default the current cluster') parser.addoption('--dirname', action='store', default=default_dirname, - help='--dirname specifies the top-level directory') + help='--dirname= specifies the top-level directory') parser.addoption('--exes', action='store', default=default_exes, help='--exes={compiler_name: path}') + parser.addoption('--weekly', action='store_true', default=False, + help='--weekly specifies that the test should ONLY be run weekly, not nightly. 
Default False') # For local testing only - parser.addoption('--data-reader-percent', action='store', default=1.0, - help='--data-reader-percent= -def test_unit_models_exe(cluster, dirname, exe): +def test_unit_models_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_models_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe' : exe} - skeleton_models(cluster, dirname, exes, 'exe') + skeleton_models(cluster, dirname, exes, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index af6f0acccca..e6516cfcec9 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -6,7 +6,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, - compiler_name, data_reader_percent=1.0): + compiler_name, weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_checkpoint_lenet_shared: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -25,7 +25,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, data_reader_name='mnist', data_reader_percent=data_reader_percent, ckpt_dir=no_ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_nockpt = os.system(command) tools.assert_success(return_code_nockpt, error_file_name) @@ -40,7 +40,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, data_reader_name='mnist', data_reader_percent=data_reader_percent, ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=1, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_ckpt_1 = os.system(command) tools.assert_success(return_code_ckpt_1, error_file_name) @@ -54,7 +54,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, data_reader_name='mnist', data_reader_percent=data_reader_percent, ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_ckpt_2 = os.system(command) tools.assert_success(return_code_ckpt_2, error_file_name) @@ -68,7 +68,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compiler_name, - data_reader_percent=1.0): + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_checkpoint_lenet_distributed: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -87,7 +87,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, data_reader_name='mnist', data_reader_percent=data_reader_percent, ckpt_dir=no_ckpt_dir, model_folder='tests', model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_nockpt = os.system(command) tools.assert_success(return_code_nockpt, error_file_name) @@ -102,7 +102,7 
@@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, data_reader_name='mnist', data_reader_percent=data_reader_percent, ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_dist_ckpt', num_epochs=1, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_ckpt_1 = os.system(command) tools.assert_success(return_code_ckpt_1, error_file_name) @@ -116,7 +116,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, data_reader_name='mnist', data_reader_percent=data_reader_percent, ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code_ckpt_2 = os.system(command) tools.assert_success(return_code_ckpt_2, error_file_name) @@ -129,45 +129,59 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, dt=diff_test, ncd=no_ckpt_dir, cd=ckpt_dir, p=path_prefix)) -def test_unit_checkpoint_lenet_shared_clang6(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'clang6') +def test_unit_checkpoint_lenet_shared_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_checkpoint_lenet_distributed_clang6(cluster, exes, dirname): - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'clang6') +def test_unit_checkpoint_lenet_distributed_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_checkpoint_lenet_shared_gcc7(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc7') +def test_unit_checkpoint_lenet_shared_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_checkpoint_lenet_distributed_gcc7(cluster, exes, dirname): - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc7') +def test_unit_checkpoint_lenet_distributed_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_checkpoint_lenet_shared_intel19(cluster, exes, dirname): - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'intel19') +def test_unit_checkpoint_lenet_shared_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) -def test_unit_checkpoint_lenet_distributed_intel19(cluster, exes, dirname): - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'intel19') +def test_unit_checkpoint_lenet_distributed_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_shared_exe' --exe= -def test_unit_checkpoint_lenet_shared_exe(cluster, dirname, exe, data_reader_percent): +def test_unit_checkpoint_lenet_shared_exe(cluster, dirname, 
exe, + weekly, data_reader_percent): if exe is None: e = 'test_unit_checkpoint_lenet_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'exe', data_reader_percent) + skeleton_checkpoint_lenet_shared(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_checkpoint.py -k 'test_unit_checkpoint_lenet_distributed_exe' --exe= -def test_unit_checkpoint_lenet_distributed_exe(cluster, dirname, exe, data_reader_percent): +def test_unit_checkpoint_lenet_distributed_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_checkpoint_lenet_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'exe', data_reader_percent) + skeleton_checkpoint_lenet_distributed(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py b/bamboo/unit_tests/test_unit_layer_clamp.py index ddfddd5be8a..d1fda5ac057 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_clamp(cluster, executables, dir_name, compiler_name): +def skeleton_layer_clamp(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_clamp: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,31 @@ def skeleton_layer_clamp(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='clamp', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_clamp_clang6(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'clang6') +def test_unit_layer_clamp_clang6(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_clamp(cluster, exes, dirname, 'clang6', weekly, data_reader_percent) -def test_unit_layer_clamp_gcc7(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'gcc7') +def test_unit_layer_clamp_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_clamp(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_clamp_intel19(cluster, exes, dirname): - skeleton_layer_clamp(cluster, exes, dirname, 'intel19') +def test_unit_layer_clamp_intel19(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_clamp(cluster, exes, dirname, 'intel19', weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_layer_clamp.py -k 'test_unit_layer_clamp_exe' --exe= -def test_unit_layer_clamp_exe(cluster, dirname, exe): +def test_unit_layer_clamp_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_clamp_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_clamp(cluster, exes, dirname, 'exe') + skeleton_layer_clamp(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py 
index ff10756dc5a..fc9961a222a 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name): +def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_covariance: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,38 @@ def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='covariance', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_covariance_clang6(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'clang6') +def test_unit_layer_covariance_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_covariance(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_covariance_gcc7(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'gcc7') +def test_unit_layer_covariance_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_covariance(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_layer_covariance_intel19(cluster, exes, dirname): - skeleton_layer_covariance(cluster, exes, dirname, 'intel19') +def test_unit_layer_covariance_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_covariance(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_covariance_exe' --exe= -def test_unit_layer_covariance_exe(cluster, dirname, exe): +def test_unit_layer_covariance_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_covariance_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_covariance(cluster, exes, dirname, 'exe') + skeleton_layer_covariance(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index 8282974d850..4ab8c576325 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_elu(cluster, executables, dir_name, compiler_name): +def skeleton_layer_elu(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_elu: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,31 @@ def skeleton_layer_elu(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='elu', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) 
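    # Note: every layer-test skeleton in this series follows this same shape.
    # The pytest fixtures `weekly` and `data_reader_percent` are threaded from
    # the per-compiler test entry points into the skeleton and handed to
    # tools.get_command(), which (an assumption here, not shown in this patch)
    # maps them onto the --data_reader_percent flag and the weekly-only
    # defaults of the generated LBANN command before os.system() runs it.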
tools.assert_success(return_code, error_file_name) -def test_unit_layer_elu_clang6(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'clang6') +def test_unit_layer_elu_clang6(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_elu(cluster, exes, dirname, 'clang6', weekly, data_reader_percent) -def test_unit_layer_elu_gcc7(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'gcc7') +def test_unit_layer_elu_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_elu(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_elu_intel19(cluster, exes, dirname): - skeleton_layer_elu(cluster, exes, dirname, 'intel19') +def test_unit_layer_elu_intel19(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_elu(cluster, exes, dirname, 'intel19', weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_layer_elu.py -k 'test_unit_layer_elu_exe' --exe= -def test_unit_layer_elu_exe(cluster, dirname, exe): +def test_unit_layer_elu_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_elu_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_elu(cluster, exes, dirname, 'exe') + skeleton_layer_elu(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py index 6212b317acd..b2e7d2058cb 100644 --- a/bamboo/unit_tests/test_unit_layer_identity.py +++ b/bamboo/unit_tests/test_unit_layer_identity.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_identity(cluster, executables, dir_name, compiler_name): +def skeleton_layer_identity(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_identity: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,35 @@ def skeleton_layer_identity(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='identity', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_identity_clang6(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'clang6') +def test_unit_layer_identity_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_identity(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_identity_gcc7(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'gcc7') +def test_unit_layer_identity_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_identity(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_identity_intel19(cluster, exes, dirname): - skeleton_layer_identity(cluster, exes, dirname, 'intel19') +def test_unit_layer_identity_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_identity(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_layer_identity.py -k 'test_unit_layer_identity_exe' --exe= -def 
test_unit_layer_identity_exe(cluster, dirname, exe): +def test_unit_layer_identity_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_identity_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_identity(cluster, exes, dirname, 'exe') + skeleton_layer_identity(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index 6a7ae7e7d54..92fdd3c36c7 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_l1_norm(cluster, executables, dir_name, compiler_name): +def skeleton_layer_l1_norm(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_l1_norm: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,34 @@ def skeleton_layer_l1_norm(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='l1_norm', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_l1_norm_clang6(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'clang6') +def test_unit_layer_l1_norm_clang6(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_l1_norm(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_l1_norm_gcc7(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'gcc7') +def test_unit_layer_l1_norm_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_l1_norm(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_l1_norm_intel19(cluster, exes, dirname): - skeleton_layer_l1_norm(cluster, exes, dirname, 'intel19') +def test_unit_layer_l1_norm_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_l1_norm(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l1_norm_exe' --exe= -def test_unit_layer_l1_norm_exe(cluster, dirname, exe): +def test_unit_layer_l1_norm_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_l1_norm_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_l1_norm(cluster, exes, dirname, 'exe') + skeleton_layer_l1_norm(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py b/bamboo/unit_tests/test_unit_layer_l2_norm2.py index 5a13f0da5f3..90901e881b1 100644 --- a/bamboo/unit_tests/test_unit_layer_l2_norm2.py +++ b/bamboo/unit_tests/test_unit_layer_l2_norm2.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name): +def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_l2_norm2: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,29 
+18,34 @@ def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='l2_norm2', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_l2_norm2_clang6(cluster, exes, dirname): - skeleton_layer_l2_norm2(cluster, exes, dirname, 'clang6') +def test_unit_layer_l2_norm2_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_l2_norm2(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_l2_norm2_gcc7(cluster, exes, dirname): - skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc7') +def test_unit_layer_l2_norm2_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_l2_norm2_intel19(cluster, exes, dirname): - skeleton_layer_l2_norm2(cluster, exes, dirname, 'intel19') +def test_unit_layer_l2_norm2_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_l2_norm2(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l2_norm2_exe' --exe= -def test_unit_layer_l2_norm2_exe(cluster, dirname, exe): +def test_unit_layer_l2_norm2_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_l2_norm2_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_l2_norm2(cluster, exes, dirname, 'exe') + skeleton_layer_l2_norm2(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index ca8d02e245d..73e5f02d769 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_leaky_relu(cluster, executables, dir_name, compiler_name): +def skeleton_layer_leaky_relu(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_leaky_relu: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,37 @@ def skeleton_layer_leaky_relu(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='leaky_relu', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_leaky_relu_clang6(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'clang6') +def test_unit_layer_leaky_relu_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_leaky_relu(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_leaky_relu_gcc7(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'gcc7') +def 
test_unit_layer_leaky_relu_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_leaky_relu(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_layer_leaky_relu_intel19(cluster, exes, dirname): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'intel19') +def test_unit_layer_leaky_relu_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_leaky_relu(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_leaky_relu_exe' --exe= -def test_unit_layer_leaky_relu_exe(cluster, dirname, exe): +def test_unit_layer_leaky_relu_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_leaky_relu_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_leaky_relu(cluster, exes, dirname, 'exe') + skeleton_layer_leaky_relu(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index dbbb0663d5d..a4265f52bb9 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name): +def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_log_sigmoid: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,38 @@ def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='log_sigmoid', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_log_sigmoid_clang6(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'clang6') +def test_unit_layer_log_sigmoid_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_log_sigmoid(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_log_sigmoid_gcc7(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'gcc7') +def test_unit_layer_log_sigmoid_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_log_sigmoid(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_layer_log_sigmoid_intel19(cluster, exes, dirname): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'intel19') +def test_unit_layer_log_sigmoid_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_log_sigmoid(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_layer_log_sigmoid.py -k 'test_unit_layer_log_sigmoid_exe' --exe= -def test_unit_layer_log_sigmoid_exe(cluster, dirname, exe): +def test_unit_layer_log_sigmoid_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_log_sigmoid_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - 
skeleton_layer_log_sigmoid(cluster, exes, dirname, 'exe') + skeleton_layer_log_sigmoid(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 0be482f7701..1dc2d7bab74 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name): +def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_log_softmax: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -18,30 +19,38 @@ def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='log_softmax', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_log_softmax_clang6(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'clang6') +def test_unit_layer_log_softmax_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_log_softmax(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_log_softmax_gcc7(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc7') +def test_unit_layer_log_softmax_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_layer_log_softmax_intel19(cluster, exes, dirname): - skeleton_layer_log_softmax(cluster, exes, dirname, 'intel19') +def test_unit_layer_log_softmax_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_log_softmax(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_log_softmax_exe' --exe= -def test_unit_layer_log_softmax_exe(cluster, dirname, exe): +def test_unit_layer_log_softmax_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_log_softmax_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_log_softmax(cluster, exes, dirname, 'exe') + skeleton_layer_log_softmax(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index e32a08c77b4..b37f79e6ef0 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_mean_absolute_error(cluster, executables, dir_name, compiler_name): +def skeleton_layer_mean_absolute_error(cluster, executables, dir_name, + compiler_name, weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_mean_absolute_error: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,39 @@ def skeleton_layer_mean_absolute_error(cluster, executables, 
dir_name, compiler_ time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='mean_absolute_error', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_mean_absolute_error_clang6(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'clang6') +def test_unit_layer_mean_absolute_error_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_mean_absolute_error_gcc7(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'gcc7') +def test_unit_layer_mean_absolute_error_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_layer_mean_absolute_error_intel19(cluster, exes, dirname): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'intel19') +def test_unit_layer_mean_absolute_error_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_mean_absolute_error_exe' --exe= -def test_unit_layer_mean_absolute_error_exe(cluster, dirname, exe): +def test_unit_layer_mean_absolute_error_exe(cluster, dirname, exe, + weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_mean_absolute_error_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'exe') + skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index 851c3137c2c..ca4cfa92b0d 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_relu(cluster, executables, dir_name, compiler_name): +def skeleton_layer_relu(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_relu: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -18,30 +19,31 @@ def skeleton_layer_relu(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='relu', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_relu_clang6(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'clang6') +def test_unit_layer_relu_clang6(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_relu(cluster, exes, dirname, 'clang6', weekly, data_reader_percent) -def test_unit_layer_relu_gcc7(cluster, exes, dirname): - 
skeleton_layer_relu(cluster, exes, dirname, 'gcc7') +def test_unit_layer_relu_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_relu(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_relu_intel19(cluster, exes, dirname): - skeleton_layer_relu(cluster, exes, dirname, 'intel19') +def test_unit_layer_relu_intel19(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_relu(cluster, exes, dirname, 'intel19', weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_layer_relu.py -k 'test_unit_layer_relu_exe' --exe= -def test_unit_layer_relu_exe(cluster, dirname, exe): +def test_unit_layer_relu_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_relu_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_relu(cluster, exes, dirname, 'exe') + skeleton_layer_relu(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index 8f1f7b69fb6..778628e5ee9 100644 --- a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_selu(cluster, executables, dir_name, compiler_name): +def skeleton_layer_selu(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_selu: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -18,30 +19,31 @@ def skeleton_layer_selu(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='selu', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_selu_clang6(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'clang6') +def test_unit_layer_selu_clang6(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_selu(cluster, exes, dirname, 'clang6', weekly, data_reader_percent) -def test_unit_layer_selu_gcc7(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'gcc7') +def test_unit_layer_selu_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_selu(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_selu_intel19(cluster, exes, dirname): - skeleton_layer_selu(cluster, exes, dirname, 'intel19') +def test_unit_layer_selu_intel19(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_selu(cluster, exes, dirname, 'intel19', weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_layer_selu.py -k 'test_unit_layer_selu_exe' --exe= -def test_unit_layer_selu_exe(cluster, dirname, exe): +def test_unit_layer_selu_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_selu_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_selu(cluster, exes, dirname, 'exe') + skeleton_layer_selu(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index 
c03895f425e..f2cdbc3fafa 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_sigmoid(cluster, executables, dir_name, compiler_name): +def skeleton_layer_sigmoid(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_sigmoid: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -18,30 +19,34 @@ def skeleton_layer_sigmoid(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='sigmoid', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_sigmoid_clang6(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'clang6') +def test_unit_layer_sigmoid_clang6(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_sigmoid(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_sigmoid_gcc7(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'gcc7') +def test_unit_layer_sigmoid_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_sigmoid(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_sigmoid_intel19(cluster, exes, dirname): - skeleton_layer_sigmoid(cluster, exes, dirname, 'intel19') +def test_unit_layer_sigmoid_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_sigmoid(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_layer_sigmoid.py -k 'test_unit_layer_sigmoid_exe' --exe= -def test_unit_layer_sigmoid_exe(cluster, dirname, exe): +def test_unit_layer_sigmoid_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_sigmoid_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_sigmoid(cluster, exes, dirname, 'exe') + skeleton_layer_sigmoid(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index 3e06ae2b890..80a3d3f51a0 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_softmax(cluster, executables, dir_name, compiler_name): +def skeleton_layer_softmax(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_softmax: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -18,30 +19,34 @@ def skeleton_layer_softmax(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='softmax', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def 
test_unit_layer_softmax_clang6(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'clang6') +def test_unit_layer_softmax_clang6(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_softmax(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_softmax_gcc7(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'gcc7') +def test_unit_layer_softmax_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_softmax(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_softmax_intel19(cluster, exes, dirname): - skeleton_layer_softmax(cluster, exes, dirname, 'intel19') +def test_unit_layer_softmax_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_softmax(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_softmax_exe' --exe= -def test_unit_layer_softmax_exe(cluster, dirname, exe): +def test_unit_layer_softmax_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_softmax_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_softmax(cluster, exes, dirname, 'exe') + skeleton_layer_softmax(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py index fed22827d05..c9e1ef426ea 100644 --- a/bamboo/unit_tests/test_unit_layer_softplus.py +++ b/bamboo/unit_tests/test_unit_layer_softplus.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_softplus(cluster, executables, dir_name, compiler_name): +def skeleton_layer_softplus(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_softplus: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,35 @@ def skeleton_layer_softplus(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='softplus', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_softplus_clang6(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'clang6') +def test_unit_layer_softplus_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_softplus(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_softplus_gcc7(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'gcc7') +def test_unit_layer_softplus_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_softplus(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_softplus_intel19(cluster, exes, dirname): - skeleton_layer_softplus(cluster, exes, dirname, 'intel19') +def test_unit_layer_softplus_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_softplus(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_layer_softplus.py -k 'test_unit_layer_softplus_exe' --exe= -def 
test_unit_layer_softplus_exe(cluster, dirname, exe): +def test_unit_layer_softplus_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_softplus_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_softplus(cluster, exes, dirname, 'exe') + skeleton_layer_softplus(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py index 878e4e4474b..e47b10ae649 100644 --- a/bamboo/unit_tests/test_unit_layer_softsign.py +++ b/bamboo/unit_tests/test_unit_layer_softsign.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_softsign(cluster, executables, dir_name, compiler_name): +def skeleton_layer_softsign(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_softsign: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,34 +18,40 @@ def skeleton_layer_softsign(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='softsign', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_softsign_clang6(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'clang6') +def test_unit_layer_softsign_clang6(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_softsign(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_softsign_gcc7(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'gcc7') +def test_unit_layer_softsign_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_softsign(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_softsign_intel19(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'intel19') +def test_unit_layer_softsign_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_softsign(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) -def test_unit_layer_softsign_intel19(cluster, exes, dirname): - skeleton_layer_softsign(cluster, exes, dirname, 'intel19') +def test_unit_layer_softsign_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_softsign(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_layer_softsign.py -k 'test_unit_layer_softsign_exe' --exe= -def test_unit_layer_softsign_exe(cluster, dirname, exe): +def test_unit_layer_softsign_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_softsign_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_softsign(cluster, exes, dirname, 'exe') + skeleton_layer_softsign(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py b/bamboo/unit_tests/test_unit_layer_squared_difference.py index 768cc93ff85..9a219f5463a 100644 --- a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ 
b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_squared_difference(cluster, executables, dir_name, compiler_name): +def skeleton_layer_squared_difference(cluster, executables, dir_name, + compiler_name, weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_squared_difference: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,39 @@ def skeleton_layer_squared_difference(cluster, executables, dir_name, compiler_n time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='squared_difference', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_squared_difference_clang6(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'clang6') +def test_unit_layer_squared_difference_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_squared_difference(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_squared_difference_gcc7(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'gcc7') +def test_unit_layer_squared_difference_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_squared_difference(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_layer_squared_difference_intel19(cluster, exes, dirname): - skeleton_layer_squared_difference(cluster, exes, dirname, 'intel19') +def test_unit_layer_squared_difference_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_squared_difference(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_layer_squared_difference.py -k 'test_unit_layer_squared_difference_exe' --exe= -def test_unit_layer_squared_difference_exe(cluster, dirname, exe): +def test_unit_layer_squared_difference_exe(cluster, dirname, exe, + weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_squared_difference_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_squared_difference(cluster, exes, dirname, 'exe') + skeleton_layer_squared_difference(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index 4d788d005ca..f53292a3baf 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_tessellate(cluster, executables, dir_name, compiler_name): +def skeleton_layer_tessellate(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_tessellate: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,37 @@ def skeleton_layer_tessellate(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='tessellate', optimizer_name='sgd', - 
output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_tessellate_clang6(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'clang6') +def test_unit_layer_tessellate_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_tessellate(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_tessellate_gcc7(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'gcc7') +def test_unit_layer_tessellate_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_tessellate(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_layer_tessellate_intel19(cluster, exes, dirname): - skeleton_layer_tessellate(cluster, exes, dirname, 'intel19') +def test_unit_layer_tessellate_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_tessellate(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_layer_tessellate.py -k 'test_unit_layer_tessellate_exe' --exe= -def test_unit_layer_tessellate_exe(cluster, dirname, exe): +def test_unit_layer_tessellate_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_tessellate_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_tessellate(cluster, exes, dirname, 'exe') + skeleton_layer_tessellate(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index bccafe90be9..5968a39f585 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -5,7 +5,8 @@ import os -def skeleton_layer_variance(cluster, executables, dir_name, compiler_name): +def skeleton_layer_variance(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_layer_variance: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -18,30 +19,35 @@ def skeleton_layer_variance(cluster, executables, dir_name, compiler_name): time_limit=10, num_processes=2, dir_name=dir_name, data_reader_name='synthetic', + data_reader_percent=data_reader_percent, model_folder='tests/layer_tests', model_name='variance', optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_layer_variance_clang6(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'clang6') +def test_unit_layer_variance_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_variance(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_layer_variance_gcc7(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 'gcc7') +def test_unit_layer_variance_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_layer_variance(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_layer_variance_intel19(cluster, exes, dirname): - skeleton_layer_variance(cluster, exes, dirname, 
'intel19') +def test_unit_layer_variance_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_layer_variance(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_variance_exe' --exe= -def test_unit_layer_variance_exe(cluster, dirname, exe): +def test_unit_layer_variance_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_layer_variance_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_layer_variance(cluster, exes, dirname, 'exe') + skeleton_layer_variance(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index 5c0f65998c3..2bc0bef69dc 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -2,10 +2,11 @@ sys.path.insert(0, '../common_python') import tools import pytest -import os, sys +import os -def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name, data_reader_percent=1.0): +def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_lbann2_reload: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -28,7 +29,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name, data_r optimizer_name='sgd', num_epochs=2, output_file_name=output_file_name, - error_file_name=error_file_name) + error_file_name=error_file_name, weekly=weekly) return_code_no_ckpt = os.system(command) tools.assert_success(return_code_no_ckpt, error_file_name) @@ -45,7 +46,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name, data_r ckpt_dir=ckpt_dir, model_folder='tests', model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', output_file_name=output_file_name, - error_file_name=error_file_name) + error_file_name=error_file_name, weekly=weekly) return_code_ckpt_1 = os.system(command) tools.assert_success(return_code_ckpt_1, error_file_name) @@ -62,13 +63,14 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name, data_r model_path='../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext', num_epochs=2, optimizer_name='sgd', output_file_name=output_file_name, - error_file_name=error_file_name) + error_file_name=error_file_name, weekly=weekly) return_code_ckpt_2 = os.system(command) tools.assert_success(return_code_ckpt_2, error_file_name) # os.system('rm lbann2_ckpt/model0-epoch*') # os.system('rm lbann2_nockpt/model0-epoch*') - diff_result = os.system('diff -rq {ckpt} {no_ckpt}'.format(ckpt=ckpt_dir, no_ckpt=no_ckpt_dir)) + diff_result = os.system('diff -rq {ckpt} {no_ckpt}'.format( + ckpt=ckpt_dir, no_ckpt=no_ckpt_dir)) allow_epsilon_diff = False if allow_epsilon_diff and (diff_result != 0): equal_within_epsilon = True @@ -110,23 +112,26 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name, data_r assert diff_result == 0 -def test_unit_lbann2_reload_clang6(cluster, exes, dirname): - skeleton_lbann2_reload(cluster, exes, dirname, 'clang6') +def test_unit_lbann2_reload_clang6(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_lbann2_reload(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_lbann2_reload_gcc7(cluster, exes, dirname): - skeleton_lbann2_reload(cluster, exes, dirname, 
'gcc7') +def test_unit_lbann2_reload_gcc7(cluster, exes, dirname, weekly, data_reader_percent): + skeleton_lbann2_reload(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) -def test_unit_lbann2_reload_intel19(cluster, exes, dirname): - skeleton_lbann2_reload(cluster, exes, dirname, 'intel19') +def test_unit_lbann2_reload_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_lbann2_reload(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_lbann2_reload.py -k 'test_unit_lbann2_reload_exe' --exe= -def test_unit_lbann2_reload_exe(cluster, dirname, exe, data_reader_percent): +def test_unit_lbann2_reload_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_lbann2_reload_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_lbann2_reload(cluster, exes, dirname, 'exe', data_reader_percent) + skeleton_lbann2_reload(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_mnist_conv_graph.py b/bamboo/unit_tests/test_unit_mnist_conv_graph.py index 0c6a6610368..1ef04a0ce1a 100644 --- a/bamboo/unit_tests/test_unit_mnist_conv_graph.py +++ b/bamboo/unit_tests/test_unit_mnist_conv_graph.py @@ -5,7 +5,8 @@ import os -def skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name): +def skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_mnist_conv_graph: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -21,32 +22,40 @@ def skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name): num_nodes=1, time_limit=tl, num_processes=1, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', model_folder='tests', + data_reader_name='mnist', + data_reader_percent=data_reader_percent, + model_folder='tests', model_name='mnist_conv_graph', optimizer_name='adam', output_file_name=output_file_name, - error_file_name=error_file_name) + error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_mnist_conv_graph_clang6(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'clang6') +def test_unit_mnist_conv_graph_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_mnist_conv_graph(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_mnist_conv_graph_gcc7(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc7') +def test_unit_mnist_conv_graph_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_mnist_conv_graph_intel19(cluster, exes, dirname): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'intel19') +def test_unit_mnist_conv_graph_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_mnist_conv_graph(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_conv_graph.py -k 'test_unit_mnist_conv_graph_exe' --exe= -def test_unit_mnist_conv_graph_exe(cluster, dirname, exe): +def test_unit_mnist_conv_graph_exe(cluster, dirname, exe, weekly, data_reader_percent): if exe is None: e = 'test_unit_mnist_conv_graph_exe: Non-local testing' print('Skip - ' + e) 
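        # The `_exe` variants only run when pytest is invoked locally with
        # --exe=<path-to-lbann-executable>; without that option the `exe`
        # fixture is None and the test is skipped here instead of failing.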
pytest.skip(e) exes = {'exe': exe} - skeleton_mnist_conv_graph(cluster, exes, dirname, 'exe') + skeleton_mnist_conv_graph(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py index 4321d0f0cdd..289af72bbb3 100644 --- a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py +++ b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py @@ -7,7 +7,7 @@ def skeleton_mnist_ridge_regression(cluster, executables, dir_name, - compiler_name): + compiler_name, weekly, data_reader_percent): tools.process_executable( 'skeleton_mnist_ridge_regression', compiler_name, executables) @@ -91,8 +91,16 @@ def skeleton_mnist_ridge_regression(cluster, executables, dir_name, 'procs_per_node': 1 } + if data_reader_percent is None: + if weekly: + data_reader_percent = 1.00 + else: + # Nightly + data_reader_percent = 0.10 + lbann_args = '--data_reader_percent={drp}'.format(drp=data_reader_percent) if cluster == 'lassen': - kwargs['lbann_args'] = '--data_filedir_train=/p/gpfs1/brainusr/datasets/MNIST --data_filedir_test=/p/gpfs1/brainusr/datasets/MNIST' + lbann_args += ' --data_filedir_train=/p/gpfs1/brainusr/datasets/MNIST --data_filedir_test=/p/gpfs1/brainusr/datasets/MNIST' + kwargs['lbann_args'] = lbann_args # Run experiment_dir = '{d}/bamboo/unit_tests/experiments/mnist_ridge_regression_{c}'.format( @@ -114,23 +122,31 @@ def skeleton_mnist_ridge_regression(cluster, executables, dir_name, tools.assert_success(return_code, error_file_name) -def test_unit_mnist_ridge_regression_clang6(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'clang6') +def test_unit_mnist_ridge_regression_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_mnist_ridge_regression(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_mnist_ridge_regression_gcc7(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc7') +def test_unit_mnist_ridge_regression_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_mnist_ridge_regression_intel19(cluster, exes, dirname): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'intel19') +def test_unit_mnist_ridge_regression_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_mnist_ridge_regression(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_mnist_ridge_regression_exe' --exe= -def test_unit_mnist_ridge_regression_exe(cluster, dirname, exe): +def test_unit_mnist_ridge_regression_exe(cluster, dirname, exe, + weekly, data_reader_percent): if exe is None: e = 'test_unit_mnist_ridge_regression_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'exe') + skeleton_mnist_ridge_regression(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py index ad289e839c3..9dca795f968 100644 --- a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py +++ b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py @@ -5,7 +5,8 @@ import os -def skeleton_mnist_softmax_classifier(cluster, executables, dir_name, compiler_name): +def 
skeleton_mnist_softmax_classifier(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_mnist_softmax_classifier: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -17,30 +18,39 @@ def skeleton_mnist_softmax_classifier(cluster, executables, dir_name, compiler_n num_processes=1, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', data_reader_name='mnist', + data_reader_percent=data_reader_percent, model_folder='tests', model_name='mnist_softmax_classifier', optimizer_name='adam', - output_file_name=output_file_name, error_file_name=error_file_name) + output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_mnist_softmax_classifier_clang6(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'clang6') +def test_unit_mnist_softmax_classifier_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_mnist_softmax_classifier_gcc7(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc7') +def test_unit_mnist_softmax_classifier_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_mnist_softmax_classifier_intel19(cluster, exes, dirname): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'intel19') +def test_unit_mnist_softmax_classifier_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_softmax_classifier.py -k 'test_unit_mnist_softmax_classifier_exe' --exe= -def test_unit_mnist_softmax_classifier_exe(cluster, dirname, exe): +def test_unit_mnist_softmax_classifier_exe(cluster, dirname, exe, + weekly, data_reader_percent): if exe is None: e = 'test_unit_mnist_softmax_classifier_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'exe') + skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_reconstruction_loss.py b/bamboo/unit_tests/test_unit_reconstruction_loss.py index c5617a0335a..d9fb6aa2b0c 100644 --- a/bamboo/unit_tests/test_unit_reconstruction_loss.py +++ b/bamboo/unit_tests/test_unit_reconstruction_loss.py @@ -5,7 +5,8 @@ import tools -def skeleton_jag_reconstruction_loss(cluster, executables, dir_name, compiler_name): +def skeleton_jag_reconstruction_loss(cluster, executables, dir_name, compiler_name, + weekly, data_reader_percent): if compiler_name not in executables: e = 'skeleton_jag_reconstruction_loss: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) @@ -20,33 +21,42 @@ def skeleton_jag_reconstruction_loss(cluster, executables, dir_name, compiler_na dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/10MJAG/1M_A/100K4trainers', data_reader_name='jag', + data_reader_percent=data_reader_percent, metadata='model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext', model_folder='tests', model_name='jag_single_layer_ae', optimizer_name='adam', 
output_file_name=output_file_name, - error_file_name=error_file_name) + error_file_name=error_file_name, weekly=weekly) return_code = os.system(command) tools.assert_success(return_code, error_file_name) -def test_unit_jag_reconstruction_loss_clang6(cluster, exes, dirname): - skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'clang6') +def test_unit_jag_reconstruction_loss_clang6(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'clang6', + weekly, data_reader_percent) -def test_unit_jag_reconstruction_loss_gcc7(cluster, exes, dirname): - skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'gcc7') +def test_unit_jag_reconstruction_loss_gcc7(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'gcc7', + weekly, data_reader_percent) -def test_unit_jag_reconstruction_loss_intel19(cluster, exes, dirname): - skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'intel19') +def test_unit_jag_reconstruction_loss_intel19(cluster, exes, dirname, + weekly, data_reader_percent): + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'intel19', + weekly, data_reader_percent) # Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_jag_reconstruction_loss_exe' --exe= -def test_unit_jag_reconstruction_loss_exe(cluster, dirname, exe): +def test_unit_jag_reconstruction_loss_exe(cluster, dirname, exe, + weekly, data_reader_percent): if exe is None: e = 'test_unit_jag_reconstruction_loss_exe: Non-local testing' print('Skip - ' + e) pytest.skip(e) exes = {'exe': exe} - skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'exe') + skeleton_jag_reconstruction_loss(cluster, exes, dirname, 'exe', + weekly, data_reader_percent) diff --git a/docs/continuous_integration.rst b/docs/continuous_integration.rst index 8735fde0b3f..ddced95ebff 100644 --- a/docs/continuous_integration.rst +++ b/docs/continuous_integration.rst @@ -59,7 +59,7 @@ They consist of an identical list of tasks: The tests in Task 2 run -:bash:`$PYTHON -m pytest -s [--weekly] --junitxml=results.xml`, +:bash:`$PYTHON -m pytest -s -vv --durations=0 [--weekly] --junitxml=results.xml`, which will run all the pytests in the job's associated directory. Note that :bash:`$PYTHON` refers to the Python build to use. Also note that only Weekly Develop adds the :bash:`--weekly` option. @@ -80,7 +80,8 @@ Writing Your Own Tests A side effect of our Bamboo setup is that tests must be written using pytest. Test files must begin with :bash:`test_` to be recognized by pytest. Individual test methods must also begin with :python:`test_`. -Test methods should use the :python:`assert` keyword. +Test methods should use the :python:`assert` keyword or raise an +:python:`AssertionError`. A test will only fail if the assertion turns out to be false. Not putting an assertion will automatically cause the test to pass. @@ -133,21 +134,6 @@ Alternatively, you can determine the agent from one of the first lines in the build logs: "Build working directory is /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir/". -Some build logs can be very large (e.g. over 100,000 lines). -Beyond about 5,000 lines it is a good idea to download a log instead of -viewing it in the browser. -Beyond about 10,000 lines, some text editors may experience slowness. -At this point it is good to split up the files with -:bash:`split -l 10000 `, which creates files of the form `x*` and of -length 10,000. 
-You can then run a command such as :bash:`grep -in "Errors for:" x*` to find -which files have reported errors. -After you are done, you can remove the files with :bash:`rm x*`. -Note that the original log file is not modified by any of these steps. - -As an alternative to splitting the file, -errors can be searched for with -:bash:`grep -in -A "Errors for:" `. Bamboo Agent Properties ---------------------------------------- @@ -197,8 +183,9 @@ There should be a line above the test that gives the command to run the test locally, likely in the following form: :bash:`python -m pytest -s .py -k '' --exe=`. -At this time, there is no way to run all the :python:`_exe` tests in a subdirectory -and only those. +If you have an executable, you can run the :python:`_exe` tests with +:bash:`local_test.sh`. Use :bash:`local_test.cmd` as a template for writing +a batch script. You can run only integration tests, only unit tests, or both. Helpful Files ---------------------------------------- @@ -206,7 +193,11 @@ Helpful Files First, run :bash:`sudo lbannusr`. To look at output and error from previous builds: -:bash:`cd /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir//bamboo//` +:bash:`cd /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir//bamboo//`. +If the test uses the Python Front-End, use: +:bash:`cd /usr/workspace/wsb/lbannusr/bamboo//xml-data/build-dir//bamboo//experiments/`. +(Note that these files can also be read by clicking on the "Artifacts" tab on +the Bamboo build). To look at archived results from previous builds: :bash:`cd /usr/workspace/wsb/lbannusr/archives/` From 91c769f705fbedccd3f8dbf2efa7de3f9d288692 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 20 Sep 2019 10:47:31 -0700 Subject: [PATCH 307/634] Clean up protobuf message for fully-connected layer (#1251) * Remove weight_initialization field from FC layer protobuf * Remove regularization fields from FC layer protobuf * Remove num_neurons_is_num_labels field from FC layer protobuf * Remove fields in FC layer protobuf to get dims from reader * Remove fields in FC layer protobuf to get slice points from reader * Remove bias_initial_value field from FC layer protobuf * Expand documentation of FC layer --- .../lbann/layers/learning/fully_connected.hpp | 14 +++++++- .../model_autoencoder_chem_ecfp.prototext | 1 - .../model_autoencoder_mnist.prototext | 8 ----- .../models/candle/pilot1/combo.prototext | 7 ---- model_zoo/models/jag/wae.prototext | 3 +- .../jag/wae_cycle_gan/cycle_gan.prototext | 9 ++--- .../wae_cycle_gan/cycle_gan_only.prototext | 9 ++--- .../models/jag/wae_cycle_gan/wae.prototext | 3 +- .../jag/wae_cycle_gan/wae_fw_inv.prototext | 12 +++---- .../jag/wae_cycle_gan/wae_nobn.prototext | 3 +- .../siamese/finetune-cub/model_cub.prototext | 6 ++-- .../model_cub_batchnorm.prototext | 6 ++-- ...batchnorm_transferred_and_frozen.prototext | 6 ++-- ..._alexnet_batchnorm_dag_frozen_bn.prototext | 2 +- .../jag_single_layer_ae.prototext | 3 +- .../tests/model_jag_single_layer_ae.prototext | 3 +- src/proto/factories/layer_factory.cpp | 34 +----------------- src/proto/factories/layer_graph_factory.cpp | 36 ------------------- src/proto/layers.proto | 33 +++++++++-------- 19 files changed, 58 insertions(+), 140 deletions(-) diff --git a/include/lbann/layers/learning/fully_connected.hpp b/include/lbann/layers/learning/fully_connected.hpp index 24445b240e4..b7e7bcc2b29 100644 --- a/include/lbann/layers/learning/fully_connected.hpp +++ b/include/lbann/layers/learning/fully_connected.hpp @@ -36,7 +36,19 @@ namespace lbann { 
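+// Illustrative sketch only (not part of this change): the layer documented
+// below computes y = W * vec(x) + b.  In plain loops over raw arrays, with
+// W stored row-major as (num_neurons x input_size):
+//
+//   for (int i = 0; i < num_neurons; ++i) {
+//     y[i] = has_bias ? b[i] : 0;             // optional entry-wise bias
+//     for (int j = 0; j < input_size; ++j) {
+//       y[i] += W[i * input_size + j] * x[j]; // x = flattened input tensor
+//     }
+//   }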
-/** @brief Perform an affine transformation. */ +/** @brief Affine transformation + * + * Flattens the input tensor, multiplies with a weights matrix, and + * optionally applies an entry-wise bias. Following the + * column-vector convention: + * @f[ y = W * \text{vec}(x) + b @f] + * + * Two weights are required if bias is applied: the linearity and the + * bias. Only the linearity weights are required if bias is not + * applied. If weights aren't provided, the linearity weights are + * initialized with He normal initialization and the bias weights are + * initialized to zero. + */ template class fully_connected_layer : public learning_layer { public: diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext index 31be0dc7969..7774b0ffbb3 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext @@ -195,7 +195,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 250 - weight_initialization: "glorot_uniform" has_bias: true } } diff --git a/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext b/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext index 7a7a9ba3fe0..f8321142088 100644 --- a/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext +++ b/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext @@ -59,7 +59,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 1000 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -82,7 +81,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 500 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -105,7 +103,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 250 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -129,7 +126,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 30 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -142,7 +138,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 250 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -165,7 +160,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 500 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -189,7 +183,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 1000 - weight_initialization: "glorot_uniform" has_bias: true } } @@ -212,7 +205,6 @@ model { data_layout: "model_parallel" hint_layer: "image" fully_connected { - weight_initialization: "glorot_uniform" has_bias: true } } diff --git a/model_zoo/models/candle/pilot1/combo.prototext b/model_zoo/models/candle/pilot1/combo.prototext index 0ceac3cfec4..dcb39f8031e 100644 --- a/model_zoo/models/candle/pilot1/combo.prototext +++ b/model_zoo/models/candle/pilot1/combo.prototext @@ -91,7 +91,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 1000 - weight_initialization: "he_normal" has_bias: true } } @@ -122,7 +121,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 1000 - weight_initialization: "he_normal" has_bias: true } } @@ -153,7 +151,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 1000 - weight_initialization: "he_normal" has_bias: true } } @@ -406,7 +403,6 @@ model { data_layout: "model_parallel" fully_connected { 
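+      # weight_initialization was removed from this message (see the commit
+      # message above); the value removed below was "he_normal", which now
+      # matches the documented default, so behavior should be unchanged.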
num_neurons: 1000 - weight_initialization: "he_normal" has_bias: true } } @@ -437,7 +433,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 1000 - weight_initialization: "he_normal" has_bias: true } } @@ -468,7 +463,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 1000 - weight_initialization: "he_normal" has_bias: true } } @@ -499,7 +493,6 @@ model { data_layout: "model_parallel" fully_connected { num_neurons: 1 - weight_initialization: "he_normal" has_bias: true } } diff --git a/model_zoo/models/jag/wae.prototext b/model_zoo/models/jag/wae.prototext index 9a87ecfffba..8c7a125c64a 100644 --- a/model_zoo/models/jag/wae.prototext +++ b/model_zoo/models/jag/wae.prototext @@ -472,9 +472,8 @@ model { parents: "decode1_dropout" name: "decode0" data_layout: "data_parallel" + hint_layer: "image_data_dummy" fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } } diff --git a/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext b/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext index a5ce5742c6b..408daf91122 100644 --- a/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext @@ -297,8 +297,6 @@ model { layer { fully_connected { #num_neurons: 2500 - #get_slice_points_from_reader: "independent" - #get_num_neurons_of_slice_from_reader: [ 1 ] #replace image_dim with latent_dim num_neurons: 20 has_bias: true @@ -307,6 +305,7 @@ model { data_layout: "data_parallel" weights: "gen1fc4linearity gen1fc4bias" parents: "gen1leaky_relu3" + # hint_layer: "image_data_dummy" } #concat latenty sample (image_data_dummy) and param layer { @@ -597,14 +596,13 @@ model { layer { fully_connected { #num_neurons: 11 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4" data_layout: "data_parallel" weights: "gen2fc4linearity gen2fc4bias" parents: "gen2leaky_relu3" + hint_layer: "param_data_id" } layer { name: "concat_param_n_img" @@ -1107,14 +1105,13 @@ model { layer { fully_connected { #num_neurons: 11 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4_cyclic" data_layout: "data_parallel" weights: "gen2fc4linearity gen2fc4bias" parents: "gen2leaky_relu3_cyclic" + hint_layer: "param_data_id" } layer { name: "L_cyc_x" diff --git a/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext b/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext index 858cf6909ce..d0d79a4244f 100644 --- a/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext @@ -189,8 +189,6 @@ model { layer { fully_connected { #num_neurons: 2500 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] #replace image_dim with latent_dim #num_neurons: 20 has_bias: true @@ -199,6 +197,7 @@ model { data_layout: "data_parallel" weights: "gen1fc4linearity gen1fc4bias" parents: "gen1leaky_relu3" + hint_layer: "image_data_dummy" } #concat latenty sample (image_data_dummy) and param layer { @@ -479,14 +478,13 @@ model { layer { fully_connected { #num_neurons: 11 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4" data_layout: "data_parallel" weights: "gen2fc4linearity gen2fc4bias" parents: "gen2leaky_relu3" + hint_layer: "param_data_id" } layer { name: 
"concat_param_n_img" @@ -967,14 +965,13 @@ model { layer { fully_connected { #num_neurons: 11 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4_cyclic" data_layout: "data_parallel" weights: "gen2fc4linearity gen2fc4bias" parents: "gen2leaky_relu3_cyclic" + hint_layer: "param_data_id" } layer { name: "L_cyc_x" diff --git a/model_zoo/models/jag/wae_cycle_gan/wae.prototext b/model_zoo/models/jag/wae_cycle_gan/wae.prototext index 5234bbb6625..ebefbe75fa5 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae.prototext @@ -599,9 +599,8 @@ model { name: "decode0" weights: "decode0linearity decode0bias" data_layout: "data_parallel" + hint_layer: "image_data_dummy" fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } } diff --git a/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext b/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext index aa0e545a486..b8e70ace800 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext @@ -448,10 +448,9 @@ model { name: "decode0" data_layout: "data_parallel" weights: "decode0linearity decode0bias" + hint_layer: "image_data_id" fully_connected { #num_neurons: 16384 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } } @@ -620,10 +619,9 @@ model { name: "ae_decode0" data_layout: "data_parallel" weights: "decode0linearity decode0bias" + hint_layer: "image_data_id" fully_connected { #num_neurons: 16384 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } } @@ -804,14 +802,13 @@ model { layer { fully_connected { #num_neurons: 11 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4" data_layout: "data_parallel" weights: "gen2fc4linearity gen2fc4bias" parents: "gen2leaky_relu3" + hint_layer: "param_data_id" } @@ -900,14 +897,13 @@ model { layer { fully_connected { #num_neurons: 11 - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 2 ] has_bias: true } name: "gen2fc4_cyclic" data_layout: "data_parallel" weights: "gen2fc4linearity gen2fc4bias" parents: "gen2leaky_relu3_cyclic" + hint_layer: "param_data_id" } layer { name: "L_cyc_x" diff --git a/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext b/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext index 3bfe7e17686..4dfbf4d52f8 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext @@ -578,9 +578,8 @@ model { name: "decode0" weights: "decode0linearity decode0bias" data_layout: "data_parallel" + hint_layer: "image_data_dummy" fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } } diff --git a/model_zoo/models/siamese/finetune-cub/model_cub.prototext b/model_zoo/models/siamese/finetune-cub/model_cub.prototext index df6b9335943..af345911707 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub.prototext @@ -616,10 +616,10 @@ model { parents: "drop7_new" name: "fc8_new" data_layout: "model_parallel" + # The number of outputs specific to the dataset used. + # E.g., 200 for CUB, and 431 for CompCars. 
+ hint_layer: "label_new" fully_connected { - # The number of outputs specific to the dataset used. - # E.g., 200 for CUB, and 431 for CompCars. - num_neurons_is_num_labels: true has_bias: false } weights: "fc8_new_linearity fc8_new_bias" diff --git a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext index 034d1cd41c6..1e2b87a0753 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext @@ -770,10 +770,10 @@ model { parents: "drop7_new" name: "fc8_new" data_layout: "data_parallel" + # The number of outputs specific to the dataset used. + # E.g., 200 for CUB, and 431 for CompCars. + hint_layer: "label_new" fully_connected { - # The number of outputs specific to the dataset used. - # E.g., 200 for CUB, and 431 for CompCars. - num_neurons_is_num_labels: true has_bias: false } weights: "fc8_new_linearity fc8_new_bias" diff --git a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext index 3164f38097f..04d0a8c8efe 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext @@ -1032,10 +1032,10 @@ model { name: "fc8_new" children: "prob_new" data_layout: "data_parallel" + # The number of outputs specific to the dataset used. + # E.g., 200 for CUB, and 431 for CompCars. + hint_layer: "label_new" fully_connected { - # The number of outputs specific to the dataset used. - # E.g., 200 for CUB, and 431 for CompCars. 
- num_neurons_is_num_labels: true has_bias: false } weights: "fc8_new_linearity fc8_new_bias" diff --git a/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext index 3c9d16d8d25..555b2736cec 100644 --- a/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext +++ b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext @@ -1610,8 +1610,8 @@ model { name: "fc9" children: "prob" data_layout: "data_parallel" + hint_layer: "label" fully_connected { - num_neurons_is_num_labels: true has_bias: false } weights: "fc9_linearity fc9_bias" diff --git a/model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext b/model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext index 54006968dae..5f1684c27f8 100644 --- a/model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext +++ b/model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext @@ -95,9 +95,8 @@ model { parents: "encodefc1" name: "decode0" data_layout: "data_parallel" + hint_layer: "image_data_dummy" fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } } diff --git a/model_zoo/tests/model_jag_single_layer_ae.prototext b/model_zoo/tests/model_jag_single_layer_ae.prototext index 6ed218e7216..dfcb5501d72 100644 --- a/model_zoo/tests/model_jag_single_layer_ae.prototext +++ b/model_zoo/tests/model_jag_single_layer_ae.prototext @@ -123,9 +123,8 @@ model { parents: "encodeelu" name: "decode" data_layout: "data_parallel" + hint_layer: "image_data_dummy" fully_connected { - get_slice_points_from_reader: "independent" - get_num_neurons_of_slice_from_reader: [ 1 ] has_bias: true } } diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 8ba40ae4df3..8910d8a1114 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -149,41 +149,9 @@ std::unique_ptr construct_layer( // Fully connected layer if (proto_layer.has_fully_connected()) { const auto& params = proto_layer.fully_connected(); - int num_neurons = 0; - std::string num_neurons_method_name; - - if (params.get_num_neurons_of_slice_from_reader_size() > 0) { - num_neurons_method_name = "get_num_neurons_of_slice_from_reader"; - const auto dr_generic = lbann::peek_map(data_readers, execution_mode::training); - const int num_slice_indices = params.get_num_neurons_of_slice_from_reader_size(); - if (dynamic_cast(dr_generic) != nullptr) { - const std::string& var = params.get_slice_points_from_reader(); - bool is_supported = false; /// @todo Remove unneeded function parameter - const auto slice_points = get_slice_points_from_reader(dr_generic, var, is_supported); - for (int i = 0; i < num_slice_indices; ++i) { - const size_t idx = static_cast(params.get_num_neurons_of_slice_from_reader(i)); - if ((idx == 0u) || (idx >= slice_points.size())) { - err << "invalid slice index from get_num_neurons_of_slice_from_reader"; - LBANN_ERROR(err.str()); - } - const int diff = static_cast(slice_points[idx] - slice_points[idx-1]); - num_neurons += diff; - } - } - } else { - num_neurons_method_name = "num_neurons"; - num_neurons = params.num_neurons(); - if (proto_layer.num_neurons_from_data_reader()) { - const auto dr = lbann::peek_map(data_readers, execution_mode::training); - if (!dr) { - LBANN_ERROR("training data reader does not exist!"); - } - num_neurons 
= dr->get_linearized_data_size(); - } - } return lbann::make_unique>( comm, - num_neurons, + params.num_neurons(), params.transpose(), nullptr, params.has_bias()); diff --git a/src/proto/factories/layer_graph_factory.cpp b/src/proto/factories/layer_graph_factory.cpp index 804a9d8fe9c..6bc9f2091ec 100644 --- a/src/proto/factories/layer_graph_factory.cpp +++ b/src/proto/factories/layer_graph_factory.cpp @@ -90,39 +90,6 @@ void setup_hints( } } -void setup_fc_num_neurons( - std::vector& layers, - const std::map& data_readers, - const lbann_data::Model& proto_model) { - std::stringstream err; - for (int i=0; iget_role() == "train") { - std::vector dims(1, t.second->get_num_labels()); - auto&& fc_dp_cpu = dynamic_cast*>(l); - auto&& fc_mp_cpu = dynamic_cast*>(l); -#ifdef LBANN_HAS_GPU - auto&& fc_dp_gpu = dynamic_cast*>(l); - auto&& fc_mp_gpu = dynamic_cast*>(l); -#endif // LBANN_HAS_GPU - if (fc_dp_cpu != nullptr) { fc_dp_cpu->set_output_dims(dims); } - if (fc_mp_cpu != nullptr) { fc_mp_cpu->set_output_dims(dims); } -#ifdef LBANN_HAS_GPU - if (fc_dp_gpu != nullptr) { fc_dp_gpu->set_output_dims(dims); } - if (fc_mp_gpu != nullptr) { fc_mp_gpu->set_output_dims(dims); } -#endif // LBANN_HAS_GPU - } - } - } - } - } -} - /** Setup paired pooling layers for unpooling layers. */ void setup_unpooling_pointers(lbann_comm* comm, std::vector& layers, @@ -280,9 +247,6 @@ std::vector> construct_layer_graph( setup_hints(layer_pointers, names_to_layers, proto_model); setup_unpooling_pointers(comm, layer_pointers, names_to_layers, proto_model); - // Optionally Set num_neurons = num_labels - setup_fc_num_neurons(layer_pointers, data_readers, proto_model); - // Return layer list return layers; diff --git a/src/proto/layers.proto b/src/proto/layers.proto index bd1b684c0e1..550d5bb7a9e 100644 --- a/src/proto/layers.proto +++ b/src/proto/layers.proto @@ -444,22 +444,27 @@ message Layer { ///////////////////// // Learning layers // ///////////////////// + + /** @brief Affine transformation + * + * Flattens the input tensor, multiplies with a weights matrix, and + * optionally applies an entry-wise bias. Following the + * column-vector convention: + * @f[ y = W * \text{vec}(x) + b @f] + * + * Two weights are required if bias is applied: the linearity and the + * bias. Only the linearity weights are required if bias is not + * applied. If weights aren't provided, the linearity weights are + * initialized with He normal initialization and the bias weights are + * initialized to zero. 
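+   *
+   * A minimal prototext use of this message might look like (mirroring
+   * the model zoo files updated in this commit):
+   *
+   *   layer {
+   *     name: "fc1"
+   *     data_layout: "data_parallel"
+   *     fully_connected {
+   *       num_neurons: 1000
+   *       has_bias: true
+   *     }
+   *   }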
+ */ message FullyConnected { + // Output tensor size int64 num_neurons = 1; - string weight_initialization = 2; //DEPRECATED - bool has_bias = 3; //default: true - double bias_initial_value = 4; //default: 0 - double l2_regularization_factor = 5; //default: 0 - double group_lasso_regularization_factor = 6; //default: 0 - bool transpose = 7; - bool num_neurons_is_num_labels = 8; - - bool get_input_dimension_from_reader = 9; - bool get_image_and_scalar_dimension_from_reader = 10; - bool get_image_dimension_from_reader = 11; - bool get_scalar_dimension_from_reader = 12; - repeated uint32 get_num_neurons_of_slice_from_reader = 13; - string get_slice_points_from_reader = 14; + // Whether to apply entry-wise bias + bool has_bias = 2; + // Whether to apply transpose of weights matrix + bool transpose = 3; } message Convolution { From 2f4f4f8f062ce67f6e9def32821c870e8f15463b Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Fri, 20 Sep 2019 11:05:00 -0700 Subject: [PATCH 308/634] Add missed parameters in performance tests --- bamboo/integration_tests/test_integration_performance.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index 5370a6e81fa..d16367693a3 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -110,7 +110,8 @@ def skeleton_performance_lenet_mnist(cluster, dir_name, executables, should_log = True actual_performance = common_code.skeleton( cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, - should_log, compiler_name=compiler_name) + should_log, compiler_name=compiler_name, weekly=weekly, + data_reader_percent=data_reader_percent) run_tests(actual_performance, model_name, dir_name, should_log, compiler_name, cluster) @@ -127,7 +128,8 @@ def skeleton_performance_alexnet(cluster, dir_name, executables, compiler_name, should_log = True actual_performance = common_code.skeleton( cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, - should_log, compiler_name=compiler_name, weekly=weekly) + should_log, compiler_name=compiler_name, weekly=weekly, + data_reader_percent=data_reader_percent) frequency_str = '_nightly' if weekly: frequency_str = '_weekly' @@ -160,6 +162,8 @@ def skeleton_performance_full_alexnet(cluster, dir_name, executables, output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) # No use for data_reader_percent here. + # Keeping it as a parameter since a user may pass it in when + # running all exe tests. if cluster in ['catalyst']: command = 'salloc --nodes 128 %s/bamboo/integration_tests/%s.sh > %s 2> %s' % (dir_name, model_name, output_file_name, error_file_name) elif cluster in ['lassen', 'pascal', 'ray']: From 21580c0073580fd6b387d2819e7423bc29f1bb84 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Fri, 20 Sep 2019 12:52:02 -0700 Subject: [PATCH 309/634] fix some documentation problems --- docs/SourceTreeDoxyfile | 2 +- include/lbann/execution_contexts/execution_context.hpp | 8 ++++---- .../lbann/execution_contexts/sgd_execution_context.hpp | 4 ++-- include/lbann/layers/learning/channelwise_scale_bias.hpp | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/SourceTreeDoxyfile b/docs/SourceTreeDoxyfile index eb38cd65aa7..030ca51b653 100644 --- a/docs/SourceTreeDoxyfile +++ b/docs/SourceTreeDoxyfile @@ -1616,7 +1616,7 @@ PAPER_TYPE = a4wide # If left blank no extra packages will be included. # This tag requires that the tag GENERATE_LATEX is set to YES. -EXTRA_PACKAGES = amsmath +EXTRA_PACKAGES = amsmath, amssymb, amsfonts, latexsym # The LATEX_HEADER tag can be used to specify a personal LaTeX header for the # generated LaTeX document. The header should contain everything until the first diff --git a/include/lbann/execution_contexts/execution_context.hpp b/include/lbann/execution_contexts/execution_context.hpp index d60729710de..703b2a84c2e 100644 --- a/include/lbann/execution_contexts/execution_context.hpp +++ b/include/lbann/execution_contexts/execution_context.hpp @@ -65,13 +65,13 @@ class execution_context { } /** @brief Current step in the training algorithm - * @detailed Step counts the number of iterations in the training + * @details Step counts the number of iterations in the training * algorithm's internal state */ size_t get_step() const noexcept { return m_step; } /** @brief Increment the current step in the training algorithm - * @detailed Increment the step count in the training + * @details Increment the step count in the training * algorithm's internal state */ void inc_step() noexcept { ++m_step; } @@ -139,13 +139,13 @@ class execution_context { execution_mode m_execution_mode = execution_mode::training; /** @brief Current step in the training algorithm - * @detailed Step counts the number of iterations in the training + * @details Step counts the number of iterations in the training * algorithm's internal state */ size_t m_step = 0; /** @brief Whether to terminate training. - * @detailed If true, training will terminate immediately before + * @details If true, training will terminate immediately before * the next epoch. */ bool m_terminate_training = false; diff --git a/include/lbann/execution_contexts/sgd_execution_context.hpp b/include/lbann/execution_contexts/sgd_execution_context.hpp index 255baa80035..478785739db 100644 --- a/include/lbann/execution_contexts/sgd_execution_context.hpp +++ b/include/lbann/execution_contexts/sgd_execution_context.hpp @@ -39,7 +39,7 @@ class sgd_termination_criteria : public termination_criteria { /** @brief SGD Uses the step to track the Current mini-batch step for * execution mode. - * @detailed Step counts are not reset after each epoch. + * @details Step counts are not reset after each epoch. */ class sgd_execution_context final : public execution_context { public: @@ -71,7 +71,7 @@ class sgd_execution_context final : public execution_context { inline size_t get_epoch() const noexcept { return m_epoch; } /** @brief Increment the current epoch in the execution context - * @detailed Increment the counter tracking the number of times + * @details Increment the counter tracking the number of times * that the data set has been traversed. 
*/ void inc_epoch() noexcept { ++m_epoch; } diff --git a/include/lbann/layers/learning/channelwise_scale_bias.hpp b/include/lbann/layers/learning/channelwise_scale_bias.hpp index fc93aa53ca9..3270340ee5d 100644 --- a/include/lbann/layers/learning/channelwise_scale_bias.hpp +++ b/include/lbann/layers/learning/channelwise_scale_bias.hpp @@ -48,7 +48,7 @@ namespace lbann { * The scale and bias vectors are fused into a single weights tensor * to reduce the number of gradient allreduces during backprop. In * particular, the weights tensor is a - * @f$ \text{num_channels} \times 2 @f$ matrix, where the first + * @f$ \text{num\_channels} \times 2 @f$ matrix, where the first * column correspond to scale terms and the second column to bias * terms. */ From 4ebe5b4c8256c498c6ce3645171d5e2e2e1d5236 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 20 Sep 2019 15:07:33 -0700 Subject: [PATCH 310/634] add some flavor text for new directories --- docs/RSTDocsFlavorText.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/RSTDocsFlavorText.py b/docs/RSTDocsFlavorText.py index 2f6b5969f08..ba58d7efc29 100644 --- a/docs/RSTDocsFlavorText.py +++ b/docs/RSTDocsFlavorText.py @@ -9,6 +9,7 @@ 'callbacks' : 'Callback Interface', 'data_readers' : 'Data Readers Interface', 'data_store' : 'Data Store Interface', + 'execution_contexts' : 'Execution Context Interface', 'layers' : 'Layer Interface', 'layers/activations' : 'Activation Layers', 'layers/image' : 'Image Layers', @@ -26,6 +27,9 @@ 'objective_functions/weight_regularization' : 'Objective Functions for Weight Regularization', 'optimizers' : 'Optimizer Interface', 'proto' : 'Protobuf and Front-End Utilities', + 'trainers' : 'Trainer Interface', + 'training_algorithms' : 'Training Algorithm Interface', + 'transforms' : 'Transform Interface', 'utils' : 'General Utilities', 'utils/threads' : 'Multithreading Utilities', 'weights' : 'Weights Interface' From 31f2c04776a56dc374ea611b137556bb3c4e14da Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Sat, 21 Sep 2019 17:12:46 -0700 Subject: [PATCH 311/634] add documentation of trainers, training algorithms, and execution contexts to sphinx directory pages --- docs/RSTDocsFlavorText.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/docs/RSTDocsFlavorText.py b/docs/RSTDocsFlavorText.py index ba58d7efc29..8e5146cfb78 100644 --- a/docs/RSTDocsFlavorText.py +++ b/docs/RSTDocsFlavorText.py @@ -56,6 +56,21 @@ The data store provides in-memory caching of the data set and inter-epoch data shuffling.''', + 'execution_contexts' : ''' +When a model is attached to a trainer the execution context of the +training algorithm is stored in an execution_context class (or +sub-class) per execution mode. Thus there is one execution context +per model and mode that contains all of the state with respect to the +training algorithm being applied to the model. + +For example it tracks the current: + +* step +* execution mode +* epoch +* and a pointer back to the trainer +''', + 'layers' : ''' LBANN models are defined in model prototext files. The bulk of these defintions will be the series of layers which make up the model @@ -107,6 +122,30 @@ python front end of LBANN will emit a network description in the protobuf format that is ingested at runtime.''', + 'trainers' : ''' +A trainer is a collection of compute resources and defines a explicit +communication domain. It provides the execution for both the training +and inference of a trained model. 
Once constructed a trainer owns an +LBANN comm object that defines both intra- and inter-trainer +communication domains. Additionally, a trainer will contain an I/O +thread pool that is used to fetch and pre-process data that will be +provided to the trainer's models. + +A trainer owns: + +* comm object +* I/O thread pool +* One or more models +* Execution context for each model +* In the future, it will also contain the data readers. +''', + + 'training_algorithms' : ''' +The training algorithm defines the optimization that is to be +applied to the model(s) being trained. Additionally, it can +specify how to evaluate the model. +''', + 'utils' : 'Utility classes and functions.', 'utils/threads' : 'TODO: Something about utils/threads', From 57334219639c77039981609a16c6fb31bb0729f2 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Sat, 21 Sep 2019 17:18:36 -0700 Subject: [PATCH 312/634] clean up language slightly --- docs/RSTDocsFlavorText.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/docs/RSTDocsFlavorText.py b/docs/RSTDocsFlavorText.py index 8e5146cfb78..96555316b14 100644 --- a/docs/RSTDocsFlavorText.py +++ b/docs/RSTDocsFlavorText.py @@ -57,10 +57,10 @@ inter-epoch data shuffling.''', 'execution_contexts' : ''' -When a model is attached to a trainer the execution context of the -training algorithm is stored in an execution_context class (or -sub-class) per execution mode. Thus there is one execution context -per model and mode that contains all of the state with respect to the +When a model is attached to a trainer, the execution context of the +training algorithm is stored in an `execution_context` (or sub-class) +object per execution mode. Thus there is one execution context per +model and mode that contains all of the state with respect to the training algorithm being applied to the model. For example it tracks the current: @@ -68,7 +68,7 @@ * step * execution mode * epoch -* and a pointer back to the trainer +* and a pointer back to the trainer. ''', 'layers' : ''' @@ -123,21 +123,22 @@ protobuf format that is ingested at runtime.''', 'trainers' : ''' -A trainer is a collection of compute resources and defines a explicit -communication domain. It provides the execution for both the training -and inference of a trained model. Once constructed a trainer owns an -LBANN comm object that defines both intra- and inter-trainer +A trainer is a collection of compute resources and defines an explicit +communication domain. It manages the execution for both the training +and inference of a trained model. Once constructed, a trainer owns an +`lbann_comm` object that defines both intra- and inter-trainer communication domains. Additionally, a trainer will contain an I/O -thread pool that is used to fetch and pre-process data that will be +thread pool that is used to fetch and preprocess data that will be provided to the trainer's models. A trainer owns: -* comm object -* I/O thread pool -* One or more models -* Execution context for each model -* In the future, it will also contain the data readers. +* `lbann_comm` object, +* I/O thread pool, +* One or more models, and +* Execution context for each model. + +In the future, it will also contain the data readers. ''', 'training_algorithms' : ''' From a0bc9bbf5425a507d0fd61d05af195dfba8723f5 Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Wed, 25 Sep 2019 10:20:29 -0700 Subject: [PATCH 313/634] fix LBANNConfig.cmake.in so projects can depend on LBANN --- cmake/configure_files/LBANNConfig.cmake.in | 121 ++++++++++++++++++++- 1 file changed, 116 insertions(+), 5 deletions(-) diff --git a/cmake/configure_files/LBANNConfig.cmake.in b/cmake/configure_files/LBANNConfig.cmake.in index e55515437d3..a395f2604f4 100644 --- a/cmake/configure_files/LBANNConfig.cmake.in +++ b/cmake/configure_files/LBANNConfig.cmake.in @@ -10,9 +10,11 @@ list(APPEND CMAKE_MODULE_PATH "@EXTRA_CMAKE_MODULE_DIR@") set(LBANN_VERSION ${PACKAGE_VERSION}) +set(LBANN_BUILD_TYPE "@CMAKE_BUILD_TYPE@") + # Record compiler information set(LBANN_CXX_COMPILER "@CMAKE_CXX_COMPILER@") -set(LBANN_CUDA_COMPILER "$@CMAKE_CUDA_COMPILER@") +set(LBANN_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@") set(LBANN_CXX_FLAGS "@CMAKE_CXX_FLAGS@") set(LBANN_CUDA_FLAGS "@CMAKE_CUDA_FLAGS@") @@ -28,6 +30,7 @@ if (CMAKE_CXX_STANDARD LESS LBANN_CXX_STANDARD) endif () set(CMAKE_CXX_STANDARD_REQUIRED TRUE) + # Record the various flags and switches accumlated in LBANN set(LBANN_ALUMINUM_MPI_PASSTHROUGH @LBANN_ALUMINUM_MPI_PASSTHROUGH@) set(LBANN_BUILT_WITH_SPECTRUM @LBANN_BUILT_WITH_SPECTRUM@) @@ -37,6 +40,7 @@ set(LBANN_GNU_LINUX @LBANN_GNU_LINUX@) set(LBANN_HAS_ALUMINUM @LBANN_HAS_ALUMINUM@) set(LBANN_HAS_CEREAL @LBANN_HAS_CEREAL@) set(LBANN_HAS_CNPY @LBANN_HAS_CNPY@) +set(LBANN_HAS_CONDUIT @LBANN_WITH_CONDUIT@) set(LBANN_HAS_CUDA @LBANN_HAS_CUDA@) set(LBANN_HAS_CUDNN @LBANN_HAS_CUDNN@) set(LBANN_HAS_DOXYGEN @LBANN_HAS_DOXYGEN@) @@ -45,28 +49,39 @@ set(LBANN_HAS_LBANN_PROTO @LBANN_HAS_LBANN_PROTO@) set(LBANN_HAS_OPENCV @LBANN_HAS_OPENCV@) set(LBANN_HAS_NCCL2 @LBANN_HAS_NCCL2@) set(LBANN_HAS_PROTOBUF @LBANN_HAS_PROTOBUF@) +set(LBANN_HAS_PYTHON @LBANN_HAS_PYTHON@) set(LBANN_HAS_TBINF @LBANN_HAS_TBINF@) set(LBANN_HAS_VTUNE @LBANN_HAS_VTUNE@) -set(LBANN_NO_OMP_FOR_DATA_READERS @LBANN_NO_OMP_FOR_DATA_READERS@) set(LBANN_NVPROF @LBANN_NVPROF@) set(LBANN_SEQUENTIAL_INITIALIZATION @LBANN_SEQUENTIAL_INITIALIZAION@) set(LBANN_TOPO_AWARE @LBANN_TOPO_AWARE@) # Setup dependencies +find_package(Threads REQUIRED) -# First, CEREAL. if (LBANN_HAS_CEREAL) - find_package(CEREAL NO_MODULE + find_package(CEREAL NO_MODULE QUIET HINTS ${CEREAL_DIR} $ENV{CEREAL_DIR} PATH_SUFFIXES share/cmake/cereal NO_DEFAULT_PATH) if (NOT CEREAL_FOUND) - find_package(CEREAL NO_MODULE) + find_package(CEREAL NO_MODULE QUIET) endif () if (NOT CEREAL_FOUND AND NOT CEREAL_DIR) set(CEREAL_DIR "@CEREAL_DIR@") find_package(CEREAL NO_MODULE REQUIRED) endif () + if (NOT CEREAL_FOUND) + message(FATAL_ERROR "Required dependency CEREAL not found.") + endif () +endif () + +if (NOT HWLOC_DIR) + set(HWLOC_DIR "@HWLOC_DIR@") +endif () +if (LBANN_TOPO_AWARE) + find_package(HWLOC REQUIRED) + set(LBANN_TOPO_AWARE ${HWLOC_FOUND}) endif () # Next, Hydrogen. We can probably inherit Aluminum-ness from @@ -191,6 +206,102 @@ if (LBANN_HAS_CUDA) include(SetupCUDAToolkit) endif (LBANN_HAS_CUDA) +set(_LBANN_CONDUIT_DIR "@Conduit_DIR@") +set(_LBANN_HDF5_DIR "@HDF5_DIR@") +if (LBANN_HAS_CONDUIT) + # Apparently we have to find HDF5, too. + find_package(HDF5 CONFIG QUIET + HINTS ${HDF5_DIR} $ENV{HDF5_DIR} ${_LBANN_HDF5_DIR} + PATH_SUFFIXES share/cmake/hdf5 + NO_DEFAULT_PATH) + if (NOT HDF5_FOUND) + find_package(HDF5 CONFIG QUIET) + endif () + if (NOT HDF5_FOUND) + enable_language(C) # WHY?????????????? 
+ find_package(HDF5 REQUIRED) + set(HDF5_FOUND_WITH_MODULE TRUE) + else () + message(STATUS "Found HDF5: ${HDF5_DIR}") + endif () + + find_package(Conduit CONFIG QUIET + HINTS ${Conduit_DIR} $ENV{Conduit_DIR} + ${CONDUIT_DIR} $ENV{CONDUIT_DIR} + ${_LBANN_CONDUIT_DIR} + PATH_SUFFIXES lib64/cmake lib/cmake + NO_DEFAULT_PATH) + if (NOT Conduit_FOUND) + find_package(Conduit CONFIG REQUIRED + PATH_SUFFIXES lib64/cmake lib/cmake) + endif () + message(STATUS "Found CONDUIT: ${Conduit_DIR}") + + # Ugh. I don't like that this requires intimate knowledge of + # specific targets that CONDUIT exports. It should support + # components. + if (NOT TARGET conduit_relay_mpi) + message(FATAL_ERROR "CONDUIT does not have proper MPI support.") + endif () + + if (NOT TARGET conduit OR NOT TARGET conduit_relay + OR NOT TARGET conduit_blueprint) + message(FATAL_ERROR "Missing some CONDUIT required library.") + endif () + + if (NOT TARGET conduit::conduit) + add_library(conduit::conduit INTERFACE IMPORTED) + endif () + + set(_conduit_interface_link_libs + "conduit;conduit_relay;conduit_relay_mpi;conduit_blueprint") + + # Remove -pthread from linkage, if found + foreach (_lib IN LISTS _conduit_interface_link_libs) + if (TARGET ${_lib}) + get_property(_tmp_interface_link_libs TARGET ${_lib} + PROPERTY INTERFACE_LINK_LIBRARIES) + + list(FIND _tmp_interface_link_libs "-pthread" _pthread_idx) + if (_pthread_idx GREATER_EQUAL 0) + list(REMOVE_AT _tmp_interface_link_libs ${_pthread_idx}) + + set_property(TARGET ${_lib} PROPERTY + INTERFACE_LINK_LIBRARIES ${_tmp_interface_link_libs}) + endif () + + get_property(_tmp_interface_compile_opts TARGET ${_lib} + PROPERTY INTERFACE_COMPILE_OPTIONS) + set_property(TARGET ${_lib} + PROPERTY INTERFACE_COMPILE_OPTIONS + $<$:${_tmp_interface_compile_opts}>) + endif () + endforeach () + + get_filename_component(_conduit_include_dirs + "${CONDUIT_INCLUDE_DIRS}" DIRECTORY) + + if (HDF5_FOUND_WITH_MODULE) + list(APPEND _conduit_interface_link_libs + ${HDF5_LIBRARIES}) + + list(APPEND _conduit_include_dirs + "${HDF5_INCLUDE_DIRS}") + endif () + + set_property(TARGET conduit::conduit + PROPERTY + INTERFACE_INCLUDE_DIRECTORIES + "${_conduit_include_dirs}") + + set_target_properties(conduit::conduit + PROPERTIES + INTERFACE_LINK_LIBRARIES + "${_conduit_interface_link_libs}") + + set(CONDUIT_LIBRARIES conduit::conduit) +endif (LBANN_HAS_CONDUIT) + @PACKAGE_INIT@ # Now actually import the LBANN target From dd9e94c1b723983b15b8d3144199f0d91568bd42 Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Wed, 25 Sep 2019 12:16:39 -0700 Subject: [PATCH 314/634] Clean up the layer instantiation (#1168) * add ETI for elu_layer * add eti for identity layer * update ELU layer to match the decided-upon format * ETI for all non-macro-created layers * ETI for all unary layers * add missing files * ETI for binary layers * ETI for input layer Best I could tell, `partitioned_io_buffer` was the only valid type for the first template parameter, so that's the only one I ETI'd. 
* remove bad file * remove unused file * remove reference to removed file --- .../lbann/layers/activations/activations.hpp | 39 +++++++++--- include/lbann/layers/activations/elu.hpp | 9 +++ include/lbann/layers/activations/identity.hpp | 13 ++++ .../lbann/layers/activations/leaky_relu.hpp | 13 ++++ .../lbann/layers/activations/log_softmax.hpp | 13 ++++ include/lbann/layers/activations/softmax.hpp | 13 ++++ .../lbann/layers/image/bilinear_resize.hpp | 13 +++- include/lbann/layers/io/input/input_layer.hpp | 20 +++++- .../layers/learning/base_convolution.hpp | 15 ++++- .../learning/channelwise_scale_bias.hpp | 15 ++++- include/lbann/layers/learning/convolution.hpp | 17 ++++-- .../lbann/layers/learning/deconvolution.hpp | 14 ++++- include/lbann/layers/learning/embedding.hpp | 13 ++-- .../layers/learning/entrywise_scale_bias.hpp | 13 ++++ .../lbann/layers/learning/fully_connected.hpp | 13 ++++ .../layers/loss/categorical_accuracy.hpp | 13 ++++ include/lbann/layers/loss/cross_entropy.hpp | 13 ++++ include/lbann/layers/loss/entrywise.hpp | 37 +++++++++-- include/lbann/layers/loss/l1_norm.hpp | 13 ++++ include/lbann/layers/loss/l2_norm2.hpp | 13 ++++ .../lbann/layers/loss/mean_absolute_error.hpp | 13 ++++ .../lbann/layers/loss/mean_squared_error.hpp | 13 ++++ .../loss/top_k_categorical_accuracy.hpp | 13 ++++ include/lbann/layers/math/binary.hpp | 33 +++++++++- include/lbann/layers/math/clamp.hpp | 13 ++++ include/lbann/layers/math/unary.hpp | 35 ++++++++++- include/lbann/layers/misc/argmax.hpp | 16 +++-- include/lbann/layers/misc/argmin.hpp | 15 +++-- .../lbann/layers/misc/channelwise_mean.hpp | 18 ++++-- include/lbann/layers/misc/covariance.hpp | 13 ++++ .../lbann/layers/misc/mini_batch_index.hpp | 16 ++++- include/lbann/layers/misc/mini_batch_size.hpp | 16 ++++- include/lbann/layers/misc/one_hot.hpp | 13 +++- include/lbann/layers/misc/variance.hpp | 13 ++++ .../regularizers/batch_normalization.hpp | 14 ++++- include/lbann/layers/regularizers/dropout.hpp | 12 +++- .../entrywise_batch_normalization.hpp | 13 ++++ .../local_response_normalization.hpp | 20 ++++-- .../layers/regularizers/selu_dropout.hpp | 14 +++++ include/lbann/layers/transform/bernoulli.hpp | 17 +++++- .../layers/transform/categorical_random.hpp | 19 ++++-- .../lbann/layers/transform/concatenation.hpp | 18 +++++- include/lbann/layers/transform/constant.hpp | 16 ++++- include/lbann/layers/transform/crop.hpp | 14 ++++- .../layers/transform/discrete_random.hpp | 16 +++-- include/lbann/layers/transform/dummy.hpp | 16 ++++- include/lbann/layers/transform/evaluation.hpp | 16 ++++- include/lbann/layers/transform/gaussian.hpp | 17 +++++- include/lbann/layers/transform/hadamard.hpp | 16 ++++- include/lbann/layers/transform/in_top_k.hpp | 16 ++++- include/lbann/layers/transform/pooling.hpp | 19 ++++-- include/lbann/layers/transform/reduction.hpp | 16 ++++- include/lbann/layers/transform/reshape.hpp | 13 ++++ include/lbann/layers/transform/slice.hpp | 13 +++- include/lbann/layers/transform/sort.hpp | 16 ++++- include/lbann/layers/transform/split.hpp | 12 +++- .../lbann/layers/transform/stop_gradient.hpp | 13 ++++ include/lbann/layers/transform/sum.hpp | 12 +++- include/lbann/layers/transform/tessellate.hpp | 16 ++++- include/lbann/layers/transform/uniform.hpp | 17 +++++- include/lbann/layers/transform/unpooling.hpp | 21 ++++--- .../lbann/layers/transform/weighted_sum.hpp | 16 ++++- include/lbann/layers/transform/weights.hpp | 17 +++++- include/lbann/utils/CMakeLists.txt | 1 + src/layers/CMakeLists.txt | 1 + 
src/layers/activations/CMakeLists.txt | 1 + src/layers/activations/activations.cpp | 17 +++--- src/layers/activations/activations.cu | 17 +++--- src/layers/activations/elu.cpp | 4 ++ src/layers/activations/elu.cu | 4 ++ src/layers/activations/identity.cpp | 39 ++++++++++++ src/layers/activations/leaky_relu.cpp | 6 ++ src/layers/activations/leaky_relu.cu | 6 ++ src/layers/activations/log_softmax.cpp | 6 ++ src/layers/activations/log_softmax.cu | 6 ++ src/layers/activations/softmax.cpp | 6 ++ src/layers/activations/softmax.cu | 6 ++ src/layers/image/bilinear_resize.cpp | 4 ++ src/layers/image/bilinear_resize.cu | 4 ++ src/layers/io/input/CMakeLists.txt | 7 +++ src/layers/io/input/input_layer.cpp | 43 +++++++++++++ src/layers/learning/CMakeLists.txt | 2 + .../learning/channelwise_scale_bias.cpp | 4 ++ src/layers/learning/channelwise_scale_bias.cu | 4 ++ src/layers/learning/convolution.cpp | 42 +++++++++++++ src/layers/learning/deconvolution.cpp | 40 ++++++++++++ src/layers/learning/embedding.cpp | 4 ++ src/layers/learning/entrywise_scale_bias.cpp | 6 ++ src/layers/learning/entrywise_scale_bias.cu | 6 ++ src/layers/learning/fully_connected.cpp | 13 ++++ src/layers/loss/categorical_accuracy.cpp | 6 ++ src/layers/loss/categorical_accuracy.cu | 6 ++ src/layers/loss/cross_entropy.cpp | 6 ++ src/layers/loss/cross_entropy.cu | 6 ++ src/layers/loss/entrywise.cpp | 16 +++-- src/layers/loss/entrywise.cu | 16 +++-- src/layers/loss/l1_norm.cpp | 6 ++ src/layers/loss/l1_norm.cu | 6 ++ src/layers/loss/l2_norm2.cpp | 6 ++ src/layers/loss/l2_norm2.cu | 6 ++ src/layers/loss/mean_absolute_error.cpp | 6 ++ src/layers/loss/mean_absolute_error.cu | 6 ++ src/layers/loss/mean_squared_error.cpp | 6 ++ src/layers/loss/mean_squared_error.cu | 6 ++ .../loss/top_k_categorical_accuracy.cpp | 6 ++ src/layers/loss/top_k_categorical_accuracy.cu | 6 ++ src/layers/math/binary.cpp | 43 +++++++------ src/layers/math/binary.cu | 43 +++++++------ src/layers/math/clamp.cpp | 6 ++ src/layers/math/clamp.cu | 6 ++ src/layers/math/unary.cpp | 61 ++++++++++--------- src/layers/math/unary.cu | 61 ++++++++++--------- src/layers/misc/CMakeLists.txt | 8 ++- src/layers/misc/argmax.cpp | 4 ++ src/layers/misc/argmin.cpp | 4 ++ src/layers/misc/channelwise_mean.cpp | 4 ++ src/layers/misc/channelwise_mean.cu | 4 ++ src/layers/misc/covariance.cpp | 6 ++ src/layers/misc/covariance.cu | 6 ++ src/layers/misc/mini_batch_index.cpp | 43 +++++++++++++ src/layers/misc/mini_batch_size.cpp | 43 +++++++++++++ src/layers/misc/one_hot.cpp | 4 ++ src/layers/misc/one_hot.cu | 4 ++ src/layers/misc/variance.cpp | 6 ++ src/layers/misc/variance.cu | 6 ++ src/layers/regularizers/CMakeLists.txt | 3 + .../regularizers/batch_normalization.cpp | 4 ++ .../regularizers/batch_normalization.cu | 4 ++ src/layers/regularizers/dropout.cpp | 39 ++++++++++++ .../entrywise_batch_normalization.cpp | 6 ++ .../entrywise_batch_normalization.cu | 6 ++ .../local_response_normalization.cpp | 39 ++++++++++++ src/layers/regularizers/selu_dropout.cpp | 39 ++++++++++++ src/layers/transform/CMakeLists.txt | 19 ++++++ src/layers/transform/bernoulli.cpp | 39 ++++++++++++ src/layers/transform/categorical_random.cpp | 35 +++++++++++ src/layers/transform/concatenation.cpp | 43 +++++++++++++ src/layers/transform/constant.cpp | 39 ++++++++++++ src/layers/transform/crop.cpp | 3 + src/layers/transform/crop.cu | 3 + src/layers/transform/discrete_random.cpp | 35 +++++++++++ src/layers/transform/dummy.cpp | 39 ++++++++++++ src/layers/transform/evaluation.cpp | 8 +++ src/layers/transform/gaussian.cpp | 
39 ++++++++++++ src/layers/transform/hadamard.cpp | 39 ++++++++++++ src/layers/transform/in_top_k.cpp | 4 ++ src/layers/transform/in_top_k.cu | 4 ++ src/layers/transform/pooling.cpp | 37 +++++++++++ src/layers/transform/reduction.cpp | 37 +++++++++++ src/layers/transform/reshape.cpp | 39 ++++++++++++ src/layers/transform/slice.cpp | 39 ++++++++++++ src/layers/transform/sort.cpp | 3 + src/layers/transform/sort.cu | 3 + src/layers/transform/split.cpp | 39 ++++++++++++ src/layers/transform/stop_gradient.cpp | 43 +++++++++++++ src/layers/transform/sum.cpp | 39 ++++++++++++ src/layers/transform/tessellate.cpp | 4 ++ src/layers/transform/tessellate.cu | 4 ++ src/layers/transform/uniform.cpp | 39 ++++++++++++ src/layers/transform/unpooling.cpp | 34 +++++++++++ src/layers/transform/weighted_sum.cpp | 43 +++++++++++++ src/layers/transform/weights.cpp | 39 ++++++++++++ src/proto/factories/layer_graph_factory.cpp | 2 +- 163 files changed, 2485 insertions(+), 242 deletions(-) create mode 100644 src/layers/activations/identity.cpp create mode 100644 src/layers/io/input/CMakeLists.txt create mode 100644 src/layers/io/input/input_layer.cpp create mode 100644 src/layers/learning/convolution.cpp create mode 100644 src/layers/learning/deconvolution.cpp create mode 100644 src/layers/misc/mini_batch_index.cpp create mode 100644 src/layers/misc/mini_batch_size.cpp create mode 100644 src/layers/regularizers/dropout.cpp create mode 100644 src/layers/regularizers/local_response_normalization.cpp create mode 100644 src/layers/regularizers/selu_dropout.cpp create mode 100644 src/layers/transform/bernoulli.cpp create mode 100644 src/layers/transform/categorical_random.cpp create mode 100644 src/layers/transform/concatenation.cpp create mode 100644 src/layers/transform/constant.cpp create mode 100644 src/layers/transform/discrete_random.cpp create mode 100644 src/layers/transform/dummy.cpp create mode 100644 src/layers/transform/gaussian.cpp create mode 100644 src/layers/transform/hadamard.cpp create mode 100644 src/layers/transform/pooling.cpp create mode 100644 src/layers/transform/reduction.cpp create mode 100644 src/layers/transform/reshape.cpp create mode 100644 src/layers/transform/slice.cpp create mode 100644 src/layers/transform/split.cpp create mode 100644 src/layers/transform/stop_gradient.cpp create mode 100644 src/layers/transform/sum.cpp create mode 100644 src/layers/transform/uniform.cpp create mode 100644 src/layers/transform/unpooling.cpp create mode 100644 src/layers/transform/weighted_sum.cpp create mode 100644 src/layers/transform/weights.cpp diff --git a/include/lbann/layers/activations/activations.hpp b/include/lbann/layers/activations/activations.hpp index b36c8d61072..65e79ae3acb 100644 --- a/include/lbann/layers/activations/activations.hpp +++ b/include/lbann/layers/activations/activations.hpp @@ -31,6 +31,27 @@ namespace lbann { +// Convenience macros for ETI decls for unary layers + +#ifndef LBANN_ACTIVATIONS_LAYER_INSTANTIATE +#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ + extern template class entrywise_unary_layer< \ + data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \ + extern template class entrywise_unary_layer< \ + data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct> +#else +#define UNARY_ETI_DECL_MACRO_DEV(...) 
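+// Sketch of the intent (reading of the guard above): when a source file
+// defines LBANN_ACTIVATIONS_LAYER_INSTANTIATE before including this header,
+// the variadic no-op definition above swallows its arguments, so no
+// `extern template` declarations are emitted and that translation unit can
+// provide the explicit instantiations itself.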
+#endif // LBANN_UNARY_LAYER_INSTANTIATE + +#ifdef LBANN_HAS_GPU +#define UNARY_ETI_DECL_MACRO(LAYER_NAME) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::CPU); \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::GPU) +#else +#define UNARY_ETI_DECL_MACRO(LAYER_NAME) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::CPU) +#endif // LBANN_HAS_GPU + // Convenience macro to define an entry-wise unary layer class #define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ struct layer_name##_name_struct { \ @@ -38,7 +59,8 @@ namespace lbann { }; \ template \ using layer_name \ - = entrywise_unary_layer; + = entrywise_unary_layer; \ + UNARY_ETI_DECL_MACRO(layer_name) /** @class lbann::log_sigmoid_layer * @brief Logarithm of sigmoid function. @@ -46,7 +68,7 @@ namespace lbann { * @f[ \log(\sigma(x)) = -\log(1 + e^{-x}) @f] * See https://en.wikipedia.org/wiki/Sigmoid_function. */ -DEFINE_ENTRYWISE_UNARY_LAYER(log_sigmoid_layer, "log sigmoid") +DEFINE_ENTRYWISE_UNARY_LAYER(log_sigmoid_layer, "log sigmoid"); /** @class lbann::relu_layer * @brief Rectified linear unit. @@ -54,7 +76,7 @@ DEFINE_ENTRYWISE_UNARY_LAYER(log_sigmoid_layer, "log sigmoid") * @f[ \text{ReLU}(x) = \text{max}(x, 0) @f] * See https://en.wikipedia.org/wiki/Rectifier_(neural_networks). */ -DEFINE_ENTRYWISE_UNARY_LAYER(relu_layer, "ReLU") +DEFINE_ENTRYWISE_UNARY_LAYER(relu_layer, "ReLU"); /** @class lbann::selu_layer * @brief Scaled exponential rectified linear unit. @@ -73,7 +95,7 @@ DEFINE_ENTRYWISE_UNARY_LAYER(relu_layer, "ReLU") * Hochreiter. "Self-normalizing neural networks." In Advances in * Neural Information Processing Systems, pp. 971-980. 2017. */ -DEFINE_ENTRYWISE_UNARY_LAYER(selu_layer, "SELU") +DEFINE_ENTRYWISE_UNARY_LAYER(selu_layer, "SELU"); /** @class lbann::sigmoid_layer * @brief Special case of logistic function. @@ -81,7 +103,7 @@ DEFINE_ENTRYWISE_UNARY_LAYER(selu_layer, "SELU") * @f[ \sigma(x) = \frac{1}{1 + e^{-x}} @f] * See https://en.wikipedia.org/wiki/Sigmoid_function. */ -DEFINE_ENTRYWISE_UNARY_LAYER(sigmoid_layer, "sigmoid") +DEFINE_ENTRYWISE_UNARY_LAYER(sigmoid_layer, "sigmoid"); // Sigmoid function output is strictly in (0,1) // Note: Output is in the range [eps,1-eps], where 'eps' is machine // epsilon. This avoids denormalized floats and helps mitigate some @@ -94,16 +116,19 @@ DEFINE_ENTRYWISE_UNARY_LAYER(sigmoid_layer, "sigmoid") * @f[ \text{softplus}(x) = \log (e^x + 1) @f] * See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) */ -DEFINE_ENTRYWISE_UNARY_LAYER(softplus_layer, "softplus") +DEFINE_ENTRYWISE_UNARY_LAYER(softplus_layer, "softplus"); /** @class lbann::softsign_layer * @brief Smooth approximation to sign function. 
 *
 * @f[ \text{softsign}(x) = \frac{x}{1 + |x|} @f]
 */
-DEFINE_ENTRYWISE_UNARY_LAYER(softsign_layer, "softsign")
+DEFINE_ENTRYWISE_UNARY_LAYER(softsign_layer, "softsign");

} // namespace lbann

 #undef DEFINE_ENTRYWISE_UNARY_LAYER
+#undef UNARY_ETI_DECL_MACRO
+#undef UNARY_ETI_DECL_MACRO_DEV
+
 #endif // LBANN_LAYERS_ACTIVATIONS_ACTIVATIONS_HPP_INCLUDED
diff --git a/include/lbann/layers/activations/elu.hpp b/include/lbann/layers/activations/elu.hpp
index 8ca94393c77..9d9ce7a9cb4 100644
--- a/include/lbann/layers/activations/elu.hpp
+++ b/include/lbann/layers/activations/elu.hpp
@@ -76,6 +76,15 @@ class elu_layer : public Layer {
 
 };
 
+#ifndef LBANN_ELU_LAYER_INSTANTIATE
+extern template class elu_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+extern template class elu_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+extern template class elu_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+extern template class elu_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+#endif // LBANN_ELU_LAYER_INSTANTIATE
+
 } // namespace lbann
 
 #endif // LBANN_LAYERS_ACTIVATIONS_ELU_HPP_INCLUDED
diff --git a/include/lbann/layers/activations/identity.hpp b/include/lbann/layers/activations/identity.hpp
index e895ba44b99..b7eeba766b7 100644
--- a/include/lbann/layers/activations/identity.hpp
+++ b/include/lbann/layers/activations/identity.hpp
@@ -59,6 +59,19 @@ class identity_layer : public Layer {
   void bp_compute() override {}
 };
 
+#ifndef LBANN_IDENTITY_LAYER_INSTANTIATE
+extern template class identity_layer<
+  data_layout::DATA_PARALLEL, El::Device::CPU>;
+extern template class identity_layer<
+  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+extern template class identity_layer<
+  data_layout::DATA_PARALLEL, El::Device::GPU>;
+extern template class identity_layer<
+  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+#endif // LBANN_IDENTITY_LAYER_INSTANTIATE
+
 } // namespace lbann
 
 #endif // LBANN_LAYERS_ACTIVATIONS_IDENTITY_HPP_INCLUDED
diff --git a/include/lbann/layers/activations/leaky_relu.hpp b/include/lbann/layers/activations/leaky_relu.hpp
index 0c1f7d8f852..ad55718a7fa 100644
--- a/include/lbann/layers/activations/leaky_relu.hpp
+++ b/include/lbann/layers/activations/leaky_relu.hpp
@@ -76,6 +76,19 @@ class leaky_relu_layer : public Layer {
 
 };
 
+#ifndef LBANN_LEAKY_RELU_LAYER_INSTANTIATE
+extern template class leaky_relu_layer<
+  data_layout::DATA_PARALLEL, El::Device::CPU>;
+extern template class leaky_relu_layer<
+  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+extern template class leaky_relu_layer<
+  data_layout::DATA_PARALLEL, El::Device::GPU>;
+extern template class leaky_relu_layer<
+  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+#endif // LBANN_LEAKY_RELU_LAYER_INSTANTIATE
+
 } // namespace lbann
 
 #endif // LBANN_LAYERS_ACTIVATIONS_LEAKY_RELU_HPP_INCLUDED
diff --git a/include/lbann/layers/activations/log_softmax.hpp b/include/lbann/layers/activations/log_softmax.hpp
index 136edf89600..fa4e1c48582 100644
--- a/include/lbann/layers/activations/log_softmax.hpp
+++ b/include/lbann/layers/activations/log_softmax.hpp
@@ -118,6 +118,19 @@ class log_softmax_layer : public Layer {
 
 };
 
+#ifndef LBANN_LOG_SOFTMAX_LAYER_INSTANTIATE
+extern template class log_softmax_layer<
+  data_layout::DATA_PARALLEL, El::Device::CPU>;
+extern template class log_softmax_layer<
+  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+extern template class log_softmax_layer<
+  data_layout::DATA_PARALLEL, El::Device::GPU>;
+extern template class log_softmax_layer<
+  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
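[Review note] Each LBANN_*_LAYER_INSTANTIATE guard above pairs with a source
file under src/layers/ (several are newly created by this patch, per the
diffstat). Presumably each such file defines its guard macro and then spells
out the instantiation definitions. A sketch of what such a file would contain
under that assumption (not the literal contents of any file in this patch):

    #define LBANN_ELU_LAYER_INSTANTIATE
    #include "lbann/layers/activations/elu.hpp"

    namespace lbann {
    // Matching definitions for the extern declarations in the header.
    template class elu_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
    template class elu_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
    } // namespace lbann

GPU variants would sit behind LBANN_HAS_GPU in the same way as the
declarations, typically in a companion .cu file.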
+#endif // LBANN_LOG_SOFTMAX_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_LOG_SOFTMAX_HPP_INCLUDED diff --git a/include/lbann/layers/activations/softmax.hpp b/include/lbann/layers/activations/softmax.hpp index 665323c3c14..6a43b3e0e91 100644 --- a/include/lbann/layers/activations/softmax.hpp +++ b/include/lbann/layers/activations/softmax.hpp @@ -125,6 +125,19 @@ class softmax_layer : public Layer { }; +#ifndef LBANN_SOFTMAX_LAYER_INSTANTIATE +extern template class softmax_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class softmax_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class softmax_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class softmax_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_SOFTMAX_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_ACTIVATIONS_SOFTMAX_HPP_INCLUDED diff --git a/include/lbann/layers/image/bilinear_resize.hpp b/include/lbann/layers/image/bilinear_resize.hpp index 2e3e9e9da67..1de21e3dc16 100644 --- a/include/lbann/layers/image/bilinear_resize.hpp +++ b/include/lbann/layers/image/bilinear_resize.hpp @@ -38,12 +38,12 @@ namespace lbann { */ template class bilinear_resize_layer : public Layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "bilinear_resize_layer only supports DATA_PARALLEL"); public: bilinear_resize_layer(lbann_comm *comm, El::Int height, El::Int width) : Layer(comm), m_height(height), m_width(width) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "bilinear_resize_layer only supports DATA_PARALLEL"); } bilinear_resize_layer* copy() const override { @@ -106,6 +106,15 @@ class bilinear_resize_layer : public Layer { }; +#ifndef LBANN_BILINEAR_RESIZE_LAYER_INSTANTIATE +extern template class bilinear_resize_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class bilinear_resize_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_BILINEAR_RESIZE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED diff --git a/include/lbann/layers/io/input/input_layer.hpp b/include/lbann/layers/io/input/input_layer.hpp index e2c144684b3..1f85202fd73 100644 --- a/include/lbann/layers/io/input/input_layer.hpp +++ b/include/lbann/layers/io/input/input_layer.hpp @@ -37,9 +37,10 @@ namespace lbann { -template - /** @brief Interface with data reader. 
*/ +template class input_layer : public generic_input_layer { public: @@ -99,6 +100,19 @@ template<> inline void input_layer::validate_data_layout() {} #endif // LBANN_HAS_GPU -} +#ifndef LBANN_INPUT_LAYER_INSTANTIATE +extern template class input_layer< + partitioned_io_buffer, data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class input_layer< + partitioned_io_buffer, data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class input_layer< + partitioned_io_buffer, data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class input_layer< + partitioned_io_buffer, data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_INPUT_LAYER_INSTANTIATE + +} // namespace lbann #endif // LBANN_LAYERS_INPUT_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp index b32abf9fca5..824b469f18f 100644 --- a/include/lbann/layers/learning/base_convolution.hpp +++ b/include/lbann/layers/learning/base_convolution.hpp @@ -27,9 +27,9 @@ #ifndef LBANN_LAYERS_LEARNING_BASE_CONVOLUTION_HPP_INCLUDED #define LBANN_LAYERS_LEARNING_BASE_CONVOLUTION_HPP_INCLUDED -#include -#include +#include "lbann/execution_contexts/sgd_execution_context.hpp" #include "lbann/layers/layer.hpp" +#include "lbann/models/model.hpp" #include "lbann/weights/initializer.hpp" #include "lbann/weights/variance_scaling_initializers.hpp" #include "lbann/utils/cudnn.hpp" @@ -37,7 +37,9 @@ #include "lbann/utils/random.hpp" #include "lbann/utils/timer.hpp" #include "lbann/utils/im2col.hpp" -#include "lbann/execution_contexts/sgd_execution_context.hpp" + +#include +#include namespace lbann { @@ -1218,6 +1220,13 @@ class base_convolution_layer : public Layer { }; +#ifndef LBANN_BASE_CONVOLUTION_LAYER_INSTANTIATE +extern template class base_convolution_layer; +#ifdef LBANN_HAS_GPU +extern template class base_convolution_layer; +#endif // LBANN_HAS_GPU +#endif // LBANN_BASE_CONVOLUTION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LEARNING_BASE_CONVOLUTION_HPP_INCLUDED diff --git a/include/lbann/layers/learning/channelwise_scale_bias.hpp b/include/lbann/layers/learning/channelwise_scale_bias.hpp index 3270340ee5d..02eb1c74694 100644 --- a/include/lbann/layers/learning/channelwise_scale_bias.hpp +++ b/include/lbann/layers/learning/channelwise_scale_bias.hpp @@ -55,13 +55,13 @@ namespace lbann { template class channelwise_scale_bias_layer : public Layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "channelwise_mean_layer only supports " + "data-parallel data layout"); public: channelwise_scale_bias_layer(lbann_comm *comm) : Layer(comm) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "channelwise_mean_layer only supports " - "data-parallel data layout"); } channelwise_scale_bias_layer(const channelwise_scale_bias_layer& other) @@ -138,6 +138,15 @@ class channelwise_scale_bias_layer : public Layer { }; +#ifndef LBANN_CHANNELWISE_SCALE_BIAS_LAYER_INSTANTIATE +extern template class channelwise_scale_bias_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class channelwise_scale_bias_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_CHANNELWISE_SCALE_BIAS_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED diff --git a/include/lbann/layers/learning/convolution.hpp b/include/lbann/layers/learning/convolution.hpp 
index 1b7f0c23e12..155ddfde3b9 100644 --- a/include/lbann/layers/learning/convolution.hpp +++ b/include/lbann/layers/learning/convolution.hpp @@ -43,8 +43,11 @@ class imcomm; * tensors. This is primarily optimized for image data in NCHW * format. */ -template +template class convolution_layer : public base_convolution_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "convolution layer only supports DATA_PARALLEL"); private: friend class callback::imcomm; @@ -89,9 +92,6 @@ class convolution_layer : public base_convolution_layer { std::move(dilations), groups, has_bias) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "convolution layer only supports DATA_PARALLEL"); - } convolution_layer* copy() const override { return new convolution_layer(*this); } @@ -160,6 +160,15 @@ class convolution_layer : public base_convolution_layer { }; +#ifndef LBANN_CONVOLUTION_LAYER_INSTANTIATE +extern template class convolution_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class convolution_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_CONVOLUTION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LEARNING_CONVOLUTION_HPP_INCLUDED diff --git a/include/lbann/layers/learning/deconvolution.hpp b/include/lbann/layers/learning/deconvolution.hpp index 19e98369e50..2cb5c93391b 100644 --- a/include/lbann/layers/learning/deconvolution.hpp +++ b/include/lbann/layers/learning/deconvolution.hpp @@ -40,6 +40,8 @@ class imcomm; /** @brief Transpose of the convolution layer. */ template class deconvolution_layer : public base_convolution_layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "deconvolution layer only supports DATA_PARALLEL"); private: friend class callback::imcomm; @@ -84,9 +86,6 @@ class deconvolution_layer : public base_convolution_layer { std::move(dilations), groups, has_bias) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "deconvolution layer only supports DATA_PARALLEL"); - } deconvolution_layer* copy() const override { return new deconvolution_layer(*this); } @@ -176,6 +175,15 @@ class deconvolution_layer : public base_convolution_layer { }; +#ifndef LBANN_DECONVOLUTION_LAYER_INSTANTIATE +extern template class deconvolution_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class deconvolution_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_DECONVOLUTION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LEARNING_DECONVOLUTION_HPP_INCLUDED diff --git a/include/lbann/layers/learning/embedding.hpp b/include/lbann/layers/learning/embedding.hpp index 9e5d5c697f0..7e9af4d0062 100644 --- a/include/lbann/layers/learning/embedding.hpp +++ b/include/lbann/layers/learning/embedding.hpp @@ -33,6 +33,10 @@ namespace lbann { template class embedding_layer : public Layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "embedding layer only supports data parallel layout"); + static_assert(Device == El::Device::CPU, + "embedding layer only supports CPU"); public: embedding_layer(lbann_comm* comm, @@ -41,10 +45,6 @@ class embedding_layer : public Layer { : Layer(comm), m_dictionary_size{dictionary_size}, m_embedding_size{embedding_size} { - static_assert(Layout == data_layout::DATA_PARALLEL, - "embedding layer only supports data parallel layout"); - static_assert(Device == El::Device::CPU, - "embedding layer only supports CPU"); } 
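[Review note] A pattern repeated throughout this patch: the layout/device
static_asserts move from constructor bodies (the deleted lines above) to class
scope. A class-scope assertion is checked whenever the class template is
instantiated, which now happens eagerly through the explicit instantiations,
rather than only when the constructor body itself gets instantiated. A toy
illustration (hypothetical names, not from this patch):

    enum class Device { CPU, GPU };

    template <Device Dev>
    class cpu_only_thing {
      // Checked for every instantiation of cpu_only_thing<Dev>, including
      // an explicit "template class cpu_only_thing<Device::CPU>;".
      static_assert(Dev == Device::CPU, "cpu_only_thing supports CPU only");
    public:
      cpu_only_thing() {
        // An assertion placed here would only be checked if this
        // constructor were itself instantiated.
      }
    };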
embedding_layer(const embedding_layer& other) = default; @@ -83,6 +83,11 @@ class embedding_layer : public Layer { }; +#ifndef LBANN_EMBEDDING_LAYER_INSTANTIATE +extern template class embedding_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#endif // LBANN_EMBEDDING_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED diff --git a/include/lbann/layers/learning/entrywise_scale_bias.hpp b/include/lbann/layers/learning/entrywise_scale_bias.hpp index 0b313e68bc9..e7ff19a1bfb 100644 --- a/include/lbann/layers/learning/entrywise_scale_bias.hpp +++ b/include/lbann/layers/learning/entrywise_scale_bias.hpp @@ -173,6 +173,19 @@ class entrywise_scale_bias_layer : public Layer { }; +#ifndef LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE +extern template class entrywise_scale_bias_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class entrywise_scale_bias_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class entrywise_scale_bias_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class entrywise_scale_bias_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED diff --git a/include/lbann/layers/learning/fully_connected.hpp b/include/lbann/layers/learning/fully_connected.hpp index b7e7bcc2b29..a3573397a21 100644 --- a/include/lbann/layers/learning/fully_connected.hpp +++ b/include/lbann/layers/learning/fully_connected.hpp @@ -242,6 +242,19 @@ class fully_connected_layer : public learning_layer { }; +#ifndef LBANN_FULLY_CONNECTED_LAYER_INSTANTIATE +extern template class fully_connected_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class fully_connected_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class fully_connected_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class fully_connected_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_FULLY_CONNECTED_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LEARNING_FULLY_CONNECTED_HPP_INCLUDED diff --git a/include/lbann/layers/loss/categorical_accuracy.hpp b/include/lbann/layers/loss/categorical_accuracy.hpp index 078abb6b2a4..12acae52184 100644 --- a/include/lbann/layers/loss/categorical_accuracy.hpp +++ b/include/lbann/layers/loss/categorical_accuracy.hpp @@ -85,6 +85,19 @@ class categorical_accuracy_layer : public Layer { }; +#ifndef LBANN_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE +extern template class categorical_accuracy_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class categorical_accuracy_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class categorical_accuracy_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class categorical_accuracy_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED diff --git a/include/lbann/layers/loss/cross_entropy.hpp b/include/lbann/layers/loss/cross_entropy.hpp index e2ee89e4350..96a7b27f66e 100644 --- a/include/lbann/layers/loss/cross_entropy.hpp +++ b/include/lbann/layers/loss/cross_entropy.hpp @@ 
-164,6 +164,19 @@ class cross_entropy_layer : public Layer {
 
 };
 
+#ifndef LBANN_CROSS_ENTROPY_LAYER_INSTANTIATE
+extern template class cross_entropy_layer<
+  data_layout::DATA_PARALLEL, El::Device::CPU>;
+extern template class cross_entropy_layer<
+  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+extern template class cross_entropy_layer<
+  data_layout::DATA_PARALLEL, El::Device::GPU>;
+extern template class cross_entropy_layer<
+  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+#endif // LBANN_CROSS_ENTROPY_LAYER_INSTANTIATE
+
 } // namespace lbann
 
 #endif // LBANN_LAYERS_LOSS_CROSS_ENTROPY_HPP_INCLUDED
diff --git a/include/lbann/layers/loss/entrywise.hpp b/include/lbann/layers/loss/entrywise.hpp
index 6e55f58313e..f4fc640869d 100644
--- a/include/lbann/layers/loss/entrywise.hpp
+++ b/include/lbann/layers/loss/entrywise.hpp
@@ -31,6 +31,25 @@
 
 namespace lbann {
 
+#ifndef LBANN_ENTRYWISE_LAYER_INSTANTIATE
+#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \
+  extern template class entrywise_binary_layer< \
+    data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \
+  extern template class entrywise_binary_layer< \
+    data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct>
+#else
+#define BINARY_ETI_DECL_MACRO_DEV(...)
+#endif // LBANN_ENTRYWISE_LAYER_INSTANTIATE
+
+#ifdef LBANN_HAS_GPU
+#define BINARY_ETI_DECL_MACRO(LAYER_NAME) \
+  BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::CPU); \
+  BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::GPU)
+#else
+#define BINARY_ETI_DECL_MACRO(LAYER_NAME) \
+  BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::CPU)
+#endif // LBANN_HAS_GPU
+
 // Convenience macro to define an entry-wise binary layer class
 #define DEFINE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string) \
   struct layer_name##_name_struct { \
@@ -38,18 +57,26 @@ namespace lbann {
   }; \
   template <data_layout Layout, El::Device Device> \
   using layer_name \
-    = entrywise_binary_layer<Layout, Device, layer_name##_name_struct>;
+    = entrywise_binary_layer<Layout, Device, layer_name##_name_struct>; \
+  BINARY_ETI_DECL_MACRO(layer_name)
 
 // Cross entropy loss
-DEFINE_ENTRYWISE_BINARY_LAYER(binary_cross_entropy_layer, "binary cross entropy");
-DEFINE_ENTRYWISE_BINARY_LAYER(sigmoid_binary_cross_entropy_layer, "sigmoid binary cross entropy");
+DEFINE_ENTRYWISE_BINARY_LAYER(binary_cross_entropy_layer,
+                              "binary cross entropy");
+DEFINE_ENTRYWISE_BINARY_LAYER(sigmoid_binary_cross_entropy_layer,
+                              "sigmoid binary cross entropy");
 
 // Boolean loss functions
 DEFINE_ENTRYWISE_BINARY_LAYER(boolean_accuracy_layer, "Boolean accuracy");
-DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_negative_layer, "Boolean false negative rate");
-DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_positive_layer, "Boolean false positive rate");
+DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_negative_layer,
+                              "Boolean false negative rate");
+DEFINE_ENTRYWISE_BINARY_LAYER(boolean_false_positive_layer,
+                              "Boolean false positive rate");
 
 } // namespace lbann
 
 #undef DEFINE_ENTRYWISE_BINARY_LAYER
+#undef BINARY_ETI_DECL_MACRO
+#undef BINARY_ETI_DECL_MACRO_DEV
+
 #endif // LBANN_LAYERS_LOSS_ENTRYWISE_HPP_INCLUDED
diff --git a/include/lbann/layers/loss/l1_norm.hpp b/include/lbann/layers/loss/l1_norm.hpp
index 8ceb88c09c3..687a8bcb3a1 100644
--- a/include/lbann/layers/loss/l1_norm.hpp
+++ b/include/lbann/layers/loss/l1_norm.hpp
@@ -128,6 +128,19 @@ class l1_norm_layer : public Layer {
 
 };
 
+#ifndef LBANN_L1_NORM_LAYER_INSTANTIATE
+extern template class l1_norm_layer<
+  data_layout::DATA_PARALLEL, El::Device::CPU>;
+extern template class l1_norm_layer<
+  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+extern template class l1_norm_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class l1_norm_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_L1_NORM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_L1_NORM_HPP_INCLUDED diff --git a/include/lbann/layers/loss/l2_norm2.hpp b/include/lbann/layers/loss/l2_norm2.hpp index 15ad24adbd0..c49822f3711 100644 --- a/include/lbann/layers/loss/l2_norm2.hpp +++ b/include/lbann/layers/loss/l2_norm2.hpp @@ -128,6 +128,19 @@ class l2_norm2_layer : public Layer { }; +#ifndef LBANN_L2_NORM2_LAYER_INSTANTIATE +extern template class l2_norm2_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class l2_norm2_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class l2_norm2_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class l2_norm2_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_L2_NORM2_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED diff --git a/include/lbann/layers/loss/mean_absolute_error.hpp b/include/lbann/layers/loss/mean_absolute_error.hpp index c136f1f6c72..34acbd9c04e 100644 --- a/include/lbann/layers/loss/mean_absolute_error.hpp +++ b/include/lbann/layers/loss/mean_absolute_error.hpp @@ -173,6 +173,19 @@ class mean_absolute_error_layer : public Layer { }; +#ifndef LBANN_MEAN_ABSOLUTE_ERROR_LAYER_INSTANTIATE +extern template class mean_absolute_error_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class mean_absolute_error_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class mean_absolute_error_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class mean_absolute_error_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_MEAN_ABSOLUTE_ERROR_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_MEAN_ABSOLUTE_ERROR_HPP_INCLUDED diff --git a/include/lbann/layers/loss/mean_squared_error.hpp b/include/lbann/layers/loss/mean_squared_error.hpp index 19ead85c346..0ee52441d83 100644 --- a/include/lbann/layers/loss/mean_squared_error.hpp +++ b/include/lbann/layers/loss/mean_squared_error.hpp @@ -173,6 +173,19 @@ class mean_squared_error_layer : public Layer { }; +#ifndef LBANN_MEAN_SQUARED_ERROR_LAYER_INSTANTIATE +extern template class mean_squared_error_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class mean_squared_error_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class mean_squared_error_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class mean_squared_error_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_MEAN_SQUARED_ERROR_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_MEAN_SQUARED_ERROR_HPP_INCLUDED diff --git a/include/lbann/layers/loss/top_k_categorical_accuracy.hpp b/include/lbann/layers/loss/top_k_categorical_accuracy.hpp index 4442419d4de..855ab34607c 100644 --- a/include/lbann/layers/loss/top_k_categorical_accuracy.hpp +++ b/include/lbann/layers/loss/top_k_categorical_accuracy.hpp @@ -99,6 +99,19 @@ class top_k_categorical_accuracy_layer : public Layer { }; +#ifndef LBANN_TOP_K_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE +extern template class 
top_k_categorical_accuracy_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class top_k_categorical_accuracy_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class top_k_categorical_accuracy_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class top_k_categorical_accuracy_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_TOP_K_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_LOSS_TOP_K_CATEGORICAL_ACCURACY_HPP_INCLUDED diff --git a/include/lbann/layers/math/binary.hpp b/include/lbann/layers/math/binary.hpp index d389ccbaae8..1b7d5cadbd5 100644 --- a/include/lbann/layers/math/binary.hpp +++ b/include/lbann/layers/math/binary.hpp @@ -81,6 +81,33 @@ class entrywise_binary_layer : public Layer { }; +// Convenience macros for ETI decls for binary layers + +#ifndef LBANN_BINARY_LAYER_INSTANTIATE +#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ + extern template class entrywise_binary_layer< \ + data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \ + extern template class entrywise_binary_layer< \ + data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct> +#else +#define BINARY_ETI_DECL_MACRO_DEV(...) +#endif // LBANN_BINARY_LAYER_INSTANTIATE + +#define BINARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ + template class entrywise_binary_layer< \ + data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \ + template class entrywise_binary_layer< \ + data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct> + +#ifdef LBANN_HAS_GPU +#define BINARY_ETI_DECL_MACRO(LAYER_NAME) \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::CPU); \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::GPU) +#else +#define BINARY_ETI_DECL_MACRO(LAYER_NAME) \ + BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::CPU) +#endif // LBANN_HAS_GPU + // Convenience macro to define an entry-wise binary layer class #define DEFINE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string) \ struct layer_name##_name_struct { \ @@ -88,7 +115,8 @@ class entrywise_binary_layer : public Layer { }; \ template \ using layer_name \ - = entrywise_binary_layer; + = entrywise_binary_layer; \ + BINARY_ETI_DECL_MACRO(layer_name) // Arithmetic operations DEFINE_ENTRYWISE_BINARY_LAYER(add_layer, "add"); @@ -118,4 +146,7 @@ DEFINE_ENTRYWISE_BINARY_LAYER(logical_xor_layer, "logical xor"); } // namespace lbann #undef DEFINE_ENTRYWISE_BINARY_LAYER +#undef BINARY_ETI_DECL_MACRO +#undef BINARY_ETI_DECL_MACRO_DEV + #endif // LBANN_LAYERS_MATH_BINARY_HPP_INCLUDED diff --git a/include/lbann/layers/math/clamp.hpp b/include/lbann/layers/math/clamp.hpp index 0d7a2264369..6a36143d73f 100644 --- a/include/lbann/layers/math/clamp.hpp +++ b/include/lbann/layers/math/clamp.hpp @@ -82,6 +82,19 @@ class clamp_layer : public Layer { }; +#ifndef LBANN_CLAMP_LAYER_INSTANTIATE +extern template class clamp_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class clamp_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class clamp_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class clamp_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_CLAMP_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MATH_CLAMP_HPP_INCLUDED diff --git a/include/lbann/layers/math/unary.hpp b/include/lbann/layers/math/unary.hpp index 
73034b0593f..f73d61c7977 100644 --- a/include/lbann/layers/math/unary.hpp +++ b/include/lbann/layers/math/unary.hpp @@ -55,14 +55,42 @@ class entrywise_unary_layer : public Layer { void bp_compute() override; }; +// Convenience macros for ETI decls for unary layers + +#ifndef LBANN_UNARY_LAYER_INSTANTIATE +#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ + extern template class entrywise_unary_layer< \ + data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \ + extern template class entrywise_unary_layer< \ + data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct> +#else +#define UNARY_ETI_DECL_MACRO_DEV(...) +#endif // LBANN_UNARY_LAYER_INSTANTIATE + +#define UNARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ + template class entrywise_unary_layer< \ + data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \ + template class entrywise_unary_layer< \ + data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct> + +#ifdef LBANN_HAS_GPU +#define UNARY_ETI_DECL_MACRO(LAYER_NAME) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::CPU); \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::GPU) +#else +#define UNARY_ETI_DECL_MACRO(LAYER_NAME) \ + UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, El::Device::CPU) +#endif // LBANN_HAS_GPU + // Convenience macro to define an entry-wise unary layer class -#define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ +#define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ struct layer_name##_name_struct { \ inline operator std::string() { return layer_string; } \ }; \ template \ using layer_name \ - = entrywise_unary_layer; + = entrywise_unary_layer; \ + UNARY_ETI_DECL_MACRO(layer_name) // Logical operations DEFINE_ENTRYWISE_UNARY_LAYER(logical_not_layer, "logical not"); @@ -109,4 +137,7 @@ DEFINE_ENTRYWISE_UNARY_LAYER(atanh_layer, "hyperbolic arctangent"); } // namespace lbann #undef DEFINE_ENTRYWISE_UNARY_LAYER +#undef UNARY_ETI_DECL_MACRO +#undef UNARY_ETI_DECL_MACRO_DEV + #endif // LBANN_LAYERS_MATH_UNARY_HPP_INCLUDED diff --git a/include/lbann/layers/misc/argmax.hpp b/include/lbann/layers/misc/argmax.hpp index 2627396a038..524e9feeae8 100644 --- a/include/lbann/layers/misc/argmax.hpp +++ b/include/lbann/layers/misc/argmax.hpp @@ -38,14 +38,13 @@ namespace lbann { */ template class argmax_layer : public Layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "argmax layer only supports data parallel layout"); + static_assert(Device == El::Device::CPU, + "argmax layer only supports CPU"); public: - argmax_layer(lbann_comm* comm) : Layer(comm) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "argmax layer only supports data parallel layout"); - static_assert(Device == El::Device::CPU, - "argmax layer only supports CPU"); - } + argmax_layer(lbann_comm* comm) : Layer(comm) { } argmax_layer* copy() const override { return new argmax_layer(*this); } std::string get_type() const override { return "argmax"; } data_layout get_data_layout() const override { return Layout; } @@ -72,6 +71,11 @@ class argmax_layer : public Layer { }; +#ifndef LBANN_ARGMAX_LAYER_INSTANTIATE +extern template class argmax_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#endif // LBANN_ARGMAX_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED diff --git a/include/lbann/layers/misc/argmin.hpp b/include/lbann/layers/misc/argmin.hpp index 844e3637b35..c05ddecd08b 100644 --- a/include/lbann/layers/misc/argmin.hpp +++ b/include/lbann/layers/misc/argmin.hpp @@ -38,14 +38,13 @@ namespace lbann 
{ */ template class argmin_layer : public Layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "argmin layer only supports data parallel layout"); + static_assert(Device == El::Device::CPU, + "argmin layer only supports CPU"); public: - argmin_layer(lbann_comm* comm) : Layer(comm) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "argmin layer only supports data parallel layout"); - static_assert(Device == El::Device::CPU, - "argmin layer only supports CPU"); - } + argmin_layer(lbann_comm* comm) : Layer(comm) { } argmin_layer* copy() const override { return new argmin_layer(*this); } std::string get_type() const override { return "argmin"; } data_layout get_data_layout() const override { return Layout; } @@ -72,6 +71,10 @@ class argmin_layer : public Layer { }; +#ifndef LBANN_ARGMIN_LAYER_INSTANTIATE +extern template class argmin_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#endif // LBANN_ARGMIN_LAYER_INSTANTIATE } // namespace lbann #endif // LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED diff --git a/include/lbann/layers/misc/channelwise_mean.hpp b/include/lbann/layers/misc/channelwise_mean.hpp index 5889b853256..8aea533dfc1 100644 --- a/include/lbann/layers/misc/channelwise_mean.hpp +++ b/include/lbann/layers/misc/channelwise_mean.hpp @@ -32,15 +32,16 @@ namespace lbann { /** @todo Replace with more general reduction layer. */ -template +template class channelwise_mean_layer : public Layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "channelwise_mean_layer only supports " + "data-parallel data layout"); public: channelwise_mean_layer(lbann_comm *comm) : Layer(comm) { - static_assert(Layout == data_layout::DATA_PARALLEL, - "channelwise_mean_layer only supports " - "data-parallel data layout"); if (comm->am_trainer_master()) { LBANN_WARNING("channelwise_mean_layer is experimental " "and may be deprecated at any time"); @@ -65,6 +66,15 @@ class channelwise_mean_layer : public Layer { }; +#ifndef LBANN_CHANNELWISE_MEAN_LAYER_INSTANTIATE +extern template class channelwise_mean_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class channelwise_mean_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_CHANNELWISE_MEAN_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_CHANNELWISE_MEAN_HPP_INCLUDED diff --git a/include/lbann/layers/misc/covariance.hpp b/include/lbann/layers/misc/covariance.hpp index 23390b2eead..57c3d304b38 100644 --- a/include/lbann/layers/misc/covariance.hpp +++ b/include/lbann/layers/misc/covariance.hpp @@ -123,6 +123,19 @@ class covariance_layer : public Layer { }; +#ifndef LBANN_COVARIANCE_LAYER_INSTANTIATE +extern template class covariance_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class covariance_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class covariance_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class covariance_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_COVARIANCE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED diff --git a/include/lbann/layers/misc/mini_batch_index.hpp b/include/lbann/layers/misc/mini_batch_index.hpp index 51538000dce..60176e87851 100644 --- a/include/lbann/layers/misc/mini_batch_index.hpp +++ b/include/lbann/layers/misc/mini_batch_index.hpp @@ -37,7 +37,8 @@ namespace lbann { * mini-batch sample. 
Each sample in a model's mini-batch has a * unique index in [0, mini_batch_size). */ -template +template class mini_batch_index_layer : public Layer { public: @@ -87,6 +88,19 @@ class mini_batch_index_layer : public Layer { }; +#ifndef LBANN_MINI_BATCH_INDEX_LAYER_INSTANTIATE +extern template class mini_batch_index_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class mini_batch_index_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class mini_batch_index_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class mini_batch_index_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_MINI_BATCH_INDEX_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_MINI_BATCH_INDEX_HPP_INCLUDED diff --git a/include/lbann/layers/misc/mini_batch_size.hpp b/include/lbann/layers/misc/mini_batch_size.hpp index 5a1445ef422..3b20486de65 100644 --- a/include/lbann/layers/misc/mini_batch_size.hpp +++ b/include/lbann/layers/misc/mini_batch_size.hpp @@ -36,7 +36,8 @@ namespace lbann { * Output tensor is a 1D tensor with a single entry containing the * model's current mini-batch size. */ -template +template class mini_batch_size_layer : public Layer { public: @@ -72,6 +73,19 @@ class mini_batch_size_layer : public Layer { }; +#ifndef LBANN_MINI_BATCH_SIZE_LAYER_INSTANTIATE +extern template class mini_batch_size_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class mini_batch_size_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class mini_batch_size_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class mini_batch_size_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_MINI_BATCH_SIZE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_MINI_BATCH_SIZE_HPP_INCLUDED diff --git a/include/lbann/layers/misc/one_hot.hpp b/include/lbann/layers/misc/one_hot.hpp index aeaca0ab975..c362e091ab3 100644 --- a/include/lbann/layers/misc/one_hot.hpp +++ b/include/lbann/layers/misc/one_hot.hpp @@ -41,12 +41,12 @@ namespace lbann { */ template class one_hot_layer : public Layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "one-hot layer only supports data-parallel layout"); public: one_hot_layer(lbann_comm* comm, size_t size) : Layer(comm) { set_output_dims({static_cast(size)}); - static_assert(Layout == data_layout::DATA_PARALLEL, - "one-hot layer only supports data-parallel layout"); } one_hot_layer* copy() const override { return new one_hot_layer(*this); } std::string get_type() const override { return "one-hot"; } @@ -76,6 +76,15 @@ class one_hot_layer : public Layer { }; +#ifndef LBANN_ONE_HOT_LAYER_INSTANTIATE +extern template class one_hot_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class one_hot_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_ONE_HOT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED diff --git a/include/lbann/layers/misc/variance.hpp b/include/lbann/layers/misc/variance.hpp index 685f1f9c340..c9839b446be 100644 --- a/include/lbann/layers/misc/variance.hpp +++ b/include/lbann/layers/misc/variance.hpp @@ -117,6 +117,19 @@ class variance_layer : public Layer { }; +#ifndef LBANN_VARIANCE_LAYER_INSTANTIATE +extern template class variance_layer< + 
data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class variance_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class variance_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class variance_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_VARIANCE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index 030fd1359d6..a981d770f1e 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -56,7 +56,8 @@ enum class batch_normalization_stats_aggregation { */ template class batch_normalization_layer : public regularizer_layer { - + static_assert(T_layout == data_layout::DATA_PARALLEL, + "batch normalization only supports DATA_PARALLEL"); private: /** Decay rate for the running statistics. */ @@ -116,8 +117,6 @@ class batch_normalization_layer : public regularizer_layer { m_decay(decay), m_epsilon(epsilon), m_statistics_group_size(statistics_group_size) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "batch normalization only supports DATA_PARALLEL"); #ifdef LBANN_DETERMINISTIC // Force global computation. m_statistics_group_size = 0; @@ -335,6 +334,15 @@ class batch_normalization_layer : public regularizer_layer { }; +#ifndef LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE +extern template class batch_normalization_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class batch_normalization_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_REGULARIZER_BATCH_NORMALIZATION_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/dropout.hpp b/include/lbann/layers/regularizers/dropout.hpp index 9b93423cc0b..95f85f54498 100644 --- a/include/lbann/layers/regularizers/dropout.hpp +++ b/include/lbann/layers/regularizers/dropout.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_REGULARIZER_DROPOUT_HPP_INCLUDED #include "lbann/layers/regularizers/regularizer.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/cudnn.hpp" #include "lbann/utils/random.hpp" @@ -88,7 +89,7 @@ class dropout : public regularizer_layer { dropout& operator=(const dropout& other) { regularizer_layer::operator=(other); m_keep_prob = other.m_keep_prob; - m_mask = other.m_mask ? other.m_mask->Copy() : nullptr; + m_mask = other.m_mask ? 
std::unique_ptr<AbsDistMat>(other.m_mask->Copy()) : nullptr;
 #ifdef LBANN_HAS_CUDNN
     m_tensors_cudnn_desc = other.m_tensors_cudnn_desc;
     m_tensors_cudnn_desc.set_layer(this);
@@ -337,6 +338,15 @@ class dropout : public regularizer_layer {
 
 };
 
+#ifndef LBANN_DROPOUT_LAYER_INSTANTIATE
+extern template class dropout<data_layout::DATA_PARALLEL, El::Device::CPU>;
+extern template class dropout<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+extern template class dropout<data_layout::DATA_PARALLEL, El::Device::GPU>;
+extern template class dropout<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+#endif // LBANN_DROPOUT_LAYER_INSTANTIATE
+
 } // namespace lbann
 
 #endif // LBANN_LAYER_REGULARIZER_DROPOUT_HPP_INCLUDED
diff --git a/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp
index 0bc692b92b6..7bf925519c4 100644
--- a/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp
+++ b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp
@@ -219,6 +219,19 @@ class entrywise_batch_normalization_layer : public Layer {
 
 };
 
+#ifndef LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE
+extern template class entrywise_batch_normalization_layer<
+  data_layout::DATA_PARALLEL, El::Device::CPU>;
+extern template class entrywise_batch_normalization_layer<
+  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+extern template class entrywise_batch_normalization_layer<
+  data_layout::DATA_PARALLEL, El::Device::GPU>;
+extern template class entrywise_batch_normalization_layer<
+  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+#endif // LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE
+
 } // namespace lbann
 
 #endif // LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED
diff --git a/include/lbann/layers/regularizers/local_response_normalization.hpp b/include/lbann/layers/regularizers/local_response_normalization.hpp
index b6cbbb014c7..7866ef96282 100644
--- a/include/lbann/layers/regularizers/local_response_normalization.hpp
+++ b/include/lbann/layers/regularizers/local_response_normalization.hpp
@@ -43,8 +43,11 @@ namespace lbann {
 * Advances in Neural Information Processing Systems,
 * pp. 1097-1105. 2012.
*/ -template +template class local_response_normalization_layer : public regularizer_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "local_response_normalization only supports DATA_PARALLEL"); public: local_response_normalization_layer(lbann_comm *comm, @@ -58,10 +61,7 @@ class local_response_normalization_layer : public regularizer_layer { , m_lrn_cudnn_desc(nullptr), m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN - { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "local_response_normalization only supports DATA_PARALLEL"); - } + { } local_response_normalization_layer(const local_response_normalization_layer& other) : regularizer_layer(other), @@ -110,6 +110,7 @@ class local_response_normalization_layer : public regularizer_layer { m_tensors_cudnn_desc = other.m_tensors_cudnn_desc; m_tensors_cudnn_desc.set_layer(this); #endif // LBANN_HAS_CUDNN + return *this; } ~local_response_normalization_layer() override { @@ -444,6 +445,15 @@ class local_response_normalization_layer : public regularizer_layer { }; +#ifndef LBANN_LOCAL_RESPONSE_NORMALIZATION_LAYER_INSTANTIATE +extern template class local_response_normalization_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class local_response_normalization_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_LOCAL_RESPONSE_NORMALIZATION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_LOCAL_RESPONSE_NORMALIZATION_HPP_INCLUDED diff --git a/include/lbann/layers/regularizers/selu_dropout.hpp b/include/lbann/layers/regularizers/selu_dropout.hpp index 8c7b837729f..428493f732e 100644 --- a/include/lbann/layers/regularizers/selu_dropout.hpp +++ b/include/lbann/layers/regularizers/selu_dropout.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_REGULARIZER_SELU_DROPOUT_HPP_INCLUDED #include "lbann/layers/regularizers/regularizer.hpp" +#include "lbann/models/model.hpp" namespace lbann { @@ -175,6 +176,19 @@ class selu_dropout : public regularizer_layer { AbsDistMat *m_mask; }; +#ifndef LBANN_SELU_DROPOUT_LAYER_INSTANTIATE +extern template class selu_dropout< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class selu_dropout< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class selu_dropout< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class selu_dropout< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_SELU_DROPOUT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_REGULARIZER_SELU_DROPOUT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/bernoulli.hpp b/include/lbann/layers/transform/bernoulli.hpp index f7216d75421..9fda020695e 100644 --- a/include/lbann/layers/transform/bernoulli.hpp +++ b/include/lbann/layers/transform/bernoulli.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_BERNOULLI_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -36,7 +37,8 @@ namespace lbann { * * During validation and testing, outputs are all zero. */ -template +template class bernoulli_layer : public transform_layer { private: /** Probability of outputting 1. 
*/ @@ -74,6 +76,19 @@ class bernoulli_layer : public transform_layer { }; +#ifndef LBANN_BERNOULLI_LAYER_INSTANTIATE +extern template class bernoulli_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class bernoulli_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class bernoulli_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class bernoulli_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_BERNOULLI_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_BERNOULLI_HPP_INCLUDED diff --git a/include/lbann/layers/transform/categorical_random.hpp b/include/lbann/layers/transform/categorical_random.hpp index 555b44ff67e..44e303530dc 100644 --- a/include/lbann/layers/transform/categorical_random.hpp +++ b/include/lbann/layers/transform/categorical_random.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_CATEGORICAL_RANDOM_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -40,16 +41,17 @@ namespace lbann { * * @todo Remove. */ -template +template class categorical_random_layer : public transform_layer { - + static_assert(Dev == El::Device::CPU, + "categorical random layer currently only supports CPU"); + static_assert(T_layout == data_layout::DATA_PARALLEL, + "categorical random layer currently only " + "supports DATA_PARALLEL"); public: categorical_random_layer(lbann_comm *comm) : transform_layer(comm) { - static_assert(Dev == El::Device::CPU, - "categorical random layer currently only supports CPU"); - static_assert(T_layout == data_layout::DATA_PARALLEL, - "categorical random layer currently only supports DATA_PARALLEL"); } categorical_random_layer* copy() const override { return new categorical_random_layer(*this); } std::string get_type() const override { return "categorical random"; } @@ -109,6 +111,11 @@ class categorical_random_layer : public transform_layer { }; +#ifndef LBANN_CATEGORICAL_RANDOM_LAYER_INSTANTIATE +extern template class categorical_random_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#endif // LBANN_CATEGORICAL_RANDOM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_CATEGORICAL_RANDOM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/concatenation.hpp b/include/lbann/layers/transform/concatenation.hpp index 2630ea6a3d8..218877f9d52 100644 --- a/include/lbann/layers/transform/concatenation.hpp +++ b/include/lbann/layers/transform/concatenation.hpp @@ -28,12 +28,14 @@ #define LBANN_LAYER_CONCATENATION_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/exception.hpp" namespace lbann { /** @brief Concatenate tensors along specified dimension. */ -template +template class concatenation_layer : public transform_layer { public: @@ -56,6 +58,7 @@ class concatenation_layer : public transform_layer { m_concat_points = other.m_concat_points; m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); m_output_v.reset(other.m_output_v ? 
other.m_output_v->Copy() : nullptr); + return *this; } concatenation_layer* copy() const override { return new concatenation_layer(*this); } @@ -283,6 +286,19 @@ class concatenation_layer : public transform_layer { }; +#ifndef LBANN_CONCATENATION_LAYER_INSTANTIATE +extern template class concatenation_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class concatenation_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class concatenation_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class concatenation_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_CONCATENATION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_CONCATENATION_HPP_INCLUDED diff --git a/include/lbann/layers/transform/constant.hpp b/include/lbann/layers/transform/constant.hpp index 3324e621ffd..c6e378851cd 100644 --- a/include/lbann/layers/transform/constant.hpp +++ b/include/lbann/layers/transform/constant.hpp @@ -32,7 +32,8 @@ namespace lbann { /** @brief Constant output. */ -template +template class constant_layer : public transform_layer { public: @@ -72,6 +73,19 @@ class constant_layer : public transform_layer { }; +#ifndef LBANN_CONSTANT_LAYER_INSTANTIATE +extern template class constant_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class constant_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class constant_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class constant_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_CONSTANT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_CONSTANT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/crop.hpp b/include/lbann/layers/transform/crop.hpp index f0b37b293d3..30fbc0e810e 100644 --- a/include/lbann/layers/transform/crop.hpp +++ b/include/lbann/layers/transform/crop.hpp @@ -40,15 +40,16 @@ namespace lbann { * to the red-top-left corner and (1,1,1) to the blue-bottom-right * corner. The crop size is determined at setup. */ -template +template class crop_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "crop layer only supports DATA_PARALLEL"); public: crop_layer(lbann_comm *comm, std::vector dims) : transform_layer(comm) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "crop layer only supports DATA_PARALLEL"); set_output_dims(dims); this->m_expected_num_parent_layers = 2; } @@ -327,6 +328,13 @@ class crop_layer : public transform_layer { }; +#ifndef LBANN_CROP_LAYER_INSTANTIATE +extern template class crop_layer; +#ifdef LBANN_HAS_GPU +extern template class crop_layer; +#endif // LBANN_HAS_GPU +#endif // LBANN_CROP_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_CROP_HPP_INCLUDED diff --git a/include/lbann/layers/transform/discrete_random.hpp b/include/lbann/layers/transform/discrete_random.hpp index 1da49cd16d7..853c5880822 100644 --- a/include/lbann/layers/transform/discrete_random.hpp +++ b/include/lbann/layers/transform/discrete_random.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_DISCRETE_RANDOM_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -39,8 +40,13 @@ namespace lbann { * * @todo Remove. 
*/ -template +template class discrete_random_layer : public transform_layer { + static_assert(Dev == El::Device::CPU, + "discrete random layer currently only supports CPU"); + static_assert(T_layout == data_layout::DATA_PARALLEL, + "discrete random layer currently only supports DATA_PARALLEL"); private: /** Values in discrete distribution. */ @@ -52,10 +58,6 @@ class discrete_random_layer : public transform_layer { std::vector dims) : transform_layer(comm), m_values(values) { - static_assert(Dev == El::Device::CPU, - "discrete random layer currently only supports CPU"); - static_assert(T_layout == data_layout::DATA_PARALLEL, - "discrete random layer currently only supports DATA_PARALLEL"); set_output_dims(dims); } discrete_random_layer* copy() const override { return new discrete_random_layer(*this); } @@ -119,6 +121,10 @@ class discrete_random_layer : public transform_layer { }; +#ifndef LBANN_DISCRETE_RANDOM_LAYER_INSTANTIATE +extern template class discrete_random_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#endif // LBANN_DISCRETE_RANDOM_LAYER_INSTANTIATE } // namespace lbann #endif // LBANN_LAYER_DISCRETE_RANDOM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/dummy.hpp b/include/lbann/layers/transform/dummy.hpp index ec451fbe08e..053a4385b02 100644 --- a/include/lbann/layers/transform/dummy.hpp +++ b/include/lbann/layers/transform/dummy.hpp @@ -36,7 +36,8 @@ namespace lbann { * Does no computation and is primarily intended as a placeholder for * unused layer outputs. */ -template +template class dummy_layer : public transform_layer { public: dummy_layer(lbann_comm *comm) : transform_layer(comm) { @@ -50,6 +51,19 @@ class dummy_layer : public transform_layer { void fp_compute() override {} }; +#ifndef LBANN_DUMMY_LAYER_INSTANTIATE +extern template class dummy_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class dummy_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class dummy_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class dummy_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_DUMMY_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_DUMMY_HPP_INCLUDED diff --git a/include/lbann/layers/transform/evaluation.hpp b/include/lbann/layers/transform/evaluation.hpp index 014ff9e3849..0e237f7911f 100644 --- a/include/lbann/layers/transform/evaluation.hpp +++ b/include/lbann/layers/transform/evaluation.hpp @@ -77,7 +77,8 @@ class abstract_evaluation_layer : public transform_layer { * Computes the average value across a mini-batch. If the input * tensor has multiple neurons, their values are added together. 
*/ -template +template class evaluation_layer : public abstract_evaluation_layer { public: evaluation_layer(lbann_comm *comm) : abstract_evaluation_layer(comm) {} @@ -87,6 +88,19 @@ class evaluation_layer : public abstract_evaluation_layer { El::Device get_device_allocation() const override { return Dev; } }; +#ifndef LBANN_EVALUATION_LAYER_INSTANTIATE +extern template class evaluation_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class evaluation_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class evaluation_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class evaluation_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_EVALUATION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_EVALUATION_HPP_INCLUDED diff --git a/include/lbann/layers/transform/gaussian.hpp b/include/lbann/layers/transform/gaussian.hpp index 8d679fd4a91..806af947448 100644 --- a/include/lbann/layers/transform/gaussian.hpp +++ b/include/lbann/layers/transform/gaussian.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_GAUSSIAN_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -37,7 +38,8 @@ namespace lbann { * During validation and testing, outputs are all equal to the * distribution mean. */ -template +template class gaussian_layer : public transform_layer { private: /** Gaussian distribution mean. */ @@ -79,6 +81,19 @@ class gaussian_layer : public transform_layer { }; +#ifndef LBANN_GAUSSIAN_LAYER_INSTANTIATE +extern template class gaussian_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class gaussian_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class gaussian_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class gaussian_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_GAUSSIAN_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_GAUSSIAN_HPP_INCLUDED diff --git a/include/lbann/layers/transform/hadamard.hpp b/include/lbann/layers/transform/hadamard.hpp index 04426334b91..a5db4686bc1 100644 --- a/include/lbann/layers/transform/hadamard.hpp +++ b/include/lbann/layers/transform/hadamard.hpp @@ -34,7 +34,8 @@ namespace lbann { /** @brief Entry-wise tensor product. */ -template +template class hadamard_layer : public transform_layer { public: @@ -126,6 +127,19 @@ class hadamard_layer : public transform_layer { }; +#ifndef LBANN_HADAMARD_LAYER_INSTANTIATE +extern template class hadamard_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class hadamard_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class hadamard_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class hadamard_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_HADAMARD_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_HADAMARD_HPP_INCLUDED diff --git a/include/lbann/layers/transform/in_top_k.hpp b/include/lbann/layers/transform/in_top_k.hpp index 959fb37881e..137a1153b7f 100644 --- a/include/lbann/layers/transform/in_top_k.hpp +++ b/include/lbann/layers/transform/in_top_k.hpp @@ -38,7 +38,8 @@ namespace lbann { * one and the rest to zero. Ties are broken in favor of entries with * smaller indices. 
*/ -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class in_top_k_layer : public transform_layer { public: @@ -78,6 +79,19 @@ class in_top_k_layer : public transform_layer { }; +#ifndef LBANN_IN_TOP_K_LAYER_INSTANTIATE +extern template class in_top_k_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class in_top_k_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class in_top_k_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class in_top_k_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_IN_TOP_K_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_IN_TOP_K_HPP_INCLUDED diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp index e70bf9dd303..5dd2fb98e81 100644 --- a/include/lbann/layers/transform/pooling.hpp +++ b/include/lbann/layers/transform/pooling.hpp @@ -40,12 +40,15 @@ namespace lbann { template <data_layout T_layout, El::Device Dev> class unpooling_layer; -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class pooling_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "pooling only supports DATA_PARALLEL"); private: /** Pooling mode. */ - const pool_mode m_pool_mode; + pool_mode m_pool_mode; /** Pooling window dimensions. */ std::vector<int> m_pool_dims; @@ -103,9 +106,6 @@ class pooling_layer : public transform_layer { m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "pooling only supports DATA_PARALLEL"); - // Initialize input dimensions and pooling parameters m_pool_size = std::accumulate(m_pool_dims.begin(), m_pool_dims.end(), @@ -553,6 +553,15 @@ class pooling_layer : public transform_layer { }; +#ifndef LBANN_POOLING_LAYER_INSTANTIATE +extern template class pooling_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class pooling_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_POOLING_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_POOLING_HPP_INCLUDED diff --git a/include/lbann/layers/transform/reduction.hpp b/include/lbann/layers/transform/reduction.hpp index 0328ccf4c3e..dce99308c0f 100644 --- a/include/lbann/layers/transform/reduction.hpp +++ b/include/lbann/layers/transform/reduction.hpp @@ -38,8 +38,11 @@ enum class reduction_mode {INVALID, SUM, AVERAGE}; * * @todo Reduction over specified dimensions. */ -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class reduction_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "reduction currently only supports DATA_PARALLEL"); private: /** Reduction mode.
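* (Illustrative note, not from the original header: reduction_mode::SUM reduces an input tensor such as {1, 2, 3} to 6, while reduction_mode::AVERAGE yields 2; reduction_mode::INVALID is rejected by the constructor below.)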
*/ @@ -54,8 +57,6 @@ class reduction_layer : public transform_layer { reduction_mode mode) : transform_layer(comm), m_mode(mode) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "reduction currently only supports DATA_PARALLEL"); if (mode == reduction_mode::INVALID) { LBANN_ERROR("invalid reduction mode"); } @@ -143,6 +144,15 @@ class reduction_layer : public transform_layer { }; +#ifndef LBANN_REDUCTION_LAYER_INSTANTIATE +extern template class reduction_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class reduction_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_REDUCTION_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_REDUCTION_HPP_INCLUDED diff --git a/include/lbann/layers/transform/reshape.hpp b/include/lbann/layers/transform/reshape.hpp index 7770080ff69..982c8eab3bd 100644 --- a/include/lbann/layers/transform/reshape.hpp +++ b/include/lbann/layers/transform/reshape.hpp @@ -102,6 +102,19 @@ class reshape_layer : public transform_layer { }; +#ifndef LBANN_RESHAPE_LAYER_INSTANTIATE +extern template class reshape_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class reshape_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class reshape_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class reshape_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_RESHAPE_LAYER_INSTANTIATE + } // namespace lbann #endif // RESHAPE_HPP_INCLUDED diff --git a/include/lbann/layers/transform/slice.hpp b/include/lbann/layers/transform/slice.hpp index 98113accec9..091ecb1fca6 100644 --- a/include/lbann/layers/transform/slice.hpp +++ b/include/lbann/layers/transform/slice.hpp @@ -44,7 +44,8 @@ namespace lbann { * \cdots\times D_n @f$ * tensor. */ -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class slice_layer : public transform_layer { public: @@ -71,6 +72,7 @@ class slice_layer : public transform_layer { m_slice_points = other.m_slice_points; m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); m_output_v.reset(other.m_output_v ? other.m_output_v->Copy() : nullptr); + return *this; } slice_layer* copy() const override { return new slice_layer(*this); } @@ -286,6 +288,15 @@ class slice_layer : public transform_layer { }; +#ifndef LBANN_SLICE_LAYER_INSTANTIATE +extern template class slice_layer<data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class slice_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class slice_layer<data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class slice_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_SLICE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_SLICE_HPP_INCLUDED diff --git a/include/lbann/layers/transform/sort.hpp b/include/lbann/layers/transform/sort.hpp index 131297383bc..d9d54821e3f 100644 --- a/include/lbann/layers/transform/sort.hpp +++ b/include/lbann/layers/transform/sort.hpp @@ -32,14 +32,15 @@ namespace lbann { /** @brief Sort tensor entries.
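* Illustrative example (not in the original header): with the default descending = false, a sample holding {3, 1, 2} is output as {1, 2, 3}; constructing the layer with descending = true yields {3, 2, 1}.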
*/ -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class sort_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "sort layer only supports DATA_PARALLEL"); public: sort_layer(lbann_comm *comm, bool descending = false) : transform_layer(comm), m_descending(descending) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "sort layer only supports DATA_PARALLEL"); } sort_layer(const sort_layer& other) : transform_layer(other), @@ -138,6 +139,15 @@ class sort_layer : public transform_layer { }; +#ifndef LBANN_SORT_LAYER_INSTANTIATE +extern template class sort_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class sort_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_SORT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_SORT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/split.hpp b/include/lbann/layers/transform/split.hpp index a7f151f7452..600148a8062 100644 --- a/include/lbann/layers/transform/split.hpp +++ b/include/lbann/layers/transform/split.hpp @@ -34,7 +34,8 @@ namespace lbann { /** @brief Present input tensor to multiple outputs. */ -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class split_layer : public transform_layer { public: @@ -80,6 +81,15 @@ class split_layer : public transform_layer { }; +#ifndef LBANN_SPLIT_LAYER_INSTANTIATE +extern template class split_layer<data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class split_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class split_layer<data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class split_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_SPLIT_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_SPLIT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/stop_gradient.hpp b/include/lbann/layers/transform/stop_gradient.hpp index 4adeafbb205..08666c25c2c 100644 --- a/include/lbann/layers/transform/stop_gradient.hpp +++ b/include/lbann/layers/transform/stop_gradient.hpp @@ -60,6 +60,19 @@ class stop_gradient_layer : public transform_layer { }; +#ifndef LBANN_STOP_GRADIENT_LAYER_INSTANTIATE +extern template class stop_gradient_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class stop_gradient_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class stop_gradient_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class stop_gradient_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_STOP_GRADIENT_LAYER_INSTANTIATE + } // namespace lbann #endif // STOP_GRADIENT_HPP_INCLUDED diff --git a/include/lbann/layers/transform/sum.hpp b/include/lbann/layers/transform/sum.hpp index ab9ce9a4af6..3544f90b1a7 100644 --- a/include/lbann/layers/transform/sum.hpp +++ b/include/lbann/layers/transform/sum.hpp @@ -32,7 +32,8 @@ namespace lbann { -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class sum_layer : public transform_layer { public: @@ -104,6 +105,15 @@ class sum_layer : public transform_layer { }; +#ifndef LBANN_SUM_LAYER_INSTANTIATE +extern template class sum_layer<data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class sum_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class sum_layer<data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class sum_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_SUM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_SUM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/tessellate.hpp b/include/lbann/layers/transform/tessellate.hpp index eafe02cb9df..07111dc8ab9 100644 --- a/include/lbann/layers/transform/tessellate.hpp +++ b/include/lbann/layers/transform/tessellate.hpp @@ -57,7 +57,8
@@ namespace lbann { * e_n@f$. Then, denoting the modulo operator with @f$ \% @f$, * @f[ Y_{i_1,\cdots,i_n} = X_{i_1\% d_1,\cdots,i_n\% d_n} @f] */ -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class tessellate_layer : public Layer { public: @@ -211,6 +212,19 @@ class tessellate_layer : public Layer { }; +#ifndef LBANN_TESSELLATE_LAYER_INSTANTIATE +extern template class tessellate_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class tessellate_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class tessellate_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class tessellate_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_TESSELLATE_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYERS_TRANSFORM_TESSELLATE_HPP_INCLUDED diff --git a/include/lbann/layers/transform/uniform.hpp b/include/lbann/layers/transform/uniform.hpp index b7423cb0295..5b394396cb6 100644 --- a/include/lbann/layers/transform/uniform.hpp +++ b/include/lbann/layers/transform/uniform.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_UNIFORM_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/utils/random.hpp" namespace lbann { @@ -37,7 +38,8 @@ namespace lbann { * During validation and testing, outputs are all equal to the * distribution mean. */ -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class uniform_layer : public transform_layer { private: /** Uniform distribution mean. */ @@ -83,6 +85,19 @@ class uniform_layer : public transform_layer { }; +#ifndef LBANN_UNIFORM_LAYER_INSTANTIATE +extern template class uniform_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class uniform_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class uniform_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class uniform_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_UNIFORM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_UNIFORM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/unpooling.hpp b/include/lbann/layers/transform/unpooling.hpp index 9a88eabcc1a..59372cf185a 100644 --- a/include/lbann/layers/transform/unpooling.hpp +++ b/include/lbann/layers/transform/unpooling.hpp @@ -37,8 +37,13 @@ namespace lbann { /** @brief Transpose of pooling layer. * @todo GPU support. */ -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class unpooling_layer : public transform_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "unpooling only supports DATA_PARALLEL"); + static_assert(Dev == El::Device::CPU, + "unpooling only supports CPU"); private: /** Corresponding pooling layer.
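* (Illustrative note, not in the original header: the pointer is supplied through the constructor below and must refer to the pooling layer whose operation this layer reverses, per the @brief above describing unpooling as the transpose of pooling.)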
*/ @@ -49,12 +54,7 @@ class unpooling_layer : public transform_layer { unpooling_layer(lbann_comm *comm, pooling_layer<T_layout, Dev>* pool = nullptr) : transform_layer(comm), - m_pooling_layer(pool) { - static_assert(T_layout == data_layout::DATA_PARALLEL, - "unpooling only supports DATA_PARALLEL"); - static_assert(Dev == El::Device::CPU, - "unpooling only supports CPU"); - } + m_pooling_layer(pool) { } unpooling_layer* copy() const override { return new unpooling_layer(*this); } std::string get_type() const override { return "unpooling"; } @@ -252,6 +252,11 @@ class unpooling_layer : public transform_layer { }; +#ifndef LBANN_UNPOOLING_LAYER_INSTANTIATE +extern template class unpooling_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#endif // LBANN_UNPOOLING_LAYER_INSTANTIATE + } // namespace lbann -#endif // LBANN_LAYER_POOLING_HPP_INCLUDED +#endif // LBANN_LAYER_UNPOOLING_HPP_INCLUDED diff --git a/include/lbann/layers/transform/weighted_sum.hpp b/include/lbann/layers/transform/weighted_sum.hpp index fe1e367be98..b96bb40f0e6 100644 --- a/include/lbann/layers/transform/weighted_sum.hpp +++ b/include/lbann/layers/transform/weighted_sum.hpp @@ -34,7 +34,8 @@ namespace lbann { /** @brief Add tensors with specified scaling factors. */ -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class weighted_sum_layer : public transform_layer { private: @@ -130,6 +131,19 @@ class weighted_sum_layer : public transform_layer { }; +#ifndef LBANN_WEIGHTED_SUM_LAYER_INSTANTIATE +extern template class weighted_sum_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class weighted_sum_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class weighted_sum_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class weighted_sum_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_WEIGHTED_SUM_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_WEIGHTED_SUM_HPP_INCLUDED diff --git a/include/lbann/layers/transform/weights.hpp b/include/lbann/layers/transform/weights.hpp index 5f0091fed1f..8c34cf0aaa6 100644 --- a/include/lbann/layers/transform/weights.hpp +++ b/include/lbann/layers/transform/weights.hpp @@ -28,6 +28,7 @@ #define LBANN_LAYER_WEIGHTS_HPP_INCLUDED #include "lbann/layers/transform/transform.hpp" +#include "lbann/models/model.hpp" #include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -36,7 +37,8 @@ namespace lbann { * * Interfaces with a @c weights object and outputs its tensor.
*/ -template <data_layout T_layout = data_layout::DATA_PARALLEL, El::Device Dev = El::Device::CPU> +template <data_layout T_layout, + El::Device Dev> class weights_layer : public transform_layer { public: @@ -203,6 +205,19 @@ class weights_layer : public transform_layer { }; +#ifndef LBANN_WEIGHTS_LAYER_INSTANTIATE +extern template class weights_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class weights_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class weights_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class weights_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_WEIGHTS_LAYER_INSTANTIATE + } // namespace lbann #endif // LBANN_LAYER_WEIGHTS_HPP_INCLUDED diff --git a/include/lbann/utils/CMakeLists.txt b/include/lbann/utils/CMakeLists.txt index 230ff88b506..c31a52fdc75 100644 --- a/include/lbann/utils/CMakeLists.txt +++ b/include/lbann/utils/CMakeLists.txt @@ -9,6 +9,7 @@ set_full_path(THIS_DIR_HEADERS description.hpp entrywise_operator.hpp enum_iterator.hpp + eti_macros.hpp exception.hpp factory.hpp factory_error_policies.hpp diff --git a/src/layers/CMakeLists.txt b/src/layers/CMakeLists.txt index 2bd2ea2db36..305a84b6241 100644 --- a/src/layers/CMakeLists.txt +++ b/src/layers/CMakeLists.txt @@ -6,6 +6,7 @@ set_full_path(THIS_DIR_SOURCES # Add the subdirectories add_subdirectory(activations) add_subdirectory(image) +add_subdirectory(io/input) add_subdirectory(learning) add_subdirectory(loss) add_subdirectory(math) diff --git a/src/layers/activations/CMakeLists.txt b/src/layers/activations/CMakeLists.txt index e7be74b8843..30b6880475f 100644 --- a/src/layers/activations/CMakeLists.txt +++ b/src/layers/activations/CMakeLists.txt @@ -2,6 +2,7 @@ set_full_path(THIS_DIR_SOURCES activations.cpp elu.cpp + identity.cpp leaky_relu.cpp log_softmax.cpp softmax.cpp diff --git a/src/layers/activations/activations.cpp b/src/layers/activations/activations.cpp index 6b45f0c0b63..d9b945e3a1f 100644 --- a/src/layers/activations/activations.cpp +++ b/src/layers/activations/activations.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_ACTIVATIONS_LAYER_INSTANTIATE #include "lbann/layers/activations/activations.hpp" #include "lbann/utils/entrywise_operator.hpp" @@ -158,12 +159,14 @@ struct softsign_op { apply_entrywise_binary_operator<op>(get_prev_activations(), \ get_prev_error_signals(), \ get_error_signals()); \ - } - INSTANTIATE(log_sigmoid_layer, log_sigmoid_op) - INSTANTIATE(relu_layer, relu_op) - INSTANTIATE(selu_layer, selu_op) - INSTANTIATE(sigmoid_layer, sigmoid_op) - INSTANTIATE(softplus_layer, softplus_op) - INSTANTIATE(softsign_layer, softsign_op) + } \ + UNARY_ETI_INST_MACRO_DEV(layer, El::Device::CPU) + +INSTANTIATE(log_sigmoid_layer, log_sigmoid_op); +INSTANTIATE(relu_layer, relu_op); +INSTANTIATE(selu_layer, selu_op); +INSTANTIATE(sigmoid_layer, sigmoid_op); +INSTANTIATE(softplus_layer, softplus_op); +INSTANTIATE(softsign_layer, softsign_op); } // namespace lbann diff --git a/src/layers/activations/activations.cu b/src/layers/activations/activations.cu index 16696e11910..992180a8c38 100644 --- a/src/layers/activations/activations.cu +++ b/src/layers/activations/activations.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license.
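// The header hunks above and the source-file hunks below all apply one
// explicit-template-instantiation (ETI) pattern. A minimal sketch of that
// pattern, using my_layer and LBANN_MY_LAYER_INSTANTIATE as illustrative
// stand-ins for the real layer names:
//
//   // my_layer.hpp
//   template <data_layout T_layout, El::Device Dev>
//   class my_layer : public transform_layer { /* ... */ };
//
//   #ifndef LBANN_MY_LAYER_INSTANTIATE
//   extern template class my_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
//   #endif // LBANN_MY_LAYER_INSTANTIATE
//
//   // my_layer.cpp
//   #define LBANN_MY_LAYER_INSTANTIATE // must come before the include
//   #include "my_layer.hpp"
//   template class my_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
//
// Every other translation unit that includes the header sees the extern
// declaration and skips instantiating the template's members, so each layer
// is compiled once in its own source file rather than in every includer; the
// #define suppresses the extern declarations in exactly the file that
// provides the definitions.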
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_ACTIVATIONS_LAYER_INSTANTIATE #include "lbann/layers/activations/activations.hpp" #include "lbann/utils/cuda.hpp" @@ -157,12 +158,14 @@ struct softsign_op { cuda::apply_entrywise_binary_operator<op>(get_prev_activations(), \ get_prev_error_signals(), \ get_error_signals()); \ - } - INSTANTIATE(log_sigmoid_layer, log_sigmoid_op) - INSTANTIATE(relu_layer, relu_op) - INSTANTIATE(selu_layer, selu_op) - INSTANTIATE(sigmoid_layer, sigmoid_op) - INSTANTIATE(softplus_layer, softplus_op) - INSTANTIATE(softsign_layer, softsign_op) + } \ + UNARY_ETI_INST_MACRO_DEV(layer, El::Device::GPU) + +INSTANTIATE(log_sigmoid_layer, log_sigmoid_op); +INSTANTIATE(relu_layer, relu_op); +INSTANTIATE(selu_layer, selu_op); +INSTANTIATE(sigmoid_layer, sigmoid_op); +INSTANTIATE(softplus_layer, softplus_op); +INSTANTIATE(softsign_layer, softsign_op); } // namespace lbann diff --git a/src/layers/activations/elu.cpp b/src/layers/activations/elu.cpp index 4a4083b8a78..4f4bfc48de9 100644 --- a/src/layers/activations/elu.cpp +++ b/src/layers/activations/elu.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_ELU_LAYER_INSTANTIATE #include "lbann/layers/activations/elu.hpp" namespace lbann { @@ -100,4 +101,7 @@ void elu_layer get_local_error_signals()); } +template class elu_layer<data_layout::DATA_PARALLEL, El::Device::CPU>; +template class elu_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/activations/elu.cu b/src/layers/activations/elu.cu index f6d6f1581fb..bbcc7c5abb5 100644 --- a/src/layers/activations/elu.cu +++ b/src/layers/activations/elu.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_ELU_LAYER_INSTANTIATE #include "lbann/layers/activations/elu.hpp" namespace lbann { @@ -162,4 +163,7 @@ void elu_layer get_local_error_signals()); } +template class elu_layer<data_layout::DATA_PARALLEL, El::Device::GPU>; +template class elu_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/activations/identity.cpp b/src/layers/activations/identity.cpp new file mode 100644 index 00000000000..b33a56b4212 --- /dev/null +++ b/src/layers/activations/identity.cpp @@ -0,0 +1,39 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license.
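// The INSTANTIATE macros in activations.cpp and activations.cu above now end
// by invoking UNARY_ETI_INST_MACRO_DEV, which comes from the eti_macros.hpp
// header added to the utils CMakeLists in this patch. Its definition is not
// shown in these hunks, so the following is only a plausible sketch of such a
// macro, inferred from the per-layout instantiations that other files in
// this patch spell out by hand:
//
//   #define UNARY_ETI_INST_MACRO_DEV(LAYER, DEVICE)              \
//     template class LAYER<data_layout::DATA_PARALLEL, DEVICE>;  \
//     template class LAYER<data_layout::MODEL_PARALLEL, DEVICE>
//
// Under that assumption, UNARY_ETI_INST_MACRO_DEV(relu_layer, El::Device::CPU)
// would emit the same two explicit instantiations that a file such as
// leaky_relu.cpp (further below) writes out explicitly.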
+//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_IDENTITY_LAYER_INSTANTIATE +#include "lbann/layers/activations/identity.hpp" + +namespace lbann { + +template class lbann::identity_layer<data_layout::DATA_PARALLEL, El::Device::CPU>; +template class lbann::identity_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +template class lbann::identity_layer<data_layout::DATA_PARALLEL, El::Device::GPU>; +template class lbann::identity_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU + +}// namespace lbann diff --git a/src/layers/activations/leaky_relu.cpp b/src/layers/activations/leaky_relu.cpp index e808e35a017..1e77d987785 100644 --- a/src/layers/activations/leaky_relu.cpp +++ b/src/layers/activations/leaky_relu.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_LEAKY_RELU_LAYER_INSTANTIATE #include "lbann/layers/activations/leaky_relu.hpp" namespace lbann { @@ -100,4 +101,9 @@ void leaky_relu_layer get_local_error_signals()); } +template class leaky_relu_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class leaky_relu_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/activations/leaky_relu.cu b/src/layers/activations/leaky_relu.cu index e87d9a39af0..43835abd155 100644 --- a/src/layers/activations/leaky_relu.cu +++ b/src/layers/activations/leaky_relu.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_LEAKY_RELU_LAYER_INSTANTIATE #include "lbann/layers/activations/leaky_relu.hpp" namespace lbann { @@ -162,4 +163,9 @@ void leaky_relu_layer get_local_error_signals()); } +template class leaky_relu_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class leaky_relu_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/activations/log_softmax.cpp b/src/layers/activations/log_softmax.cpp index 737d1ec1045..ccf4992e91f 100644 --- a/src/layers/activations/log_softmax.cpp +++ b/src/layers/activations/log_softmax.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_LOG_SOFTMAX_LAYER_INSTANTIATE #include "lbann/layers/activations/log_softmax.hpp" namespace lbann { @@ -154,4 +155,9 @@ void log_softmax_layer::bp_compute *m_workspace); } +template class log_softmax_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class log_softmax_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/activations/log_softmax.cu b/src/layers/activations/log_softmax.cu index d7ce82c5f9c..43584d28c46 100644 --- a/src/layers/activations/log_softmax.cu +++ b/src/layers/activations/log_softmax.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license.
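// A pattern worth noting in the source-file hunks above and below: each
// layer's CPU instantiations are emitted from its .cpp file and its GPU
// instantiations from the matching .cu file, so the GPU template definitions
// are compiled by the CUDA compiler. Both files define the same
// LBANN_*_LAYER_INSTANTIATE macro before including the header, which disables
// the header's extern declarations in exactly those two translation units.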
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_LOG_SOFTMAX_LAYER_INSTANTIATE #include "lbann/layers/activations/log_softmax.hpp" namespace lbann { @@ -388,4 +389,9 @@ void log_softmax_layer::bp_compute } +template class log_softmax_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class log_softmax_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/activations/softmax.cpp b/src/layers/activations/softmax.cpp index e56788fd43f..1839f642903 100644 --- a/src/layers/activations/softmax.cpp +++ b/src/layers/activations/softmax.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_SOFTMAX_LAYER_INSTANTIATE #include "lbann/layers/activations/softmax.hpp" namespace lbann { @@ -165,4 +166,9 @@ void softmax_layer::bp_compute() { *m_workspace); } +template class softmax_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class softmax_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/activations/softmax.cu b/src/layers/activations/softmax.cu index a58d38c760a..07b807f4a23 100644 --- a/src/layers/activations/softmax.cu +++ b/src/layers/activations/softmax.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_SOFTMAX_LAYER_INSTANTIATE #include "lbann/layers/activations/softmax.hpp" namespace lbann { @@ -433,4 +434,9 @@ void softmax_layer::bp_compute() { } +template class softmax_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class softmax_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/image/bilinear_resize.cpp b/src/layers/image/bilinear_resize.cpp index 4e293070d1c..ae44a78c95c 100644 --- a/src/layers/image/bilinear_resize.cpp +++ b/src/layers/image/bilinear_resize.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_BILINEAR_RESIZE_LAYER_INSTANTIATE #include "lbann/layers/image/bilinear_resize.hpp" namespace lbann { @@ -110,4 +111,7 @@ void bilinear_resize_layer::fp_comp } +template class bilinear_resize_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/image/bilinear_resize.cu b/src/layers/image/bilinear_resize.cu index 166b87c753c..808caf24d32 100644 --- a/src/layers/image/bilinear_resize.cu +++ b/src/layers/image/bilinear_resize.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_BILINEAR_RESIZE_LAYER_INSTANTIATE #include "lbann/layers/image/bilinear_resize.hpp" #include "lbann/utils/cuda.hpp" @@ -156,4 +157,7 @@ void bilinear_resize_layer::fp_comp } +template class bilinear_resize_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/io/input/CMakeLists.txt b/src/layers/io/input/CMakeLists.txt new file mode 100644 index 00000000000..fd26136188a --- /dev/null +++ b/src/layers/io/input/CMakeLists.txt @@ -0,0 +1,7 @@ +# Add the source files for this directory +set_full_path(THIS_DIR_SOURCES + input_layer.cpp + ) + +# Propagate the files up the tree +set(SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) diff --git a/src/layers/io/input/input_layer.cpp b/src/layers/io/input/input_layer.cpp new file mode 100644 index 00000000000..d81ae89daab --- /dev/null +++ b/src/layers/io/input/input_layer.cpp @@ -0,0 +1,43 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_INPUT_LAYER_INSTANTIATE +#include "lbann/layers/io/input/input_layer.hpp" + +namespace lbann { + +template class input_layer< + partitioned_io_buffer, data_layout::DATA_PARALLEL, El::Device::CPU>; +template class input_layer< + partitioned_io_buffer, data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +template class input_layer< + partitioned_io_buffer, data_layout::DATA_PARALLEL, El::Device::GPU>; +template class input_layer< + partitioned_io_buffer, data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU + +}// namespace lbann diff --git a/src/layers/learning/CMakeLists.txt b/src/layers/learning/CMakeLists.txt index 4ff1798faaa..e5b1cb0f4a8 100644 --- a/src/layers/learning/CMakeLists.txt +++ b/src/layers/learning/CMakeLists.txt @@ -1,6 +1,8 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES channelwise_scale_bias.cpp + convolution.cpp + deconvolution.cpp entrywise_scale_bias.cpp embedding.cpp fully_connected.cpp diff --git a/src/layers/learning/channelwise_scale_bias.cpp b/src/layers/learning/channelwise_scale_bias.cpp index b019ae28a5a..dec23a2a379 100644 --- a/src/layers/learning/channelwise_scale_bias.cpp +++ b/src/layers/learning/channelwise_scale_bias.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
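// The new input_layer.cpp above shows how the pattern extends to layers with
// extra template parameters: input_layer is also parameterized on the
// io_buffer type, so its four instantiations all fix partitioned_io_buffer
// as the first argument, and the instantiation count is the product of the
// supported parameter choices (here 1 buffer type x 2 layouts x 2 devices).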
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_CHANNELWISE_SCALE_BIAS_LAYER_INSTANTIATE #include "lbann/layers/learning/channelwise_scale_bias.hpp" #include "lbann/execution_contexts/sgd_execution_context.hpp" @@ -135,4 +136,7 @@ void channelwise_scale_bias_layer } +template class channelwise_scale_bias_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/learning/channelwise_scale_bias.cu b/src/layers/learning/channelwise_scale_bias.cu index a6d37f04251..5d321bd7d73 100644 --- a/src/layers/learning/channelwise_scale_bias.cu +++ b/src/layers/learning/channelwise_scale_bias.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_CHANNELWISE_SCALE_BIAS_LAYER_INSTANTIATE #include "lbann/layers/learning/channelwise_scale_bias.hpp" #ifdef HYDROGEN_HAVE_CUB #include "cub/block/block_reduce.cuh" #endif @@ -275,4 +276,7 @@ void channelwise_scale_bias_layer } +template class channelwise_scale_bias_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/learning/convolution.cpp b/src/layers/learning/convolution.cpp new file mode 100644 index 00000000000..d0b9b9c3cc9 --- /dev/null +++ b/src/layers/learning/convolution.cpp @@ -0,0 +1,42 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_CONVOLUTION_LAYER_INSTANTIATE +#include "lbann/layers/learning/base_convolution.hpp" +#include "lbann/layers/learning/convolution.hpp" + +namespace lbann { + +// Note: This unit will also instantiate the base_convolution_layer class. + +template class base_convolution_layer<El::Device::CPU>; +template class convolution_layer<data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +template class base_convolution_layer<El::Device::GPU>; +template class convolution_layer<data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU + +}// namespace lbann diff --git a/src/layers/learning/deconvolution.cpp b/src/layers/learning/deconvolution.cpp new file mode 100644 index 00000000000..8a64dc4bdcf --- /dev/null +++ b/src/layers/learning/deconvolution.cpp @@ -0,0 +1,40 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) 
listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_CONVOLUTION_LAYER_INSTANTIATE +#include "lbann/layers/learning/deconvolution.hpp" + +namespace lbann { + +template class deconvolution_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; + +#ifdef LBANN_HAS_GPU +template class deconvolution_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU + +}// namespace lbann diff --git a/src/layers/learning/embedding.cpp b/src/layers/learning/embedding.cpp index a84f7223dab..e79a7aa5bc3 100644 --- a/src/layers/learning/embedding.cpp +++ b/src/layers/learning/embedding.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_EMBEDDING_LAYER_INSTANTIATE #include "lbann/layers/learning/embedding.hpp" #include "lbann/models/model.hpp" #include "lbann/execution_contexts/sgd_execution_context.hpp" @@ -141,4 +142,7 @@ void embedding_layer::bp_compute() { } +// Explicit instantiation +template class embedding_layer<data_layout::DATA_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/learning/entrywise_scale_bias.cpp b/src/layers/learning/entrywise_scale_bias.cpp index 085ac42c259..01b607eea1b 100644 --- a/src/layers/learning/entrywise_scale_bias.cpp +++ b/src/layers/learning/entrywise_scale_bias.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE #include "lbann/layers/learning/entrywise_scale_bias.hpp" #include "lbann/execution_contexts/sgd_execution_context.hpp" @@ -160,4 +161,9 @@ void entrywise_scale_bias_layer c.get_effective_mini_batch_size()); } +template class entrywise_scale_bias_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class entrywise_scale_bias_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/learning/entrywise_scale_bias.cu b/src/layers/learning/entrywise_scale_bias.cu index d0492ab78a6..286dfa8a993 100644 --- a/src/layers/learning/entrywise_scale_bias.cu +++ b/src/layers/learning/entrywise_scale_bias.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license.
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE #include "lbann/layers/learning/entrywise_scale_bias.hpp" #include "lbann/execution_contexts/sgd_execution_context.hpp" @@ -216,4 +217,9 @@ void entrywise_scale_bias_layer c.get_effective_mini_batch_size()); } +template class entrywise_scale_bias_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class entrywise_scale_bias_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/learning/fully_connected.cpp b/src/layers/learning/fully_connected.cpp index 4454d6e8897..01dad8b8a04 100644 --- a/src/layers/learning/fully_connected.cpp +++ b/src/layers/learning/fully_connected.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_FULLY_CONNECTED_LAYER_INSTANTIATE #include "lbann/layers/learning/fully_connected.hpp" #include "lbann/execution_contexts/sgd_execution_context.hpp" @@ -490,4 +491,16 @@ void fully_connected_layer::bp_com } #endif // LBANN_HAS_GPU +template class fully_connected_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class fully_connected_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + +#ifdef LBANN_HAS_GPU +template class fully_connected_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class fully_connected_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU + } // namespace lbann diff --git a/src/layers/loss/categorical_accuracy.cpp b/src/layers/loss/categorical_accuracy.cpp index f10f80c599c..8659bc63500 100644 --- a/src/layers/loss/categorical_accuracy.cpp +++ b/src/layers/loss/categorical_accuracy.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE #include "lbann/layers/loss/categorical_accuracy.hpp" #include @@ -213,4 +214,9 @@ void categorical_accuracy_layer get_activations()); } +template class categorical_accuracy_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class categorical_accuracy_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/loss/categorical_accuracy.cu b/src/layers/loss/categorical_accuracy.cu index c91f3359bce..6079f02f8ed 100644 --- a/src/layers/loss/categorical_accuracy.cu +++ b/src/layers/loss/categorical_accuracy.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE #include "lbann/layers/loss/categorical_accuracy.hpp" #include "lbann/utils/cuda.hpp" @@ -383,4 +384,9 @@ void categorical_accuracy_layer get_activations()); } +template class categorical_accuracy_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class categorical_accuracy_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/loss/cross_entropy.cpp b/src/layers/loss/cross_entropy.cpp index 65eda604540..4e97301cabf 100644 --- a/src/layers/loss/cross_entropy.cpp +++ b/src/layers/loss/cross_entropy.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_CROSS_ENTROPY_LAYER_INSTANTIATE #include "lbann/layers/loss/cross_entropy.hpp" #include "lbann/utils/exception.hpp" @@ -132,4 +133,9 @@ void cross_entropy_layer local_gradient_wrt_ground_truth); } +template class cross_entropy_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class cross_entropy_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/loss/cross_entropy.cu b/src/layers/loss/cross_entropy.cu index 73ba9e2d226..20f5e6410ca 100644 --- a/src/layers/loss/cross_entropy.cu +++ b/src/layers/loss/cross_entropy.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_CROSS_ENTROPY_LAYER_INSTANTIATE #include "lbann/layers/loss/cross_entropy.hpp" #include "lbann/utils/exception.hpp" #include "math.h" @@ -204,4 +205,9 @@ void cross_entropy_layer local_gradient_wrt_ground_truth); } +template class cross_entropy_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class cross_entropy_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/loss/entrywise.cpp b/src/layers/loss/entrywise.cpp index 112a9c6861f..238630955aa 100644 --- a/src/layers/loss/entrywise.cpp +++ b/src/layers/loss/entrywise.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_ENTRYWISE_LAYER_INSTANTIATE #include "lbann/layers/loss/entrywise.hpp" #include "lbann/utils/entrywise_operator.hpp" @@ -235,11 +236,14 @@ struct boolean_false_positive_op { get_local_prev_error_signals(), \ get_local_error_signals(0), \ get_local_error_signals(1)); \ - } - INSTANTIATE(binary_cross_entropy_layer, binary_cross_entropy_op) - INSTANTIATE(sigmoid_binary_cross_entropy_layer, sigmoid_binary_cross_entropy_op) - INSTANTIATE(boolean_accuracy_layer, boolean_accuracy_op) - INSTANTIATE(boolean_false_negative_layer, boolean_false_negative_op) - INSTANTIATE(boolean_false_positive_layer, boolean_false_positive_op) + } \ + BINARY_ETI_INST_MACRO_DEV(layer, El::Device::CPU) + +INSTANTIATE(binary_cross_entropy_layer, binary_cross_entropy_op); +INSTANTIATE(sigmoid_binary_cross_entropy_layer, + sigmoid_binary_cross_entropy_op); +INSTANTIATE(boolean_accuracy_layer, boolean_accuracy_op); +INSTANTIATE(boolean_false_negative_layer, boolean_false_negative_op); +INSTANTIATE(boolean_false_positive_layer, boolean_false_positive_op); } // namespace lbann diff --git a/src/layers/loss/entrywise.cu b/src/layers/loss/entrywise.cu index c206dc815f6..ac10385dab6 100644 --- a/src/layers/loss/entrywise.cu +++ b/src/layers/loss/entrywise.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
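// The change to the INSTANTIATE macro in entrywise.cpp above (and in its GPU
// counterpart below) is the same one made in activations.cpp: the macro body
// used to end with a bare '}', so invocations were written without trailing
// semicolons; it now ends with a BINARY_ETI_INST_MACRO_DEV(...) invocation,
// which the ';' on each new INSTANTIATE(...) line terminates.
// BINARY_ETI_INST_MACRO_DEV also comes from the new eti_macros.hpp and is
// presumably the two-output analogue of the unary macro, though its
// definition is not shown in this patch.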
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_ENTRYWISE_LAYER_INSTANTIATE #include "lbann/layers/loss/entrywise.hpp" #include "lbann/utils/cuda.hpp" @@ -267,11 +268,14 @@ struct boolean_false_positive_op { get_local_prev_error_signals(), \ get_local_error_signals(0), \ get_local_error_signals(1)); \ - } - INSTANTIATE(binary_cross_entropy_layer, binary_cross_entropy_op) - INSTANTIATE(sigmoid_binary_cross_entropy_layer, sigmoid_binary_cross_entropy_op) - INSTANTIATE(boolean_accuracy_layer, boolean_accuracy_op) - INSTANTIATE(boolean_false_negative_layer, boolean_false_negative_op) - INSTANTIATE(boolean_false_positive_layer, boolean_false_positive_op) + } \ + BINARY_ETI_INST_MACRO_DEV(layer, El::Device::GPU) + +INSTANTIATE(binary_cross_entropy_layer, binary_cross_entropy_op); +INSTANTIATE(sigmoid_binary_cross_entropy_layer, + sigmoid_binary_cross_entropy_op); +INSTANTIATE(boolean_accuracy_layer, boolean_accuracy_op); +INSTANTIATE(boolean_false_negative_layer, boolean_false_negative_op); +INSTANTIATE(boolean_false_positive_layer, boolean_false_positive_op); } // namespace lbann diff --git a/src/layers/loss/l1_norm.cpp b/src/layers/loss/l1_norm.cpp index 1d20295719a..825b5bfdd1c 100644 --- a/src/layers/loss/l1_norm.cpp +++ b/src/layers/loss/l1_norm.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_L1_NORM_LAYER_INSTANTIATE #include "lbann/layers/loss/l1_norm.hpp" namespace lbann { @@ -97,4 +98,9 @@ void l1_norm_layer local_gradient_wrt_input); } +template class l1_norm_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class l1_norm_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/loss/l1_norm.cu b/src/layers/loss/l1_norm.cu index 1dfda1aac6e..f849afb4b84 100644 --- a/src/layers/loss/l1_norm.cu +++ b/src/layers/loss/l1_norm.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_L1_NORM_LAYER_INSTANTIATE #include "lbann/layers/loss/l1_norm.hpp" namespace lbann { @@ -172,4 +173,9 @@ void l1_norm_layer local_gradient_wrt_input); } +template class l1_norm_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class l1_norm_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/loss/l2_norm2.cpp b/src/layers/loss/l2_norm2.cpp index 258179d883f..01676d02ae4 100644 --- a/src/layers/loss/l2_norm2.cpp +++ b/src/layers/loss/l2_norm2.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_L2_NORM2_LAYER_INSTANTIATE #include "lbann/layers/loss/l2_norm2.hpp" namespace lbann { @@ -92,4 +93,9 @@ void l2_norm2_layer local_gradient_wrt_input); } +template class l2_norm2_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class l2_norm2_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/loss/l2_norm2.cu b/src/layers/loss/l2_norm2.cu index 1a02a1096c2..506cba2fb11 100644 --- a/src/layers/loss/l2_norm2.cu +++ b/src/layers/loss/l2_norm2.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_L2_NORM2_LAYER_INSTANTIATE #include "lbann/layers/loss/l2_norm2.hpp" namespace lbann { @@ -165,4 +166,9 @@ void l2_norm2_layer local_gradient_wrt_input); } +template class l2_norm2_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class l2_norm2_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/loss/mean_absolute_error.cpp b/src/layers/loss/mean_absolute_error.cpp index e497f11add6..074c0e1c53d 100644 --- a/src/layers/loss/mean_absolute_error.cpp +++ b/src/layers/loss/mean_absolute_error.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_MEAN_ABSOLUTE_ERROR_LAYER_INSTANTIATE #include "lbann/layers/loss/mean_absolute_error.hpp" namespace lbann { @@ -142,4 +143,9 @@ void mean_absolute_error_layer local_gradient_wrt_ground_truth); } +template class mean_absolute_error_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class mean_absolute_error_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/loss/mean_absolute_error.cu b/src/layers/loss/mean_absolute_error.cu index ba7c224c899..802e551473f 100644 --- a/src/layers/loss/mean_absolute_error.cu +++ b/src/layers/loss/mean_absolute_error.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_MEAN_ABSOLUTE_ERROR_LAYER_INSTANTIATE #include "lbann/layers/loss/mean_absolute_error.hpp" namespace lbann { @@ -221,4 +222,9 @@ void mean_absolute_error_layer local_gradient_wrt_ground_truth); } +template class mean_absolute_error_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class mean_absolute_error_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/loss/mean_squared_error.cpp b/src/layers/loss/mean_squared_error.cpp index 19df1b60b4c..cf78b2a6c24 100644 --- a/src/layers/loss/mean_squared_error.cpp +++ b/src/layers/loss/mean_squared_error.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_MEAN_SQUARED_ERROR_LAYER_INSTANTIATE #include "lbann/layers/loss/mean_squared_error.hpp" namespace lbann { @@ -133,4 +134,9 @@ void mean_squared_error_layer local_gradient_wrt_ground_truth); } +template class mean_squared_error_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class mean_squared_error_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/loss/mean_squared_error.cu b/src/layers/loss/mean_squared_error.cu index 024b676b39f..d350f279d1e 100644 --- a/src/layers/loss/mean_squared_error.cu +++ b/src/layers/loss/mean_squared_error.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_MEAN_SQUARED_ERROR_LAYER_INSTANTIATE #include "lbann/layers/loss/mean_squared_error.hpp" namespace lbann { @@ -213,4 +214,9 @@ void mean_squared_error_layer local_gradient_wrt_ground_truth); } +template class mean_squared_error_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class mean_squared_error_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/loss/top_k_categorical_accuracy.cpp b/src/layers/loss/top_k_categorical_accuracy.cpp index 9cad631a78c..ebb8d5ca6ee 100644 --- a/src/layers/loss/top_k_categorical_accuracy.cpp +++ b/src/layers/loss/top_k_categorical_accuracy.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_TOP_K_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE #include "lbann/layers/loss/top_k_categorical_accuracy.hpp" #include #include @@ -192,4 +193,9 @@ void top_k_categorical_accuracy_layer +template class top_k_categorical_accuracy_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class top_k_categorical_accuracy_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/loss/top_k_categorical_accuracy.cu b/src/layers/loss/top_k_categorical_accuracy.cu index 7ea4f64a77a..824f67ae633 100644 --- a/src/layers/loss/top_k_categorical_accuracy.cu +++ b/src/layers/loss/top_k_categorical_accuracy.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_TOP_K_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE #include "lbann/layers/loss/top_k_categorical_accuracy.hpp" #include "lbann/utils/cuda.hpp" #include "lbann/utils/exception.hpp" @@ -330,4 +331,9 @@ void top_k_categorical_accuracy_layer +template class top_k_categorical_accuracy_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class top_k_categorical_accuracy_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/math/binary.cpp b/src/layers/math/binary.cpp index 774bc7c7bf5..f1e5a58e98d 100644 --- a/src/layers/math/binary.cpp +++ b/src/layers/math/binary.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license.
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_BINARY_LAYER_INSTANTIATE #include "lbann/layers/math/binary.hpp" #include "lbann/utils/entrywise_operator.hpp" @@ -460,25 +461,27 @@ struct logical_xor_op { get_local_prev_error_signals(), \ get_local_error_signals(0), \ get_local_error_signals(1)); \ - } - INSTANTIATE(add_layer, add_op) - INSTANTIATE(subtract_layer, subtract_op) - INSTANTIATE(multiply_layer, multiply_op) - INSTANTIATE(divide_layer, divide_op) - INSTANTIATE(mod_layer, mod_op) - INSTANTIATE(pow_layer, pow_op) - INSTANTIATE(safe_divide_layer, safe_divide_op) - INSTANTIATE(squared_difference_layer, squared_difference_op) - INSTANTIATE(max_layer, max_op) - INSTANTIATE(min_layer, min_op) - INSTANTIATE(equal_layer, equal_op) - INSTANTIATE(not_equal_layer, not_equal_op) - INSTANTIATE(less_layer, less_op) - INSTANTIATE(less_equal_layer, less_equal_op) - INSTANTIATE(greater_layer, greater_op) - INSTANTIATE(greater_equal_layer, greater_equal_op) - INSTANTIATE(logical_and_layer, logical_and_op) - INSTANTIATE(logical_or_layer, logical_or_op) - INSTANTIATE(logical_xor_layer, logical_xor_op) + } \ + BINARY_ETI_INST_MACRO_DEV(layer, El::Device::CPU) + +INSTANTIATE(add_layer, add_op); +INSTANTIATE(subtract_layer, subtract_op); +INSTANTIATE(multiply_layer, multiply_op); +INSTANTIATE(divide_layer, divide_op); +INSTANTIATE(mod_layer, mod_op); +INSTANTIATE(pow_layer, pow_op); +INSTANTIATE(safe_divide_layer, safe_divide_op); +INSTANTIATE(squared_difference_layer, squared_difference_op); +INSTANTIATE(max_layer, max_op); +INSTANTIATE(min_layer, min_op); +INSTANTIATE(equal_layer, equal_op); +INSTANTIATE(not_equal_layer, not_equal_op); +INSTANTIATE(less_layer, less_op); +INSTANTIATE(less_equal_layer, less_equal_op); +INSTANTIATE(greater_layer, greater_op); +INSTANTIATE(greater_equal_layer, greater_equal_op); +INSTANTIATE(logical_and_layer, logical_and_op); +INSTANTIATE(logical_or_layer, logical_or_op); +INSTANTIATE(logical_xor_layer, logical_xor_op); } // namespace lbann diff --git a/src/layers/math/binary.cu b/src/layers/math/binary.cu index 401ea2c94b8..4ae488be43b 100644 --- a/src/layers/math/binary.cu +++ b/src/layers/math/binary.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_BINARY_LAYER_INSTANTIATE #include "lbann/layers/math/binary.hpp" namespace lbann { @@ -484,25 +485,27 @@ struct logical_xor_op { get_local_prev_error_signals(), \ get_local_error_signals(0), \ get_local_error_signals(1)); \ - } - INSTANTIATE(add_layer, add_op) - INSTANTIATE(subtract_layer, subtract_op) - INSTANTIATE(multiply_layer, multiply_op) - INSTANTIATE(divide_layer, divide_op) - INSTANTIATE(mod_layer, mod_op) - INSTANTIATE(pow_layer, pow_op) - INSTANTIATE(safe_divide_layer, safe_divide_op) - INSTANTIATE(squared_difference_layer, squared_difference_op) - INSTANTIATE(max_layer, max_op) - INSTANTIATE(min_layer, min_op) - INSTANTIATE(equal_layer, equal_op) - INSTANTIATE(not_equal_layer, not_equal_op) - INSTANTIATE(less_layer, less_op) - INSTANTIATE(less_equal_layer, less_equal_op) - INSTANTIATE(greater_layer, greater_op) - INSTANTIATE(greater_equal_layer, greater_equal_op) - INSTANTIATE(logical_and_layer, logical_and_op) - INSTANTIATE(logical_or_layer, logical_or_op) - INSTANTIATE(logical_xor_layer, logical_xor_op) + } \ + BINARY_ETI_INST_MACRO_DEV(layer, El::Device::GPU) + +INSTANTIATE(add_layer, add_op); +INSTANTIATE(subtract_layer, subtract_op); +INSTANTIATE(multiply_layer, multiply_op); +INSTANTIATE(divide_layer, divide_op); +INSTANTIATE(mod_layer, mod_op); +INSTANTIATE(pow_layer, pow_op); +INSTANTIATE(safe_divide_layer, safe_divide_op); +INSTANTIATE(squared_difference_layer, squared_difference_op); +INSTANTIATE(max_layer, max_op); +INSTANTIATE(min_layer, min_op); +INSTANTIATE(equal_layer, equal_op); +INSTANTIATE(not_equal_layer, not_equal_op); +INSTANTIATE(less_layer, less_op); +INSTANTIATE(less_equal_layer, less_equal_op); +INSTANTIATE(greater_layer, greater_op); +INSTANTIATE(greater_equal_layer, greater_equal_op); +INSTANTIATE(logical_and_layer, logical_and_op); +INSTANTIATE(logical_or_layer, logical_or_op); +INSTANTIATE(logical_xor_layer, logical_xor_op); } // namespace lbann diff --git a/src/layers/math/clamp.cpp b/src/layers/math/clamp.cpp index 0815cc2ee61..64f514f7984 100644 --- a/src/layers/math/clamp.cpp +++ b/src/layers/math/clamp.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_CLAMP_LAYER_INSTANTIATE #include "lbann/layers/math/clamp.hpp" namespace lbann { @@ -101,4 +102,9 @@ void clamp_layer get_local_error_signals()); } +template class clamp_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class clamp_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/math/clamp.cu b/src/layers/math/clamp.cu index 31e4064f745..7666c5a20ed 100644 --- a/src/layers/math/clamp.cu +++ b/src/layers/math/clamp.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_CLAMP_LAYER_INSTANTIATE #include "lbann/layers/math/clamp.hpp" namespace lbann { @@ -168,4 +169,9 @@ void clamp_layer get_local_error_signals()); } +template class clamp_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class clamp_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/math/unary.cpp b/src/layers/math/unary.cpp index f0d8954e0dd..b260ec31d7b 100644 --- a/src/layers/math/unary.cpp +++ b/src/layers/math/unary.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_UNARY_LAYER_INSTANTIATE #include "lbann/layers/math/unary.hpp" #include "lbann/utils/entrywise_operator.hpp" @@ -369,34 +370,36 @@ struct atanh_op { apply_entrywise_binary_operator(get_prev_activations(), \ get_prev_error_signals(), \ get_error_signals()); \ - } - INSTANTIATE(logical_not_layer, logical_not_op) - INSTANTIATE(abs_layer, abs_op) - INSTANTIATE(negative_layer, negative_op) - INSTANTIATE(sign_layer, sign_op) - INSTANTIATE(round_layer, round_op) - INSTANTIATE(ceil_layer, ceil_op) - INSTANTIATE(floor_layer, floor_op) - INSTANTIATE(reciprocal_layer, reciprocal_op) - INSTANTIATE(square_layer, square_op) - INSTANTIATE(sqrt_layer, sqrt_op) - INSTANTIATE(rsqrt_layer, rsqrt_op) - INSTANTIATE(safe_reciprocal_layer, safe_reciprocal_op) - INSTANTIATE(exp_layer, exp_op) - INSTANTIATE(expm1_layer, expm1_op) - INSTANTIATE(log_layer, log_op) - INSTANTIATE(log1p_layer, log1p_op) - INSTANTIATE(cos_layer, cos_op) - INSTANTIATE(sin_layer, sin_op) - INSTANTIATE(tan_layer, tan_op) - INSTANTIATE(acos_layer, acos_op) - INSTANTIATE(asin_layer, asin_op) - INSTANTIATE(atan_layer, atan_op) - INSTANTIATE(cosh_layer, cosh_op) - INSTANTIATE(sinh_layer, sinh_op) - INSTANTIATE(tanh_layer, tanh_op) - INSTANTIATE(acosh_layer, acosh_op) - INSTANTIATE(asinh_layer, asinh_op) - INSTANTIATE(atanh_layer, atanh_op) + } \ + UNARY_ETI_INST_MACRO_DEV(layer, El::Device::CPU) + +INSTANTIATE(logical_not_layer, logical_not_op); +INSTANTIATE(abs_layer, abs_op); +INSTANTIATE(negative_layer, negative_op); +INSTANTIATE(sign_layer, sign_op); +INSTANTIATE(round_layer, round_op); +INSTANTIATE(ceil_layer, ceil_op); +INSTANTIATE(floor_layer, floor_op); +INSTANTIATE(reciprocal_layer, reciprocal_op); +INSTANTIATE(square_layer, square_op); +INSTANTIATE(sqrt_layer, sqrt_op); +INSTANTIATE(rsqrt_layer, rsqrt_op); +INSTANTIATE(safe_reciprocal_layer, safe_reciprocal_op); +INSTANTIATE(exp_layer, exp_op); +INSTANTIATE(expm1_layer, expm1_op); +INSTANTIATE(log_layer, log_op); +INSTANTIATE(log1p_layer, log1p_op); +INSTANTIATE(cos_layer, cos_op); +INSTANTIATE(sin_layer, sin_op); +INSTANTIATE(tan_layer, tan_op); +INSTANTIATE(acos_layer, acos_op); +INSTANTIATE(asin_layer, asin_op); +INSTANTIATE(atan_layer, atan_op); +INSTANTIATE(cosh_layer, cosh_op); +INSTANTIATE(sinh_layer, sinh_op); +INSTANTIATE(tanh_layer, tanh_op); +INSTANTIATE(acosh_layer, acosh_op); +INSTANTIATE(asinh_layer, asinh_op); +INSTANTIATE(atanh_layer, atanh_op); } // namespace lbann diff --git a/src/layers/math/unary.cu b/src/layers/math/unary.cu index 143381522e7..58de90b98dd 100644 --- a/src/layers/math/unary.cu +++ b/src/layers/math/unary.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
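The INSTANTIATE call sites above now carry a trailing semicolon because the macro body ends with the UNARY_ETI_INST_MACRO_DEV hook rather than a bare closing brace. A tiny stand-alone illustration of why a definition-generating macro is usually shaped to accept that semicolon; the names here are invented for the demo:

#include <cstdio>

// Ending the macro with a static_assert swallows the trailing ';' so that
// every call site reads like an ordinary declaration.
#define DEFINE_GETTER(name, value) \
  int name() { return (value); }   \
  static_assert(true, "allow a trailing semicolon")

DEFINE_GETTER(answer, 42);
DEFINE_GETTER(zero, 0);

int main() { std::printf("%d %d\n", answer(), zero()); }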
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_UNARY_LAYER_INSTANTIATE #include "lbann/layers/math/unary.hpp" #include "lbann/utils/cuda.hpp" @@ -369,34 +370,36 @@ struct atanh_op { cuda::apply_entrywise_binary_operator(get_prev_activations(), \ get_prev_error_signals(), \ get_error_signals()); \ - } - INSTANTIATE(logical_not_layer, logical_not_op) - INSTANTIATE(abs_layer, abs_op) - INSTANTIATE(negative_layer, negative_op) - INSTANTIATE(sign_layer, sign_op) - INSTANTIATE(round_layer, round_op) - INSTANTIATE(ceil_layer, ceil_op) - INSTANTIATE(floor_layer, floor_op) - INSTANTIATE(reciprocal_layer, reciprocal_op) - INSTANTIATE(square_layer, square_op) - INSTANTIATE(sqrt_layer, sqrt_op) - INSTANTIATE(safe_reciprocal_layer, safe_reciprocal_op) - INSTANTIATE(rsqrt_layer, rsqrt_op) - INSTANTIATE(exp_layer, exp_op) - INSTANTIATE(expm1_layer, expm1_op) - INSTANTIATE(log_layer, log_op) - INSTANTIATE(log1p_layer, log1p_op) - INSTANTIATE(cos_layer, cos_op) - INSTANTIATE(sin_layer, sin_op) - INSTANTIATE(tan_layer, tan_op) - INSTANTIATE(acos_layer, acos_op) - INSTANTIATE(asin_layer, asin_op) - INSTANTIATE(atan_layer, atan_op) - INSTANTIATE(cosh_layer, cosh_op) - INSTANTIATE(sinh_layer, sinh_op) - INSTANTIATE(tanh_layer, tanh_op) - INSTANTIATE(acosh_layer, acosh_op) - INSTANTIATE(asinh_layer, asinh_op) - INSTANTIATE(atanh_layer, atanh_op) + } \ + UNARY_ETI_INST_MACRO_DEV(layer, El::Device::GPU) + +INSTANTIATE(logical_not_layer, logical_not_op); +INSTANTIATE(abs_layer, abs_op); +INSTANTIATE(negative_layer, negative_op); +INSTANTIATE(sign_layer, sign_op); +INSTANTIATE(round_layer, round_op); +INSTANTIATE(ceil_layer, ceil_op); +INSTANTIATE(floor_layer, floor_op); +INSTANTIATE(reciprocal_layer, reciprocal_op); +INSTANTIATE(square_layer, square_op); +INSTANTIATE(sqrt_layer, sqrt_op); +INSTANTIATE(safe_reciprocal_layer, safe_reciprocal_op); +INSTANTIATE(rsqrt_layer, rsqrt_op); +INSTANTIATE(exp_layer, exp_op); +INSTANTIATE(expm1_layer, expm1_op); +INSTANTIATE(log_layer, log_op); +INSTANTIATE(log1p_layer, log1p_op); +INSTANTIATE(cos_layer, cos_op); +INSTANTIATE(sin_layer, sin_op); +INSTANTIATE(tan_layer, tan_op); +INSTANTIATE(acos_layer, acos_op); +INSTANTIATE(asin_layer, asin_op); +INSTANTIATE(atan_layer, atan_op); +INSTANTIATE(cosh_layer, cosh_op); +INSTANTIATE(sinh_layer, sinh_op); +INSTANTIATE(tanh_layer, tanh_op); +INSTANTIATE(acosh_layer, acosh_op); +INSTANTIATE(asinh_layer, asinh_op); +INSTANTIATE(atanh_layer, atanh_op); } // namespace lbann diff --git a/src/layers/misc/CMakeLists.txt b/src/layers/misc/CMakeLists.txt index f0b66a776b7..4fade603860 100644 --- a/src/layers/misc/CMakeLists.txt +++ b/src/layers/misc/CMakeLists.txt @@ -1,11 +1,13 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES - covariance.cpp - variance.cpp - channelwise_mean.cpp argmax.cpp argmin.cpp + channelwise_mean.cpp + covariance.cpp + mini_batch_index.cpp + mini_batch_size.cpp one_hot.cpp + variance.cpp ) if (LBANN_HAS_CUDA) diff --git a/src/layers/misc/argmax.cpp b/src/layers/misc/argmax.cpp index c038075b5d0..4ee6e272f1c 100644 --- a/src/layers/misc/argmax.cpp +++ b/src/layers/misc/argmax.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_ARGMAX_LAYER_INSTANTIATE #include "lbann/layers/misc/argmax.hpp" #include @@ -46,4 +47,7 @@ void argmax_layer } } +template class argmax_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/misc/argmin.cpp b/src/layers/misc/argmin.cpp index c0e1d17435a..8543f8e00c4 100644 --- a/src/layers/misc/argmin.cpp +++ b/src/layers/misc/argmin.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_ARGMIN_LAYER_INSTANTIATE #include "lbann/layers/misc/argmin.hpp" #include @@ -46,4 +47,7 @@ void argmin_layer } } +template class argmin_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/misc/channelwise_mean.cpp b/src/layers/misc/channelwise_mean.cpp index fa9561ef6c6..983b0285f04 100644 --- a/src/layers/misc/channelwise_mean.cpp +++ b/src/layers/misc/channelwise_mean.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_CHANNELWISE_MEAN_LAYER_INSTANTIATE #include "lbann/layers/misc/channelwise_mean.hpp" namespace lbann { @@ -92,4 +93,7 @@ void channelwise_mean_layer } +template class channelwise_mean_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/misc/channelwise_mean.cu b/src/layers/misc/channelwise_mean.cu index e4aa15c3850..9a74f95d2c7 100644 --- a/src/layers/misc/channelwise_mean.cu +++ b/src/layers/misc/channelwise_mean.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_CHANNELWISE_MEAN_LAYER_INSTANTIATE #include "lbann/layers/misc/channelwise_mean.hpp" namespace lbann { @@ -180,4 +181,7 @@ void channelwise_mean_layer } +template class channelwise_mean_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/misc/covariance.cpp b/src/layers/misc/covariance.cpp index 5387a7f5605..f6bbbebec98 100644 --- a/src/layers/misc/covariance.cpp +++ b/src/layers/misc/covariance.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_COVARIANCE_LAYER_INSTANTIATE #include "lbann/layers/misc/covariance.hpp" namespace lbann { @@ -187,4 +188,9 @@ void covariance_layer m_biased); } +template class covariance_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class covariance_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/misc/covariance.cu b/src/layers/misc/covariance.cu index 4d1b544922c..b6d21da3ea5 100644 --- a/src/layers/misc/covariance.cu +++ b/src/layers/misc/covariance.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
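argmax and argmin above are pinned to a single (data-parallel, CPU) combination. Under explicit template instantiation, requesting any other combination surfaces as a missing symbol at link time instead of silently compiling a new specialization into the client. A self-contained sketch of that behavior, with illustrative names only:

template <bool DataParallel>
struct demo_argmax {
  int fp_compute();
};

template <bool DataParallel>
int demo_argmax<DataParallel>::fp_compute() { return DataParallel ? 1 : 0; }

extern template struct demo_argmax<false>;  // declared, defined nowhere
template struct demo_argmax<true>;          // the only combination shipped

int main() {
  demo_argmax<true> ok;
  // demo_argmax<false>{}.fp_compute();  // would compile, then fail to link
  return ok.fp_compute();
}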
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_COVARIANCE_LAYER_INSTANTIATE #include "lbann/layers/misc/covariance.hpp" namespace lbann { @@ -334,4 +335,9 @@ void covariance_layer m_biased); } +template class covariance_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class covariance_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/misc/mini_batch_index.cpp b/src/layers/misc/mini_batch_index.cpp new file mode 100644 index 00000000000..44135428807 --- /dev/null +++ b/src/layers/misc/mini_batch_index.cpp @@ -0,0 +1,43 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_MINI_BATCH_INDEX_LAYER_INSTANTIATE +#include "lbann/layers/misc/mini_batch_index.hpp" + +namespace lbann { + +template class mini_batch_index_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class mini_batch_index_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +template class mini_batch_index_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class mini_batch_index_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU + +}// namespace lbann diff --git a/src/layers/misc/mini_batch_size.cpp b/src/layers/misc/mini_batch_size.cpp new file mode 100644 index 00000000000..d447bf43668 --- /dev/null +++ b/src/layers/misc/mini_batch_size.cpp @@ -0,0 +1,43 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_MINI_BATCH_SIZE_LAYER_INSTANTIATE +#include "lbann/layers/misc/mini_batch_size.hpp" + +namespace lbann { + +template class mini_batch_size_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class mini_batch_size_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +template class mini_batch_size_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class mini_batch_size_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU + +}// namespace lbann diff --git a/src/layers/misc/one_hot.cpp b/src/layers/misc/one_hot.cpp index c3531211393..08ce4e2d36d 100644 --- a/src/layers/misc/one_hot.cpp +++ b/src/layers/misc/one_hot.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_ONE_HOT_LAYER_INSTANTIATE #include "lbann/layers/misc/one_hot.hpp" namespace lbann { @@ -51,4 +52,7 @@ void one_hot_layer } +template class one_hot_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/misc/one_hot.cu b/src/layers/misc/one_hot.cu index 171cba8ba39..7820612cd57 100644 --- a/src/layers/misc/one_hot.cu +++ b/src/layers/misc/one_hot.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_ONE_HOT_LAYER_INSTANTIATE #include "lbann/layers/misc/one_hot.hpp" namespace lbann { @@ -83,4 +84,7 @@ void one_hot_layer } +template class one_hot_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/misc/variance.cpp b/src/layers/misc/variance.cpp index 49e4f7e8f9e..46f31951965 100644 --- a/src/layers/misc/variance.cpp +++ b/src/layers/misc/variance.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_VARIANCE_LAYER_INSTANTIATE #include "lbann/layers/misc/variance.hpp" namespace lbann { @@ -167,4 +168,9 @@ void variance_layer m_biased); } +template class variance_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class variance_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/misc/variance.cu b/src/layers/misc/variance.cu index 02ace16d465..12693992ea3 100644 --- a/src/layers/misc/variance.cu +++ b/src/layers/misc/variance.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. 
//////////////////////////////////////////////////////////////////////////////// +#define LBANN_VARIANCE_LAYER_INSTANTIATE #include "lbann/layers/misc/variance.hpp" namespace lbann { @@ -242,4 +243,9 @@ void variance_layer m_biased); } +template class variance_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class variance_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/regularizers/CMakeLists.txt b/src/layers/regularizers/CMakeLists.txt index f82d39fd12a..198de11ce86 100644 --- a/src/layers/regularizers/CMakeLists.txt +++ b/src/layers/regularizers/CMakeLists.txt @@ -1,7 +1,10 @@ # Add the source files for this directory set_full_path(THIS_DIR_SOURCES batch_normalization.cpp + dropout.cpp entrywise_batch_normalization.cpp + local_response_normalization.cpp + selu_dropout.cpp ) if (LBANN_HAS_CUDA) diff --git a/src/layers/regularizers/batch_normalization.cpp b/src/layers/regularizers/batch_normalization.cpp index c93130f8442..e5bba2677b0 100644 --- a/src/layers/regularizers/batch_normalization.cpp +++ b/src/layers/regularizers/batch_normalization.cpp @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE #include "lbann/layers/regularizers/batch_normalization.hpp" #include "lbann/execution_contexts/sgd_execution_context.hpp" @@ -301,4 +302,7 @@ void batch_normalization_layer::bp_ } +template class batch_normalization_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; + } // namespace lbann diff --git a/src/layers/regularizers/batch_normalization.cu b/src/layers/regularizers/batch_normalization.cu index 664f7ad847c..8aed7440912 100644 --- a/src/layers/regularizers/batch_normalization.cu +++ b/src/layers/regularizers/batch_normalization.cu @@ -24,6 +24,7 @@ // permissions and limitations under the license. //////////////////////////////////////////////////////////////////////////////// +#define LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE #include "lbann/layers/regularizers/batch_normalization.hpp" #include "lbann/utils/cuda.hpp" #include "lbann/execution_contexts/sgd_execution_context.hpp" @@ -525,4 +526,7 @@ void batch_normalization_layer::bp_ } +template class batch_normalization_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; + } // namespace lbann diff --git a/src/layers/regularizers/dropout.cpp b/src/layers/regularizers/dropout.cpp new file mode 100644 index 00000000000..c1fa4dee7a8 --- /dev/null +++ b/src/layers/regularizers/dropout.cpp @@ -0,0 +1,39 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_DROPOUT_LAYER_INSTANTIATE
+#include "lbann/layers/regularizers/dropout.hpp"
+
+namespace lbann {
+
+template class dropout<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class dropout<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class dropout<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class dropout<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/regularizers/entrywise_batch_normalization.cpp b/src/layers/regularizers/entrywise_batch_normalization.cpp
index 4cdd51fbbf7..4e6f1bebef3 100644
--- a/src/layers/regularizers/entrywise_batch_normalization.cpp
+++ b/src/layers/regularizers/entrywise_batch_normalization.cpp
@@ -24,6 +24,7 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
 
+#define LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE
 #include "lbann/layers/regularizers/entrywise_batch_normalization.hpp"
 #include "lbann/execution_contexts/sgd_execution_context.hpp"
 
@@ -427,4 +428,9 @@ void entrywise_batch_normalization_layerget_values());
 }
 
+template class entrywise_batch_normalization_layer<
+  data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class entrywise_batch_normalization_layer<
+  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+
 } // namespace lbann
diff --git a/src/layers/regularizers/entrywise_batch_normalization.cu b/src/layers/regularizers/entrywise_batch_normalization.cu
index fde189373bc..a6dbaf90e1e 100644
--- a/src/layers/regularizers/entrywise_batch_normalization.cu
+++ b/src/layers/regularizers/entrywise_batch_normalization.cu
@@ -24,6 +24,7 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
 
+#define LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE
 #include "lbann/layers/regularizers/entrywise_batch_normalization.hpp"
 #include "lbann/utils/cuda.hpp"
 #include "lbann/execution_contexts/sgd_execution_context.hpp"
 
@@ -615,4 +616,9 @@ void entrywise_batch_normalization_layerget_values());
 }
 
+template class entrywise_batch_normalization_layer<
+  data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class entrywise_batch_normalization_layer<
+  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+
 } // namespace lbann
diff --git a/src/layers/regularizers/local_response_normalization.cpp b/src/layers/regularizers/local_response_normalization.cpp
new file mode 100644
index 00000000000..633f567e2e3
--- /dev/null
+++ b/src/layers/regularizers/local_response_normalization.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
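dropout above, and local_response_normalization and selu_dropout below, previously lived entirely in their headers; the new .cpp files exist purely to anchor the explicit instantiations. Stripped to its skeleton, each such translation unit reduces to a define, an include, and the instantiation list. A compilable sketch with placeholder names (the real files define the LBANN_*_LAYER_INSTANTIATE macro and include the real layer header instead of defining the template inline):

#include <cstdio>

// Stand-in for the templated layer normally pulled in from a header.
template <int LayoutDeviceTag>
class demo_dropout {
public:
  const char* describe() const { return "demo dropout"; }
};

// The whole point of the TU: pin down which specializations get compiled.
template class demo_dropout<0>;  // stands in for (DATA_PARALLEL, CPU)
template class demo_dropout<1>;  // stands in for (MODEL_PARALLEL, CPU)
#ifdef DEMO_HAS_GPU              // mirrors the LBANN_HAS_GPU guard
template class demo_dropout<2>;
template class demo_dropout<3>;
#endif

int main() { std::printf("%s\n", demo_dropout<0>{}.describe()); }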
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_LOCAL_RESPONSE_NORMALIZATION_LAYER_INSTANTIATE +#include "lbann/layers/regularizers/local_response_normalization.hpp" + +namespace lbann { + +template class local_response_normalization_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +template class local_response_normalization_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU + +}// namespace lbann diff --git a/src/layers/regularizers/selu_dropout.cpp b/src/layers/regularizers/selu_dropout.cpp new file mode 100644 index 00000000000..1419da522d0 --- /dev/null +++ b/src/layers/regularizers/selu_dropout.cpp @@ -0,0 +1,39 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_SELU_DROPOUT_LAYER_INSTANTIATE
+#include "lbann/layers/regularizers/selu_dropout.hpp"
+
+namespace lbann {
+
+template class selu_dropout<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class selu_dropout<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class selu_dropout<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class selu_dropout<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/CMakeLists.txt b/src/layers/transform/CMakeLists.txt
index 6040645f6cf..434010b57a0 100644
--- a/src/layers/transform/CMakeLists.txt
+++ b/src/layers/transform/CMakeLists.txt
@@ -1,10 +1,29 @@
 # Add the source files for this directory
 set_full_path(THIS_DIR_SOURCES
+  bernoulli.cpp
+  categorical_random.cpp
+  concatenation.cpp
+  constant.cpp
   crop.cpp
+  discrete_random.cpp
+  dummy.cpp
   evaluation.cpp
+  gaussian.cpp
+  hadamard.cpp
   in_top_k.cpp
+  pooling.cpp
+  reduction.cpp
+  reshape.cpp
+  slice.cpp
   sort.cpp
+  split.cpp
+  stop_gradient.cpp
+  sum.cpp
   tessellate.cpp
+  uniform.cpp
+  unpooling.cpp
+  weighted_sum.cpp
+  weights.cpp
   )
 
 if (LBANN_HAS_CUDA)
diff --git a/src/layers/transform/bernoulli.cpp b/src/layers/transform/bernoulli.cpp
new file mode 100644
index 00000000000..6e65e53b1b5
--- /dev/null
+++ b/src/layers/transform/bernoulli.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_BERNOULLI_LAYER_INSTANTIATE
+#include "lbann/layers/transform/bernoulli.hpp"
+
+namespace lbann {
+
+template class bernoulli_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class bernoulli_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class bernoulli_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class bernoulli_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/categorical_random.cpp b/src/layers/transform/categorical_random.cpp
new file mode 100644
index 00000000000..6065ec4013c
--- /dev/null
+++ b/src/layers/transform/categorical_random.cpp
@@ -0,0 +1,35 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit.
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_CATEGORICAL_RANDOM_LAYER_INSTANTIATE +#include "lbann/layers/transform/categorical_random.hpp" + +namespace lbann { + +template class categorical_random_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; + +}// namespace lbann diff --git a/src/layers/transform/concatenation.cpp b/src/layers/transform/concatenation.cpp new file mode 100644 index 00000000000..36901393a68 --- /dev/null +++ b/src/layers/transform/concatenation.cpp @@ -0,0 +1,43 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_CONCATENATION_LAYER_INSTANTIATE +#include "lbann/layers/transform/concatenation.hpp" + +namespace lbann { + +template class concatenation_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class concatenation_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +template class concatenation_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class concatenation_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU + +}// namespace lbann diff --git a/src/layers/transform/constant.cpp b/src/layers/transform/constant.cpp new file mode 100644 index 00000000000..a92b0caa6b9 --- /dev/null +++ b/src/layers/transform/constant.cpp @@ -0,0 +1,39 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
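concatenation above, and constant just below, enumerate all four (layout, device) specializations by hand, with the GPU pair fenced by LBANN_HAS_GPU. The repetition could in principle be folded into one helper macro per layer; the hypothetical sketch below is not part of the patch, and the enums are stand-ins for lbann::data_layout and El::Device so it compiles on its own:

enum class data_layout { DATA_PARALLEL, MODEL_PARALLEL };
enum class device { CPU, GPU };

template <data_layout L, device D>
class demo_layer {};

#ifdef DEMO_HAS_GPU
#define DEMO_INSTANTIATE_ALL(layer)                                \
  template class layer<data_layout::DATA_PARALLEL, device::CPU>;   \
  template class layer<data_layout::MODEL_PARALLEL, device::CPU>;  \
  template class layer<data_layout::DATA_PARALLEL, device::GPU>;   \
  template class layer<data_layout::MODEL_PARALLEL, device::GPU>
#else
#define DEMO_INSTANTIATE_ALL(layer)                                \
  template class layer<data_layout::DATA_PARALLEL, device::CPU>;   \
  template class layer<data_layout::MODEL_PARALLEL, device::CPU>
#endif

DEMO_INSTANTIATE_ALL(demo_layer);

int main() {}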
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_CONSTANT_LAYER_INSTANTIATE
+#include "lbann/layers/transform/constant.hpp"
+
+namespace lbann {
+
+template class constant_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class constant_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class constant_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class constant_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/crop.cpp b/src/layers/transform/crop.cpp
index b319cee2985..300ce410990 100644
--- a/src/layers/transform/crop.cpp
+++ b/src/layers/transform/crop.cpp
@@ -24,6 +24,7 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
 
+#define LBANN_CROP_LAYER_INSTANTIATE
 #include "lbann/layers/transform/crop.hpp"
 
 namespace lbann {
@@ -38,4 +39,6 @@ void crop_layer<data_layout::DATA_PARALLEL, El::Device::CPU>::bp_compute_3d() {
   bp_compute_nd();
 }
 
+template class crop_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+
 } // namespace lbann
diff --git a/src/layers/transform/crop.cu b/src/layers/transform/crop.cu
index 460b7e4404c..02ae37f15f3 100644
--- a/src/layers/transform/crop.cu
+++ b/src/layers/transform/crop.cu
@@ -24,6 +24,7 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
 
+#define LBANN_CROP_LAYER_INSTANTIATE
 #include "lbann/layers/transform/crop.hpp"
 #include "lbann/utils/cuda.hpp"
 
@@ -229,4 +230,6 @@ void crop_layer<data_layout::DATA_PARALLEL, El::Device::GPU>::bp_compute_3d() {
 
 }
 
+template class crop_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+
 } // namespace lbann
diff --git a/src/layers/transform/discrete_random.cpp b/src/layers/transform/discrete_random.cpp
new file mode 100644
index 00000000000..9eca436a09e
--- /dev/null
+++ b/src/layers/transform/discrete_random.cpp
@@ -0,0 +1,35 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_DISCRETE_RANDOM_LAYER_INSTANTIATE
+#include "lbann/layers/transform/discrete_random.hpp"
+
+namespace lbann {
+
+template class discrete_random_layer<
+  data_layout::DATA_PARALLEL, El::Device::CPU>;
+
+}// namespace lbann
diff --git a/src/layers/transform/dummy.cpp b/src/layers/transform/dummy.cpp
new file mode 100644
index 00000000000..0f2ab932d86
--- /dev/null
+++ b/src/layers/transform/dummy.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_DUMMY_LAYER_INSTANTIATE
+#include "lbann/layers/transform/dummy.hpp"
+
+namespace lbann {
+
+template class dummy_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class dummy_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class dummy_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class dummy_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/evaluation.cpp b/src/layers/transform/evaluation.cpp
index 9bbae41007d..3bd2d69d2f5 100644
--- a/src/layers/transform/evaluation.cpp
+++ b/src/layers/transform/evaluation.cpp
@@ -24,6 +24,7 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
 
+#define LBANN_EVALUATION_LAYER_INSTANTIATE
 #include "lbann/layers/transform/evaluation.hpp"
 #include "lbann/utils/exception.hpp"
 #ifdef LBANN_HAS_GPU
@@ -225,4 +226,11 @@ abstract_evaluation_layer::construct(lbann_comm *comm,
 }
 
+template class evaluation_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class evaluation_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class evaluation_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class evaluation_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
 } // namespace lbann
diff --git a/src/layers/transform/gaussian.cpp b/src/layers/transform/gaussian.cpp
new file mode 100644
index 00000000000..b9f5980ef09
--- /dev/null
+++ b/src/layers/transform/gaussian.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_GAUSSIAN_LAYER_INSTANTIATE
+#include "lbann/layers/transform/gaussian.hpp"
+
+namespace lbann {
+
+template class gaussian_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class gaussian_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class gaussian_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class gaussian_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/hadamard.cpp b/src/layers/transform/hadamard.cpp
new file mode 100644
index 00000000000..aee6d738239
--- /dev/null
+++ b/src/layers/transform/hadamard.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_HADAMARD_LAYER_INSTANTIATE
+#include "lbann/layers/transform/hadamard.hpp"
+
+namespace lbann {
+
+template class hadamard_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class hadamard_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class hadamard_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class hadamard_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/in_top_k.cpp b/src/layers/transform/in_top_k.cpp
index 0ce65b1e454..168ce18bac1 100644
--- a/src/layers/transform/in_top_k.cpp
+++ b/src/layers/transform/in_top_k.cpp
@@ -24,6 +24,7 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
 
+#define LBANN_IN_TOP_K_LAYER_INSTANTIATE
 #include "lbann/layers/transform/in_top_k.hpp"
 #include <algorithm>
 #include <limits>
 
@@ -151,4 +152,7 @@ void in_top_k_layer
   fp_cpu(*get_comm(), m_k, get_prev_activations(), get_activations());
 }
 
+template class in_top_k_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class in_top_k_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+
 } // namespace lbann
diff --git a/src/layers/transform/in_top_k.cu b/src/layers/transform/in_top_k.cu
index 1331f1c18ee..d9b6d5d3a6a 100644
--- a/src/layers/transform/in_top_k.cu
+++ b/src/layers/transform/in_top_k.cu
@@ -24,6 +24,7 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
 
+#define LBANN_IN_TOP_K_LAYER_INSTANTIATE
 #include "lbann/layers/transform/in_top_k.hpp"
 #include "lbann/utils/cuda.hpp"
 #include "lbann/utils/exception.hpp"
 
@@ -282,4 +283,7 @@ void in_top_k_layer
   fp_gpu(*get_comm(), m_k, get_prev_activations(), get_activations());
 }
 
+template class in_top_k_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class in_top_k_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+
 } // namespace lbann
diff --git a/src/layers/transform/pooling.cpp b/src/layers/transform/pooling.cpp
new file mode 100644
index 00000000000..3faaccc98b4
--- /dev/null
+++ b/src/layers/transform/pooling.cpp
@@ -0,0 +1,37 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_POOLING_LAYER_INSTANTIATE
+#include "lbann/layers/transform/pooling.hpp"
+
+namespace lbann {
+
+template class pooling_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class pooling_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/reduction.cpp b/src/layers/transform/reduction.cpp
new file mode 100644
index 00000000000..a03c47b00ec
--- /dev/null
+++ b/src/layers/transform/reduction.cpp
@@ -0,0 +1,37 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
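in_top_k above places its CPU instantiations in the .cpp and the GPU pair in the .cu, so each specialization is compiled by the toolchain that can actually build it; pooling follows the same split. A compressed single-file sketch of that division, where the device enum is a stand-in for El::Device and the layer is a placeholder:

enum class device { CPU, GPU };

template <device D>
struct demo_in_top_k {
  void fp_compute();
};

template <device D>
void demo_in_top_k<D>::fp_compute() {}

#ifdef __CUDACC__  // the .cu half would emit only the GPU specialization
template struct demo_in_top_k<device::GPU>;
#else              // the .cpp half emits only the CPU specialization
template struct demo_in_top_k<device::CPU>;
#endif

int main() { demo_in_top_k<device::CPU>{}.fp_compute(); }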
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_REDUCTION_LAYER_INSTANTIATE
+#include "lbann/layers/transform/reduction.hpp"
+
+namespace lbann {
+
+template class reduction_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class reduction_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/reshape.cpp b/src/layers/transform/reshape.cpp
new file mode 100644
index 00000000000..b1ddb6e00a2
--- /dev/null
+++ b/src/layers/transform/reshape.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_RESHAPE_LAYER_INSTANTIATE
+#include "lbann/layers/transform/reshape.hpp"
+
+namespace lbann {
+
+template class reshape_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class reshape_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class reshape_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class reshape_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/slice.cpp b/src/layers/transform/slice.cpp
new file mode 100644
index 00000000000..8deb7a9f452
--- /dev/null
+++ b/src/layers/transform/slice.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_SLICE_LAYER_INSTANTIATE
+#include "lbann/layers/transform/slice.hpp"
+
+namespace lbann {
+
+template class slice_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class slice_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class slice_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class slice_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/sort.cpp b/src/layers/transform/sort.cpp
index 72aa76835da..e2e9465e561 100644
--- a/src/layers/transform/sort.cpp
+++ b/src/layers/transform/sort.cpp
@@ -24,6 +24,7 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
 
+#define LBANN_SORT_LAYER_INSTANTIATE
 #include "lbann/layers/transform/sort.hpp"
 
 namespace lbann {
@@ -86,4 +87,6 @@ void sort_layer
 }
 
+template class sort_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+
 } // namespace lbann
diff --git a/src/layers/transform/sort.cu b/src/layers/transform/sort.cu
index d8aeb743438..31ca8f768fb 100644
--- a/src/layers/transform/sort.cu
+++ b/src/layers/transform/sort.cu
@@ -24,6 +24,7 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
 
+#define LBANN_SORT_LAYER_INSTANTIATE
 #include "lbann/layers/transform/sort.hpp"
 #include "lbann/utils/cuda.hpp"
 #include "lbann/utils/exception.hpp"
 
@@ -98,4 +99,6 @@ void sort_layer
 }
 
+template class sort_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+
 } // namespace lbann
diff --git a/src/layers/transform/split.cpp b/src/layers/transform/split.cpp
new file mode 100644
index 00000000000..956c76bc09c
--- /dev/null
+++ b/src/layers/transform/split.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_SPLIT_LAYER_INSTANTIATE
+#include "lbann/layers/transform/split.hpp"
+
+namespace lbann {
+
+template class split_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class split_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class split_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class split_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/stop_gradient.cpp b/src/layers/transform/stop_gradient.cpp
new file mode 100644
index 00000000000..861bb3d4bdd
--- /dev/null
+++ b/src/layers/transform/stop_gradient.cpp
@@ -0,0 +1,43 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_STOP_GRADIENT_LAYER_INSTANTIATE
+#include "lbann/layers/transform/stop_gradient.hpp"
+
+namespace lbann {
+
+template class stop_gradient_layer<
+  data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class stop_gradient_layer<
+  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class stop_gradient_layer<
+  data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class stop_gradient_layer<
+  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/sum.cpp b/src/layers/transform/sum.cpp
new file mode 100644
index 00000000000..332ac1ba7cc
--- /dev/null
+++ b/src/layers/transform/sum.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#define LBANN_SUM_LAYER_INSTANTIATE
+#include "lbann/layers/transform/sum.hpp"
+
+namespace lbann {
+
+template class sum_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class sum_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+#ifdef LBANN_HAS_GPU
+template class sum_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class sum_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+#endif // LBANN_HAS_GPU
+
+}// namespace lbann
diff --git a/src/layers/transform/tessellate.cpp b/src/layers/transform/tessellate.cpp
index 6c5c0dd09c9..12bd459b59c 100644
--- a/src/layers/transform/tessellate.cpp
+++ b/src/layers/transform/tessellate.cpp
@@ -24,6 +24,7 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
 
+#define LBANN_TESSELLATE_LAYER_INSTANTIATE
 #include "lbann/layers/transform/tessellate.hpp"
 
 namespace lbann {
@@ -138,4 +139,7 @@ void tessellate_layer
                    gradient_wrt_output, gradient_wrt_input);
 }
 
+template class tessellate_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
+template class tessellate_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+
 } // namespace lbann
diff --git a/src/layers/transform/tessellate.cu b/src/layers/transform/tessellate.cu
index 94f4bd7110b..539529072a6 100644
--- a/src/layers/transform/tessellate.cu
+++ b/src/layers/transform/tessellate.cu
@@ -24,6 +24,7 @@
 // permissions and limitations under the license.
 ////////////////////////////////////////////////////////////////////////////////
 
+#define LBANN_TESSELLATE_LAYER_INSTANTIATE
 #include "lbann/layers/transform/tessellate.hpp"
 
 namespace lbann {
@@ -197,4 +198,7 @@ void tessellate_layer
                    gradient_wrt_output, gradient_wrt_input);
 }
 
+template class tessellate_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
+template class tessellate_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+
 } // namespace lbann
diff --git a/src/layers/transform/uniform.cpp b/src/layers/transform/uniform.cpp
new file mode 100644
index 00000000000..4884dd6cfd1
--- /dev/null
+++ b/src/layers/transform/uniform.cpp
@@ -0,0 +1,39 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_UNIFORM_LAYER_INSTANTIATE +#include "lbann/layers/transform/uniform.hpp" + +namespace lbann { + +template class uniform_layer; +template class uniform_layer; +#ifdef LBANN_HAS_GPU +template class uniform_layer; +template class uniform_layer; +#endif // LBANN_HAS_GPU + +}// namespace lbann diff --git a/src/layers/transform/unpooling.cpp b/src/layers/transform/unpooling.cpp new file mode 100644 index 00000000000..f104b41603e --- /dev/null +++ b/src/layers/transform/unpooling.cpp @@ -0,0 +1,34 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_UNPOOLING_LAYER_INSTANTIATE +#include "lbann/layers/transform/unpooling.hpp" + +namespace lbann { + +template class unpooling_layer; + +}// namespace lbann diff --git a/src/layers/transform/weighted_sum.cpp b/src/layers/transform/weighted_sum.cpp new file mode 100644 index 00000000000..d70da2e11d6 --- /dev/null +++ b/src/layers/transform/weighted_sum.cpp @@ -0,0 +1,43 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_WEIGHTED_SUM_LAYER_INSTANTIATE +#include "lbann/layers/transform/weighted_sum.hpp" + +namespace lbann { + +template class weighted_sum_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class weighted_sum_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +template class weighted_sum_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class weighted_sum_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU + +}// namespace lbann diff --git a/src/layers/transform/weights.cpp b/src/layers/transform/weights.cpp new file mode 100644 index 00000000000..8eb26dccece --- /dev/null +++ b/src/layers/transform/weights.cpp @@ -0,0 +1,39 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_WEIGHTS_LAYER_INSTANTIATE +#include "lbann/layers/transform/weights.hpp" + +namespace lbann { + +template class weights_layer; +template class weights_layer; +#ifdef LBANN_HAS_GPU +template class weights_layer; +template class weights_layer; +#endif // LBANN_HAS_GPU + +}// namespace lbann diff --git a/src/proto/factories/layer_graph_factory.cpp b/src/proto/factories/layer_graph_factory.cpp index 6bc9f2091ec..8868c83332b 100644 --- a/src/proto/factories/layer_graph_factory.cpp +++ b/src/proto/factories/layer_graph_factory.cpp @@ -112,7 +112,7 @@ void setup_unpooling_pointers(lbann_comm* comm, unpool->set_pooling_layer(pool); } } -#ifdef LBANN_HAS_GPU +#if defined(LBANN_HAS_GPU) && defined(LBANN_UNPOOLING_LAYER_SUPPORTS_GPU) { unpooling_layer* unpool = dynamic_cast*>(layers[i]); From 3e5cc615a591b59bf5e47eaf8ba44955798f01ca Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom" Date: Wed, 25 Sep 2019 14:47:17 -0700 Subject: [PATCH 315/634] l working version of threaded preload --- .../lbann/data_readers/data_reader_image.hpp | 1 + .../lbann/data_store/data_store_conduit.hpp | 6 +- src/data_readers/data_reader.cpp | 15 ++-- src/data_readers/data_reader_image.cpp | 70 +++++++++++++++---- src/data_store/data_store_conduit.cpp | 66 ++++++++++++++++- 5 files changed, 138 insertions(+), 20 deletions(-) diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index 0ac7b7740e5..4780407efb0 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -115,6 +115,7 @@ class image_data_reader : public generic_data_reader { int m_num_labels; ///< number of labels void load_conduit_node_from_file(int data_id, conduit::Node &node); + bool load_conduit_nodes_from_file(const std::unordered_set &data_ids); }; diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index d8bdc563bbb..d12421d0782 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -75,7 +75,7 @@ class data_store_conduit { void set_shuffled_indices(const std::vector *indices); /// for use during development and debugging - int get_num_indices() { return m_shuffled_indices->size(); } + size_t get_num_indices() const; void setup(int mini_batch_size); @@ -150,6 +150,10 @@ class data_store_conduit { protected : + double m_exchange_time = 0; + double m_rebuild_time = 0; + double m_super_node_packaging_time = 0; + int m_cur_epoch = 0; bool m_is_setup = false; diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index 5c6e6dd2b7c..42be34a0828 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -686,6 +686,7 @@ double generic_data_reader::get_use_percent() const { } void generic_data_reader::instantiate_data_store(const std::vector& local_list_sizes) { + double tm1 = get_time(); options *opts = options::get(); if (! 
(opts->get_bool("use_data_store") || opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache"))) { if (m_data_store != nullptr) { @@ -718,18 +719,22 @@ void generic_data_reader::instantiate_data_store(const std::vector& local_l if(is_master()) { std::cout << "generic_data_reader::instantiate_data_store - Starting the preload" << std::endl; } + double tm2 = get_time(); if (local_list_sizes.size() != 0) { m_data_store->build_preloaded_owner_map(local_list_sizes); } preload_data_store(); if(is_master()) { - std::cout << "preload complete" << std::endl; - std::cout << "num loaded samples in P_0: " << m_data_store->get_data_size() << std::endl; + std::cout << "Preload complete; time: " << get_time() - tm2 << std::endl; } - } - if(is_master()) { - std::cout << "Setting up the data store is complete" << std::endl; + size_t n = m_data_store->get_num_indices(); + if (n != m_shuffled_indices.size()) { + LBANN_ERROR("num samples loaded: ", n, " != shuffled-indices.size(): ", m_shuffled_indices.size()); + } + } + if (is_master()) { + std::cout << "generic_data_reader::instantiate_data_store time: : " << (get_time() - tm1) << std::endl; } } diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index 6fd051eecdc..d948a9915c3 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -31,6 +31,8 @@ #include "lbann/utils/timer.hpp" #include "lbann/data_store/data_store_conduit.hpp" #include "lbann/utils/file_utils.hpp" +#include "lbann/utils/threads/thread_utils.hpp" +#include "lbann/utils/lbann_library.hpp" #include namespace lbann { @@ -204,24 +206,59 @@ void read_raw_data(const std::string &filename, std::vector &data) { in.close(); } + void image_data_reader::preload_data_store() { - double tm1 = get_time(); m_data_store->set_preload(); + options *opts = options::get(); - conduit::Node node; - if (is_master()) std::cerr << "Starting image_data_reader::preload_data_store; num indices: " << m_shuffled_indices.size() << std::endl; + if (is_master()) std::cout << "Starting image_data_reader::preload_data_store; num indices: " << m_shuffled_indices.size() << std::endl; int rank = m_comm->get_rank_in_trainer(); - for (size_t data_id=0; data_idget_index_owner(index) != rank) { - continue; + + bool threaded = ! 
options::get()->get_bool("data_store_no_thread"); + if (threaded) { + if (is_master()) { + std::cout << "mode: data_store_thread\n"; + } + std::shared_ptr io_thread_pool = construct_io_thread_pool(m_comm, opts); + int num_threads = static_cast(io_thread_pool->get_num_threads()); + + std::vector> data_ids(num_threads); + int j = 0; + for (size_t data_id=0; data_idget_index_owner(index) != rank) { + continue; + } + data_ids[j++].insert(index); + if (j == num_threads) { + j = 0; + } } - load_conduit_node_from_file(index, node); - m_data_store->set_preloaded_conduit_node(index, node); - } - if (is_master()) { - std::cout << "image_data_reader::preload_data_store time: " << (get_time() - tm1) << "\n"; + for (int t = 0; t < num_threads; t++) { + if(t == io_thread_pool->get_local_thread_id()) { + continue; + } else { + io_thread_pool->submit_job_to_work_group(std::bind(&image_data_reader::load_conduit_nodes_from_file, this, data_ids[t])); + } + } + load_conduit_nodes_from_file(data_ids[io_thread_pool->get_local_thread_id()]); + io_thread_pool->finish_work_group(); + } + + else { + conduit::Node node; + if (is_master()) { + std::cout << "mode: NOT data_store_thread\n"; + } + for (size_t data_id=0; data_idget_index_owner(index) != rank) { + continue; + } + load_conduit_node_from_file(index, node); + m_data_store->set_preloaded_conduit_node(index, node); + } } } @@ -239,6 +276,15 @@ std::vector image_data_reader::get_image_list_of_cu return ret; } +bool image_data_reader::load_conduit_nodes_from_file(const std::unordered_set &data_ids) { + conduit::Node node; + for (auto t : data_ids) { + load_conduit_node_from_file(t, node); + m_data_store->set_preloaded_conduit_node(t, node); + } + return true; +} + void image_data_reader::load_conduit_node_from_file(int data_id, conduit::Node &node) { node.reset(); const std::string filename = get_file_dir() + m_image_list[data_id].first; diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 58a7decf393..f0dbc683ee5 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -268,7 +268,7 @@ void data_store_conduit::exchange_data_by_super_node(size_t current_pos, size_t (*m_output) << "starting data_store_conduit::exchange_data_by_super_node; mb_size: " << mb_size << std::endl; } - if (m_cur_epoch == 0) { + if (m_send_buffer.size() == 0) { setup_data_store_buffers(); } @@ -280,6 +280,11 @@ void data_store_conduit::exchange_data_by_super_node(size_t current_pos, size_t // construct a super node for each processor; the super node // contains all samples this proc owns that other procs need + if (m_send_buffer.size() != (size_t)m_np_in_trainer) { + LBANN_ERROR("m_send_buffer.size() != m_np_in_trainer; m_send_buffer.size: ", m_send_buffer.size()); + } + + double tm3 = get_time(); for (int p=0; p other_sizes; for (int k=0; kbroadcast(k, my_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); } else { m_comm->broadcast(k, other_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); + +/* XX + if (m_world_master) std::cout << "SAMPLE SIZES for P_" << k << std::endl; + for (size_t h=0; hat_new_epoch()) { + if (m_world_master && m_cur_epoch > 0) { + std::cout << "time for exchange_mini_batch_data calls: " + << m_exchange_time << std::endl + << "time for constructing conduit Nodes: " << m_rebuild_time + << std::endl; + if (m_super_node) { + std::cout << "time for constructing super_nodes: " << m_super_node_packaging_time; + } + std::cout << std::endl; + m_exchange_time = 0.; + 
m_rebuild_time = 0.; + m_super_node_packaging_time = 0.; + } ++m_cur_epoch; } @@ -1372,6 +1427,7 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ } else { exchange_data_by_sample(current_pos, mb_size); } + m_exchange_time += (get_time() - tm1); } void data_store_conduit::flush_debug_file() { @@ -1382,4 +1438,10 @@ void data_store_conduit::flush_debug_file() { m_output->open(m_debug_filename.c_str(), std::ios::app); } +size_t data_store_conduit::get_num_indices() const { + size_t num = m_data.size(); + size_t n = m_comm->trainer_allreduce(num); + return n; +} + } // namespace lbann From 09ee160932c4a4747d378cfe147b1406ba74a396 Mon Sep 17 00:00:00 2001 From: Ryan Forsyth Date: Wed, 25 Sep 2019 15:07:43 -0700 Subject: [PATCH 316/634] Update lenet integration test to include a frequency string --- bamboo/integration_tests/test_integration_performance.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py index d16367693a3..b3c7c980070 100644 --- a/bamboo/integration_tests/test_integration_performance.py +++ b/bamboo/integration_tests/test_integration_performance.py @@ -112,8 +112,11 @@ def skeleton_performance_lenet_mnist(cluster, dir_name, executables, cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, should_log, compiler_name=compiler_name, weekly=weekly, data_reader_percent=data_reader_percent) + frequency_str = '_nightly' + if weekly: + frequency_str = '_weekly' run_tests(actual_performance, model_name, dir_name, should_log, - compiler_name, cluster) + compiler_name, cluster, frequency_str=frequency_str) def skeleton_performance_alexnet(cluster, dir_name, executables, compiler_name, @@ -134,7 +137,7 @@ def skeleton_performance_alexnet(cluster, dir_name, executables, compiler_name, if weekly: frequency_str = '_weekly' run_tests(actual_performance, model_name, dir_name, should_log, - compiler_name, cluster, frequency_str) + compiler_name, cluster, frequency_str=frequency_str) def skeleton_performance_full_alexnet(cluster, dir_name, executables, From 883045aa7e9f6917ec7d6afd71582f8734d50565 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Fri, 27 Sep 2019 05:15:09 -0700 Subject: [PATCH 317/634] Clean up build_lbann_lc.sh Fixes building without CUDA on GPU systems. 
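The cleanup below leans on Bash's ${VAR:-default} expansion: GPU-related options get defaults that match the detected system but stay overridable from the environment, while non-GPU builds switch the CUDA-dependent features off unconditionally. A condensed sketch of the idiom, with variable names borrowed from the diff (the snippet is illustrative, not part of the patch):

    #!/bin/bash
    # HAS_GPU stands in for the script's cluster-detection logic.
    if [ "${HAS_GPU:-0}" -eq 1 ]; then
      WITH_CUDA=${WITH_CUDA:-ON}   # default ON, but honor WITH_CUDA=OFF from the env
      WITH_CUB=${WITH_CUB:-ON}
    else
      WITH_CUDA=${WITH_CUDA:-OFF}
      WITH_CUB=OFF                 # features that require CUDA are forced off
      ALUMINUM_WITH_NCCL=OFF
      ALUMINUM_WITH_MPI_CUDA=OFF
    fi
    echo "WITH_CUDA=${WITH_CUDA} WITH_CUB=${WITH_CUB}"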
--- scripts/build_lbann_lc.sh | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/scripts/build_lbann_lc.sh b/scripts/build_lbann_lc.sh index 397c2361851..1251513c04e 100755 --- a/scripts/build_lbann_lc.sh +++ b/scripts/build_lbann_lc.sh @@ -43,8 +43,6 @@ if [ "${ARCH}" == "x86_64" ]; then fi -ELEMENTAL_MATH_LIBS= -PATCH_OPENBLAS=ON C_FLAGS= CXX_FLAGS=-DLBANN_SET_EL_RNG Fortran_FLAGS= @@ -564,8 +562,7 @@ if [ "${CLUSTER}" == "surface" -o "${CORAL}" -eq 1 -o "${CLUSTER}" == "pascal" ] HAS_GPU=1 WITH_CUDA=${WITH_CUDA:-ON} WITH_CUDNN=ON - WITH_CUB=ON - ELEMENTAL_USE_CUBLAS=OFF + WITH_CUB=${WITH_CUB:-ON} WITH_ALUMINUM=${WITH_ALUMINUM:-ON} ALUMINUM_WITH_NCCL=${ALUMINUM_WITH_NCCL:-ON} if [[ ${CORAL} -eq 1 ]]; then @@ -629,7 +626,9 @@ else HAS_GPU=0 WITH_CUDA=${WITH_CUDA:-OFF} WITH_CUDNN=OFF - ELEMENTAL_USE_CUBLAS=OFF + WITH_CUB=OFF + ALUMINUM_WITH_NCCL=OFF + ALUMINUM_WITH_MPI_CUDA=OFF fi ################################################################ @@ -708,9 +707,6 @@ if [ ${VERBOSE} -ne 0 ]; then print_variable WITH_CUDA print_variable WITH_CUDNN print_variable WITH_NVPROF - print_variable ELEMENTAL_USE_CUBLAS - print_variable ELEMENTAL_MATH_LIBS - print_variable PATCH_OPENBLAS print_variable DETERMINISTIC print_variable CLEAN_BUILD print_variable VERBOSE @@ -791,6 +787,7 @@ cmake \ -D LBANN_DATATYPE=${DATATYPE} \ -D LBANN_DETERMINISTIC=${DETERMINISTIC} \ -D LBANN_WITH_ALUMINUM=${WITH_ALUMINUM} \ +-D LBANN_SB_BUILD_CATCH2=ON \ -D LBANN_NO_OMP_FOR_DATA_READERS=${NO_OMP_FOR_DATA_READERS} \ -D LBANN_CONDUIT_DIR=${CONDUIT_DIR} \ -D LBANN_BUILT_WITH_SPECTRUM=${WITH_SPECTRUM} \ From 96fb47a696698e62dfac2fbd69c5fc7d24d4cc2a Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Sat, 28 Sep 2019 14:43:16 -0700 Subject: [PATCH 318/634] Modifications to make data_reader_jag_conduit::preload_data_store() more efficient. 
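The savings here comes from reading only the HDF5 sub-paths a JAG sample actually uses (inputs, outputs/scalars, and the requested EMI images) instead of pulling the entire sample node from disk. A minimal sketch of that partial-read pattern, mirroring the conduit calls in the diff below (the helper name and signature are illustrative, and the per-path error checks from the patch are omitted):

    #include <string>
    #include <vector>
    #include "conduit_relay_io_hdf5.hpp" // conduit::relay::io::hdf5_read

    // Read only the selected sub-paths of a sample rooted at `path` into `n`;
    // `h` is an open HDF5 handle and `image_keys` names the EMI images to load.
    void read_partial_sample(hid_t h, const std::string &path,
                             const std::vector<std::string> &image_keys,
                             conduit::Node &n) {
      conduit::Node work;
      conduit::relay::io::hdf5_read(h, path + "/inputs", work);
      n["inputs"] = work;
      conduit::relay::io::hdf5_read(h, path + "/outputs/scalars", work);
      n["outputs/scalars"] = work;
      for (const auto &key : image_keys) { // skip images the model never reads
        conduit::relay::io::hdf5_read(h, path + "/outputs/images/" + key, work);
        n["outputs/images/" + key] = work;
      }
    }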
--- .../data_readers/data_reader_jag_conduit.hpp | 1 + src/data_readers/data_reader.cpp | 2 +- src/data_readers/data_reader_jag_conduit.cpp | 84 +++++++++++++++---- src/data_store/data_store_conduit.cpp | 19 +---- 4 files changed, 75 insertions(+), 31 deletions(-) diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index 3caf527a4ad..7e6f52ea052 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -346,6 +346,7 @@ class data_reader_jag_conduit : public generic_data_reader { bool has_path(const file_handle_t& h, const std::string& path) const; void read_node(const file_handle_t& h, const std::string& path, conduit::Node& n) const; + void read_partial_node(const file_handle_t& h, const std::string& path, conduit::Node& n) const; /// Allow const access to the conduit data structure static const conduit::Node& get_conduit_node(const conduit::Node& n_base, const std::string key); diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index 42be34a0828..a914f192c6f 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -717,7 +717,7 @@ void generic_data_reader::instantiate_data_store(const std::vector& local_l // optionally preload the data store if (opts->get_bool("preload_data_store") && !opts->get_bool("data_store_cache")) { if(is_master()) { - std::cout << "generic_data_reader::instantiate_data_store - Starting the preload" << std::endl; + std::cerr << "generic_data_reader::instantiate_data_store - Starting the preload" << std::endl; } double tm2 = get_time(); if (local_list_sizes.size() != 0) { diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index 41e93072bf6..bf603c02331 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -355,11 +355,55 @@ bool data_reader_jag_conduit::load_conduit_node(const size_t i, const std::strin } } - read_node(h, path, node); + if (options::get()->get_bool("old_method") || ! options::get()->get_bool("preload_data_store")) { + read_node(h, path, node); + } else { + read_partial_node(h, path, node); + } return true; } +#ifdef _USE_IO_HANDLE_ +void data_reader_jag_conduit::read_partial_node(const data_reader_jag_conduit::file_handle_t& h, const std::string& path, conduit::Node& n) const { + LBANN_ERROR("Not implemented; please contact Dave Hysom"); +} +#else + +void data_reader_jag_conduit::read_partial_node(const data_reader_jag_conduit::file_handle_t& h, const std::string& path, conduit::Node& n) const { + conduit::Node work; + if (!has_path(h, path)) { + LBANN_ERROR("has_path failed for: ", path, ": num nodes successfully loaded by this rank: ", m_data_store->get_data_size()); + } + const std::string key = path + "/inputs"; + const std::string key2 = path + "/outputs/scalars"; + + if (! has_path(h, key)) { + LBANN_ERROR("has_path failed for: ", key, ": num nodes successfully loaded by this rank: ", m_data_store->get_data_size()); + } + conduit::relay::io::hdf5_read(h, key, work); + n["inputs"] = work; + //n[key2] = work; + + if (! 
has_path(h, key2)) { + LBANN_ERROR("has_path failed for: ", key2, ": num nodes successfully loaded by this rank: ", m_data_store->get_data_size()); + } + conduit::relay::io::hdf5_read(h, key2, work); + n["/outputs/scalars"] = work; + //n[key] = work; + + for (auto &&t : m_emi_image_keys) { + const std::string key3 = "/" + path + "/outputs/images/" + t; + if (! has_path(h, key3)) { + LBANN_ERROR("has_path failed for: ", key3, ": num nodes successfully loaded by this rank: ", m_data_store->get_data_size()); + } + conduit::relay::io::hdf5_read(h, key3, work); + //n[key3] = work; + n["/outputs/images/" + t] = work; + } +} +#endif + bool data_reader_jag_conduit::has_conduit_path(const size_t i, const std::string& key) const { const sample_t& s = m_sample_list[i]; sample_file_id_t id = s.first; @@ -778,19 +822,24 @@ void data_reader_jag_conduit::load() { m_shuffled_indices.clear(); if(is_master()) { - std::cout << "starting load" << std::endl; + std::cout << "data_reader_jag_conduit - starting load" << std::endl; } const std::string data_dir = add_delimiter(get_file_dir()); const std::string sample_list_file = data_dir + get_data_index_list(); options *opts = options::get(); + bool check_data = ! opts->get_bool("no_check_data"); /// The use of these flags need to be updated to properly separate /// how index lists are used between trainers and models /// @todo m_list_per_trainer || m_list_per_model load_list_of_samples(sample_list_file, m_comm->get_procs_per_trainer(), m_comm->get_rank_in_trainer()); if(is_master()) { - std::cout << "Finished sample list, check data" << std::endl; + if (check_data) { + std::cout << "Finished sample list, check data" << std::endl; + } else { + std::cout << "Finished sample list, skipping check data" << std::endl; + } } /// Check the data that each rank loaded @@ -803,14 +852,20 @@ void data_reader_jag_conduit::load() { if (m_scalar_keys.size() == 0u) { set_all_scalar_choices(); // use all by default if none is specified } - check_scalar_keys(); + if (check_data) { + check_scalar_keys(); + } if (m_input_keys.size() == 0u) { set_all_input_choices(); // use all by default if none is specified } - check_input_keys(); + if (check_data) { + check_input_keys(); + } - check_image_data(); + if (check_data) { + check_image_data(); + } m_sample_list.close_if_done_samples_file_handle(0); } @@ -856,7 +911,6 @@ void data_reader_jag_conduit::load() { } } } - instantiate_data_store(local_list_sizes); select_subset_of_data(); @@ -875,9 +929,7 @@ void data_reader_jag_conduit::preload_data_store() { double tm1 = get_time(); if (get_comm()->am_world_master() || (opts->get_bool("ltfb_verbose") && get_comm()->am_trainer_master())) { - std::stringstream msg; - msg << " for role: " << get_role() << " starting preload"; - LBANN_WARNING(msg.str()); + LBANN_WARNING("starting preload for role: ", get_role(), "; --old_method=", opts->get_bool("old_method")); } for (size_t idx=0; idx < m_shuffled_indices.size(); idx++) { @@ -892,7 +944,6 @@ void data_reader_jag_conduit::preload_data_store() { conduit::Node & node = m_data_store->get_empty_node(index); const std::string padded_idx = '/' + LBANN_DATA_ID_STR(index); node[padded_idx] = work; - m_data_store->set_preloaded_conduit_node(index, node); } catch (conduit::Error const& e) { LBANN_ERROR(" :: trying to load the node " + std::to_string(index) + " with key " + key + " and got " + e.what()); @@ -906,6 +957,8 @@ void data_reader_jag_conduit::preload_data_store() { } m_sample_list.close_if_done_samples_file_handle(index); } + + if 
(get_comm()->am_world_master() || (opts->get_bool("ltfb_verbose") && get_comm()->am_trainer_master())) { std::stringstream msg; @@ -1209,11 +1262,13 @@ data_reader_jag_conduit::get_image_data(const size_t sample_id, conduit::Node& s for (const auto& emi_tag : m_emi_image_keys) { const std::string conduit_field = m_output_image_prefix + emi_tag; - const std::string conduit_obj = '/' + LBANN_DATA_ID_STR(sample_id) + '/' + conduit_field; + const std::string conduit_obj = LBANN_DATA_ID_STR(sample_id) + conduit_field; if(sample[conduit_obj].schema().dtype().is_empty()) { if (data_store_active()) { - LBANN_ERROR("Unable to find field " + conduit_obj - + " in conduit node: " + std::to_string(sample_id)); + LBANN_ERROR("Unable to find field ", conduit_obj, + " in conduit node: ", std::to_string(sample_id), + ": num nodes successfully loaded by this rank: ", + m_data_store->get_data_size(), " num successful calls to get_image_data on this rank: "); } conduit::Node n_image; bool from_file = load_conduit_node(sample_id, conduit_field, n_image); @@ -1233,6 +1288,7 @@ data_reader_jag_conduit::get_image_data(const size_t sample_id, conduit::Node& s return image_ptrs; } + std::vector data_reader_jag_conduit::get_scalars(const size_t sample_id, conduit::Node& sample) const { std::vector scalars; scalars.reserve(m_scalar_keys.size()); diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index f0dbc683ee5..e1a0e754ce5 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -411,6 +411,9 @@ void data_store_conduit::set_preloaded_conduit_node(int data_id, conduit::Node & void data_store_conduit::error_check_compacted_node(const conduit::Node &nd, int data_id) { if (m_compacted_sample_size == 0) { m_compacted_sample_size = nd.total_bytes_compact(); + if (m_world_master) { + std::cout << "num bytes for nodes to be transmitted: " << nd.total_bytes_compact() << " per node" << std::endl; + } } else if (m_compacted_sample_size != nd.total_bytes_compact() && !m_node_sizes_vary) { LBANN_ERROR("Conduit node being added data_id: ", data_id, " is not the same size as existing nodes in the data_store ", @@ -534,7 +537,6 @@ const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { // code in the following method is a modification of code from // conduit/src/libs/relay/conduit_relay_mpi.cpp void data_store_conduit::build_node_for_sending(const conduit::Node &node_in, conduit::Node &node_out) { - node_out.reset(); conduit::Schema s_data_compact; if( node_in.is_compact() && node_in.is_contiguous()) { @@ -1035,21 +1037,6 @@ void data_store_conduit::exchange_sample_sizes() { } else { m_comm->broadcast(k, other_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); -/* XX - if (m_world_master) std::cout << "SAMPLE SIZES for P_" << k << std::endl; - for (size_t h=0; h Date: Mon, 30 Sep 2019 10:09:17 -0700 Subject: [PATCH 319/634] use cmake mechanism for correctly finding python libraries (#1275) --- cmake/modules/FindPython.cmake | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/cmake/modules/FindPython.cmake b/cmake/modules/FindPython.cmake index 62c7945174f..39d5430461e 100644 --- a/cmake/modules/FindPython.cmake +++ b/cmake/modules/FindPython.cmake @@ -64,13 +64,19 @@ execute_process( COMMAND "${Python_EXECUTABLE}" "-c" "import sys; from distutils.sysconfig import get_config_var; sys.stdout.write(get_config_var('LIBDIR'))" OUTPUT_VARIABLE _LIB_DIR) -if (BUILD_SHARED_LIBS) - 
set(_GLOB_EXPR "${_LIB_DIR}/libpython*${CMAKE_SHARED_LIBRARY_SUFFIX}") -ELSE (BUILD_SHARED_LIBS) - set(_GLOB_EXPR "${_LIB_DIR}/libpython*${CMAKE_STATIC_LIBRARY_SUFFIX}") -endif (BUILD_SHARED_LIBS) -FILE(GLOB _GLOB_RESULT "${_GLOB_EXPR}") -get_filename_component(Python_LIBRARIES "${_GLOB_RESULT}" ABSOLUTE) + +set(_PY_MAJ_MIN_VERSION "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") +find_library(Python_LIBRARY + NAMES python python${_PY_MAJ_MIN_VERSION}m python${_PY_MAJ_MIN_VERSION} + python${Python_VERSION_MAJOR}m python${Python_VERSION_MAJOR} + HINTS ${_LIB_DIR} + DOC "The python${Python_VERSION_MAJOR} library." + NO_DEFAULT_PATH) +if (NOT Python_LIBRARY) + message(FATAL_ERROR "Could not find Python library for version " + "${_PY_MAJ_MIN_VERSION} in directory: ${_LIB_DIR}") +endif () +set(Python_LIBRARIES "${Python_LIBRARY}") # Handle the find_package arguments include(FindPackageHandleStandardArgs) From 2f1c4d955cda12e0b09d67500e7c4f8dc4f69d1a Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 1 Oct 2019 16:59:46 -0700 Subject: [PATCH 320/634] Fix bug in GPU not equal layer --- src/layers/math/binary.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/math/binary.cu b/src/layers/math/binary.cu index 4ae488be43b..c96b915d9e6 100644 --- a/src/layers/math/binary.cu +++ b/src/layers/math/binary.cu @@ -320,7 +320,7 @@ struct equal_op { struct not_equal_op { inline __device__ DataType operator()(const DataType& x1, const DataType& x2) const { - return x1 == x2 ? DataType(1) : DataType(0); + return x1 == x2 ? DataType(0) : DataType(1); } inline __device__ void operator()(const DataType& x1, const DataType& x2, From 21007aa5c02b1088178c9ef3cc8643831477d48d Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Wed, 2 Oct 2019 09:28:41 -0700 Subject: [PATCH 321/634] Get Bamboo to turn green (#1270) Made a bunch of tweaks and modifications to the testing infrastructure to get the time on Nightly to around 30 min/cluster and fixed a few bugs so they all run green now. 
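One detail in allocate_and_run.sh worth noting: before requesting nodes, the script now asks the scheduler whether the batch partition is actually usable, and on Corona falls back to pgpu when pbatch is down. A condensed sketch of that guard, assuming mjstat -c reports per-partition node counts in the format the awk expression expects (the snippet is illustrative; the real script also varies node counts and time limits per cluster):

    #!/bin/bash
    # Fail fast instead of queueing on a drained partition.
    FREE=$(mjstat -c | awk 'match($1, "pbatch") && NF < 7 { print $5 }')
    if [[ "${FREE}" -ne "0" ]]; then
      PARTITION=pbatch
    else
      PARTITION=pgpu   # fallback partition; only some clusters provide it
    fi
    salloc -N2 --partition=${PARTITION} -t 90 ./run.sh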
--- bamboo/allocate_and_run.sh | 35 +++-- bamboo/common_python/tools.py | 62 +++++---- bamboo/compiler_tests/build_script.sh | 131 +++++++++++++++++- bamboo/compiler_tests/test_compiler.py | 6 +- bamboo/run.sh | 7 +- .../test_unit_check_proto_models.py | 2 + bamboo/unit_tests/test_unit_checkpoint.py | 7 +- .../unit_tests/test_unit_datareader_python.py | 3 +- bamboo/unit_tests/test_unit_lbann2_reload.py | 54 ++++++-- .../test_unit_mnist_ridge_regression.py | 3 + .../test_unit_mnist_softmax_classifier.py | 5 + .../test_unit_reconstruction_loss.py | 7 +- include/lbann/callbacks/checkpoint.hpp | 2 +- .../layers/io/input/generic_input_layer.hpp | 4 +- model_zoo/lbann2.cpp | 17 ++- python/lbann/contrib/lc/launcher.py | 7 +- python/lbann/contrib/lc/systems.py | 2 + src/proto/proto_common.cpp | 10 +- src/utils/lbann_library.cpp | 12 ++ 19 files changed, 302 insertions(+), 74 deletions(-) diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index 3876955f9e7..26f74f45c19 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -2,8 +2,7 @@ CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g') -echo "allocate_and_run.sh CLUSTER=" -echo $CLUSTER +echo "allocate_and_run.sh CLUSTER=${CLUSTER}" export PYTHONPATH=${HOME}/.local/lib/python3.7/site-packages:${PYTHONPATH} @@ -27,8 +26,7 @@ while :; do shift done -echo "allocate_and_run.sh WEEKLY=" -echo $WEEKLY +echo "allocate_and_run.sh WEEKLY=${WEEKLY}" if [ "${CLUSTER}" = 'pascal' ]; then export MV2_USE_CUDA=1 @@ -37,14 +35,21 @@ fi if [ "${CLUSTER}" = 'lassen' ]; then ALLOCATION_TIME_LIMIT=600 if [ ${WEEKLY} -ne 0 ]; then - timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 16 -W $ALLOCATION_TIME_LIMIT ./run.sh --weekly + timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 16 -W ${ALLOCATION_TIME_LIMIT} ./run.sh --weekly else - timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 16 -W $ALLOCATION_TIME_LIMIT ./run.sh + timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 2 -W ${ALLOCATION_TIME_LIMIT} ./run.sh + fi +elif [ "${CLUSTER}" = 'ray' ]; then + if [ ${WEEKLY} -ne 0 ]; then + echo "No ray testing in weekly." + else + ALLOCATION_TIME_LIMIT=240 + timeout -k 5 24h bsub -Is -q pbatch -nnodes 2 -W ${ALLOCATION_TIME_LIMIT} ./run.sh fi elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then ALLOCATION_TIME_LIMIT=960 if [ ${WEEKLY} -ne 0 ]; then - timeout -k 5 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh --weekly + timeout -k 5 24h salloc -N16 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT} ./run.sh --weekly if [ "${CLUSTER}" = 'catalyst' ]; then cd integration_tests python -m pytest -s test_integration_performance.py -k test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=../full_alexnet_clang6/results.xml @@ -53,9 +58,19 @@ elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTE cd .. fi else - timeout -k 5 24h salloc -N16 --partition=pbatch -t $ALLOCATION_TIME_LIMIT ./run.sh + ALLOCATION_TIME_LIMIT=90 # Start with 1.5 hrs; may adjust for CPU clusters + if [[ $(mjstat -c | awk 'match($1, "pbatch") && NF < 7 { print $5 }') -ne "0" ]]; + then + timeout -k 5 24h salloc -N2 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT} ./run.sh + else + echo "Partition \"pbatch\" on cluster \"${CLUSTER}\" appears to be down." + if [[ "${CLUSTER}" =~ ^corona$ ]]; + then + echo "Trying \"pgpu\"." 
+ timeout -k 5 24h salloc -N2 --partition=pgpu -t ${ALLOCATION_TIME_LIMIT} ./run.sh + fi + fi fi else - echo "allocate_and_run.sh. Unsupported cluster CLUSTER=" - echo $CLUSTER + echo "allocate_and_run.sh. Unsupported cluster CLUSTER=${CLUSTER}" fi diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 98b2450d2e7..feaee4a359b 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -19,6 +19,7 @@ def get_command(cluster, time_limit=None, # LBANN Parameters ckpt_dir=None, + disable_cuda=None, dir_name=None, data_filedir_default=None, data_filedir_train_default=None, @@ -38,6 +39,7 @@ def get_command(cluster, optimizer_name=None, optimizer_path=None, processes_per_model=None, + restart_dir=None, extra_lbann_flags=None, # Error/Output Redirect error_file_name=None, @@ -60,7 +62,7 @@ def get_command(cluster, data_filename_test_default, data_reader_name, data_reader_path, data_reader_percent, exit_after_setup, metadata, mini_batch_size, model_folder, model_name, model_path, num_epochs, optimizer_name, - optimizer_path, processes_per_model, + optimizer_path, processes_per_model, restart_dir, # Error/Output Redirect error_file_name, output_file_name, # Misc. Parameters @@ -214,7 +216,7 @@ def get_command(cluster, # Cannot specify time limit for jsrun. command_run = '{s}jsrun'.format(s=space) else: - command_run = '{s}mpirun --timeout={t}'.format(s=space, t=time_limit) + command_run = '{s}mpirun --timeout {t}'.format(s=space, t=time_limit*60) option_bind = '' option_cpu_per_resource = '' option_gpu_per_resource = '' @@ -253,6 +255,7 @@ def get_command(cluster, # Create LBANN command option_ckpt_dir = '' + option_disable_cuda = '' option_data_filedir = '' option_data_filedir_train = '' option_data_filename_train = '' @@ -267,6 +270,7 @@ def get_command(cluster, option_num_epochs = '' option_optimizer = '' option_processes_per_model = '' + option_restart_dir = '' if model_path is not None: # If model_folder and/or model_name are set, an exception will be # raised later. @@ -394,23 +398,26 @@ def get_command(cluster, '_test_default] is set, but neither data_reader_name or' ' data_reader_path are.')) # else: no conflicts - if data_reader_percent is not None: - # If data_reader_percent is not None, then it will override `weekly`. - # If it is None however, we choose its value based on `weekly`. - try: - data_reader_percent = float(data_reader_percent) + if data_reader_percent != "prototext": + if data_reader_percent is not None: - except ValueError: - lbann_errors.append( - 'data_reader_percent={d} is not a float.'.format( - d=data_reader_percent)) - elif weekly: - data_reader_percent = 1.00 - else: - # Nightly - data_reader_percent = 0.10 - option_data_reader_percent = ' --data_reader_percent={d}'.format( - d=data_reader_percent) + # If data_reader_percent is not None, then it will override `weekly`. + # If it is None however, we choose its value based on `weekly`. 
+ try: + data_reader_percent = float(data_reader_percent) + + except ValueError: + lbann_errors.append( + 'data_reader_percent={d} is not a float.'.format( + d=data_reader_percent)) + elif weekly: + data_reader_percent = 1.00 + else: + # Nightly + data_reader_percent = 0.10 + option_data_reader_percent = ' --data_reader_percent={d}'.format( + d=data_reader_percent) + # else: use the data reader's value if exit_after_setup: option_exit_after_setup = ' --exit_after_setup' if metadata is not None: @@ -423,6 +430,10 @@ def get_command(cluster, option_processes_per_model = ' --procs_per_model=%d' % processes_per_model if ckpt_dir is not None: option_ckpt_dir = ' --ckpt_dir=%s' % ckpt_dir + if restart_dir is not None: + option_restart_dir = ' --restart_dir=%s' % restart_dir + if disable_cuda is not None: + option_disable_cuda = ' --disable_cuda=%d' % int(bool(disable_cuda)) extra_options = '' if extra_lbann_flags is not None: # If extra_lbann_flags is not a dict, then we have already appended @@ -447,7 +458,7 @@ def get_command(cluster, 'num_io_threads', 'serialize_io', 'disable_background_io_activity', - 'disable_cuda', + #'disable_cuda', 'random_seed', 'objective_function', 'data_layout', @@ -458,7 +469,9 @@ def get_command(cluster, 'write_sample_list', 'ltfb_verbose', 'ckpt_dir', - + #'restart_dir', + 'restart_dir_is_fullpath', + # DataReaders: # 'data_filedir', # 'data_filedir_train', @@ -494,14 +507,15 @@ def get_command(cluster, if lbann_errors != []: print('lbann_errors={lbann_errors}.'.format(lbann_errors=lbann_errors)) raise Exception('Invalid Usage: ' + ' , '.join(lbann_errors)) - command_lbann = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % ( - executable, option_ckpt_dir, option_data_filedir, + command_lbann = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % ( + executable, option_ckpt_dir, option_disable_cuda, + option_data_filedir, option_data_filedir_train, option_data_filename_train, option_data_filedir_test, option_data_filename_test, option_data_reader, option_data_reader_percent, option_exit_after_setup, option_metadata, option_mini_batch_size, option_model, option_num_epochs, option_optimizer, - option_processes_per_model, extra_options) + option_processes_per_model, option_restart_dir, extra_options) # Create redirect command command_output = '' @@ -580,7 +594,7 @@ def get_default_exes(default_dirname, cluster): default_exes = {} default_exes['default'] = '%s/build/gnu.Release.%s.llnl.gov/install/bin/lbann' % (default_dirname, cluster) - if cluster in ['catalyst', 'corona', 'lassen', 'pascal']: + if cluster in ['catalyst', 'corona', 'lassen', 'pascal', 'ray']: # Define all compilers. # x86_cpu - catalyst # x86_gpu_pascal - pascal diff --git a/bamboo/compiler_tests/build_script.sh b/bamboo/compiler_tests/build_script.sh index 1ccf4efd12d..6ef43375d60 100755 --- a/bamboo/compiler_tests/build_script.sh +++ b/bamboo/compiler_tests/build_script.sh @@ -1,4 +1,133 @@ +#!/bin/bash + source /usr/share/lmod/lmod/init/bash source /etc/profile.d/00-modulepath.sh + LBANN_DIR=$(git rev-parse --show-toplevel) -${LBANN_DIR}/scripts/build_lbann_lc.sh --with-conduit +CLUSTER=$(hostname | sed 's/[0-9]*//g') +USER=$(whoami) +WORKSPACE_DIR=$(ls --color=no -d /usr/workspace/ws*/${USER}) +DEPENDENCY_DIR_BASE=${WORKSPACE_DIR}/stable_dependencies/${CLUSTER} + +# For this script, we only care about GCC. +LATEST_GCC=$(ls -1 ${DEPENDENCY_DIR_BASE} | grep gcc | tail -n1) +COMPILER_DIR=${DEPENDENCY_DIR_BASE}/${LATEST_GCC} + +# For now, there's only one MPI library. 
The pipe to tail ensures that +# we just pick one thing, just in case. +MPI_LIBRARY=$(ls -1 --color=no ${COMPILER_DIR} | tail -n1) +MPI_DIR=${COMPILER_DIR}/${MPI_LIBRARY} + +# All the dependencies are installed at the MPI level (even though +# most are MPI-independent). +DEPENDENCY_DIR=${MPI_DIR} + +if [ -e ${DEPENDENCY_DIR} ]; +then + SAVELIST_NAME=$(echo ${CLUSTER}_${LATEST_GCC}_${MPI_LIBRARY} | sed -e 's/\./x/g') + + if ml -t savelist |& grep ${SAVELIST_NAME} > /dev/null 2>&1 + then + ml restore ${SAVELIST_NAME} + else + # Compilers are easy... + COMPILER_MODULE=$(echo ${LATEST_GCC} | sed -e 's|-|/|g') + + if [[ ${MPI_LIBRARY} =~ ^spectrum-mpi-.*$ ]] + then + MPI_MODULE=$(echo ${MPI_LIBRARY} | sed -e 's|spectrum-mpi-|spectrum-mpi/|g') + else + MPI_MODULE=$(echo ${MPI_LIBRARY} | sed -e 's|-|/|g') + fi + + # Use the latest CUDA 10, since it's compatible with other + # CUDA 10.* libraries + CUDA_MODULE=$(ml --terse avail cuda |& sed -n '/\/10\./p' | tail -n1) + + # Load up the appropriate modules + module load ${COMPILER_MODULE} ${MPI_MODULE} ${CUDA_MODULE} cmake/3.14.5 + ml save ${SAVELIST_NAME} + fi + + BRAIN_DIR=/usr/workspace/wsb/brain + + # CUDA-y things (Use the newest) + ARCH=$(uname -i) + export NCCL_DIR=$(ls -d --color=no ${BRAIN_DIR}/nccl2/*cuda10*${ARCH} | tail -n1) + export CUDNN_DIR=$(find ${BRAIN_DIR}/cudnn -maxdepth 2 -type d | grep "cuda-10.*_${ARCH}" | tail -n1) + + # Unit testing framework + export CATCH2_DIR=${WORKSPACE_DIR}/stable_dependencies/catch2 + + # Add Ninja support + export PATH=${DEPENDENCY_DIR_BASE}/ninja/bin:${PATH} + + # Setup paths to match the build_lbann_lc.sh script (ugh) + BUILD_DIR_BASE=${LBANN_DIR}/build/gnu.Release.${CLUSTER}.llnl.gov + BUILD_DIR=${BUILD_DIR_BASE}/lbann/build + INSTALL_DIR=${BUILD_DIR_BASE}/install + + # Setup a path for Catch2 to use + CATCH2_OUTPUT_DIR=${LBANN_DIR}/bamboo/compiler_tests + rm -f ${CATCH2_OUTPUT_DIR}/*.xml + + # Decide if CUDA should be used. + if [[ "${CLUSTER}" =~ ^(pascal|lassen|ray)$ ]]; + then + USE_CUDA=ON + else + USE_CUDA=OFF + fi + + # Cleanup + [[ -e ${BUILD_DIR_BASE} ]] && rm -rf ${BUILD_DIR_BASE} + mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR} + + # Hack to be nice to others. 
+ if [[ "${CLUSTER}" =~ ^(lassen|ray)$ ]]; + then + LAUNCH_CMD="lrun -1" + else + unset LAUNCH_CMD + fi + + cmake \ + -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} \ + \ + -DCMAKE_CXX_COMPILER=$(which g++) \ + -DCMAKE_CXX_FLAGS="-DLBANN_SET_EL_RNG -g" \ + -DCMAKE_CUDA_COMPILER=$(which nvcc) \ + -DCMAKE_CUDA_HOST_COMPILER=$(which g++) \ + \ + -DCMAKE_CXX_STANDARD=14 \ + -DCMAKE_CUDA_STANDARD=14 \ + \ + -DLBANN_DATATYPE=float \ + -DLBANN_DETERMINISTIC=OFF \ + -DLBANN_WARNINGS_AS_ERRORS=ON \ + -DLBANN_WITH_ALUMINUM=ON \ + -DLBANN_WITH_CONDUIT=ON \ + -DLBANN_WITH_CUDA=ON \ + -DLBANN_WITH_NVPROF=OFF \ + -DLBANN_WITH_TBINF=ON \ + -DLBANN_WITH_UNIT_TESTING=ON \ + -DLBANN_WITH_VTUNE=OFF \ + \ + -DAluminum_DIR=${DEPENDENCY_DIR}/lib/cmake/Aluminum \ + -DCEREAL_DIR=${DEPENDENCY_DIR} \ + -DCNPY_DIR=${DEPENDENCY_DIR} \ + -DCATCH2_DIR=${WORKSPACE_DIR}/stable_dependencies/catch2 \ + -DHDF5_DIR=${DEPENDENCY_DIR} \ + -DCONDUIT_DIR=${DEPENDENCY_DIR} \ + -DCUB_DIR=${DEPENDENCY_DIR} \ + -DHydrogen_DIR=${DEPENDENCY_DIR} \ + -DOpenCV_DIR=${DEPENDENCY_DIR} \ + -DPROTOBUF_DIR=${DEPENDENCY_DIR} \ + -Dprotobuf_MODULE_COMPATIBLE=ON \ + \ + ${LBANN_DIR} && ${LAUNCH_CMD} ninja && ${LAUNCH_CMD} ninja install && ${LAUNCH_CMD} ./unit_test/seq-catch-tests -r junit -o ${CATCH2_OUTPUT_DIR}/seq_catch_tests_output-${CLUSTER}.xml +else + ${LBANN_DIR}/scripts/build_lbann_lc.sh --with-conduit +fi diff --git a/bamboo/compiler_tests/test_compiler.py b/bamboo/compiler_tests/test_compiler.py index eddcd801ba4..212dcf7f8cc 100644 --- a/bamboo/compiler_tests/test_compiler.py +++ b/bamboo/compiler_tests/test_compiler.py @@ -6,7 +6,7 @@ def test_compiler_build_script(cluster, dirname): - if cluster not in ['corona', 'lassen', 'pascal']: + if cluster not in ['catalyst', 'corona', 'lassen', 'pascal', 'ray']: e = 'test_compiler_build_script: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) @@ -67,7 +67,7 @@ def test_compiler_intel19_debug(cluster, dirname): def skeleton_clang6(cluster, dir_name, debug): - if cluster not in ['catalyst']: + if cluster not in []: e = 'skeleton_clang6: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) @@ -80,7 +80,7 @@ def skeleton_clang6(cluster, dir_name, debug): def skeleton_gcc7(cluster, dir_name, debug): - if cluster not in ['catalyst', 'pascal']: + if cluster not in []: e = 'skeleton_gcc7: Unsupported Cluster %s' % cluster print('Skip - ' + e) pytest.skip(e) diff --git a/bamboo/run.sh b/bamboo/run.sh index c90d256bb5c..da1e96968a9 100755 --- a/bamboo/run.sh +++ b/bamboo/run.sh @@ -39,14 +39,13 @@ module load cmake/3.9.2 $PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml cd .. -echo "Task: Integration Tests" +echo "Task: Integration Tests (Weekly only)" cd integration_tests if [ ${WEEKLY} -ne 0 ]; then $PYTHON -m pytest -s -vv --durations=0 --weekly --junitxml=results.xml -else - $PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml +# else +# $PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml fi - cd .. echo "Task: Unit Tests" diff --git a/bamboo/unit_tests/test_unit_check_proto_models.py b/bamboo/unit_tests/test_unit_check_proto_models.py index b49c5b6a0de..5e6cb5f294b 100644 --- a/bamboo/unit_tests/test_unit_check_proto_models.py +++ b/bamboo/unit_tests/test_unit_check_proto_models.py @@ -66,6 +66,8 @@ def skeleton_models(cluster, dir_name, executables, compiler_name, time_limit = 3 if 'resnet50' in file_name: node_count = 8 + if not weekly: + continue # This is too many nodes for nightly. 
elif 'cifar' in file_name: data_filename_train_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin' data_filename_test_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin' diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index e6516cfcec9..adf8f2fab67 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -12,7 +12,9 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, print('Skip - ' + e) pytest.skip(e) exe = executables[compiler_name] - + # Handle data + if data_reader_percent is None: + data_reader_percent = 0.01 # No checkpointing, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) @@ -74,6 +76,9 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, print('Skip - ' + e) pytest.skip(e) exe = executables[compiler_name] + # Handle data + if data_reader_percent is None: + data_reader_percent = 0.01 # No checkpointing, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) diff --git a/bamboo/unit_tests/test_unit_datareader_python.py b/bamboo/unit_tests/test_unit_datareader_python.py index be83d06fe8a..2e3f1dd9f02 100644 --- a/bamboo/unit_tests/test_unit_datareader_python.py +++ b/bamboo/unit_tests/test_unit_datareader_python.py @@ -175,7 +175,8 @@ def _test(cluster, executables, dir_name, compiler_name): kwargs = { 'account': 'guests', 'nodes': 1, - 'partition': 'pbatch' + 'partition': 'pbatch', + 'overwrite_script': True } experiment_dir = '{d}/bamboo/unit_tests/experiments/{t}_{c}'.format( d=dir_name, t=_test_name, c=compiler_name) diff --git a/bamboo/unit_tests/test_unit_lbann2_reload.py b/bamboo/unit_tests/test_unit_lbann2_reload.py index 2bc0bef69dc..7de45a04613 100644 --- a/bamboo/unit_tests/test_unit_lbann2_reload.py +++ b/bamboo/unit_tests/test_unit_lbann2_reload.py @@ -11,13 +11,20 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name, e = 'skeleton_lbann2_reload: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) pytest.skip(e) - lbann2 = executables[compiler_name] + '2' + lbann = executables[compiler_name] + lbann2 = lbann + '2' + ckpt_base_dir = 'ckpt_lbann2_reload' + os.system('rm -rf ' + ckpt_base_dir) + + if data_reader_percent is None: + data_reader_percent=0.005 + # No checkpointing, printing weights to files. 
- model_path = '{../../model_zoo/models/lenet_mnist/model_lenet_mnist.prototext,../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext}' + model_path = '{../../model_zoo/tests/model_lenet_mnist_ckpt.prototext,../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext}' output_file_name = '%s/bamboo/unit_tests/output/lbann2_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/lbann2_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) - no_ckpt_dir = 'ckpt_lbann2_reload/lbann2_no_ckpt_{c}'.format(c=compiler_name) + no_ckpt_dir = os.path.join(ckpt_base_dir, 'lbann2_no_ckpt_{c}'.format(c=compiler_name)) command = tools.get_command( cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2, data_reader_name='mnist', @@ -27,6 +34,7 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name, ckpt_dir=no_ckpt_dir, model_path=model_path, optimizer_name='sgd', + disable_cuda=1, num_epochs=2, output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) @@ -37,16 +45,25 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name, # Run to checkpoint, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/lbann2_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/lbann2_checkpoint_%s_error.txt' % (dir_name, compiler_name) - ckpt_dir = 'ckpt_lbann2_reload/lbann2_ckpt_{c}'.format(c=compiler_name) + ckpt_dir = os.path.join(ckpt_base_dir,'lbann2_ckpt_{c}'.format(c=compiler_name)) command = tools.get_command( - cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2, - dir_name=dir_name, + cluster=cluster, + executable=lbann2, + num_nodes=1, + num_processes=2, + ckpt_dir=ckpt_dir, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', data_reader_percent=data_reader_percent, - ckpt_dir=ckpt_dir, model_folder='tests', - model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd', + data_reader_name='mnist', + data_reader_percent=data_reader_percent, + dir_name=dir_name, + disable_cuda=1, + model_folder='tests', + model_name='lenet_mnist_ckpt', + num_epochs=2, + optimizer_name='sgd', output_file_name=output_file_name, - error_file_name=error_file_name, weekly=weekly) + error_file_name=error_file_name, + weekly=weekly) return_code_ckpt_1 = os.system(command) tools.assert_success(return_code_ckpt_1, error_file_name) @@ -54,16 +71,23 @@ def skeleton_lbann2_reload(cluster, executables, dir_name, compiler_name, output_file_name = '%s/bamboo/unit_tests/output/lbann2_restart_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/lbann2_restart_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( - cluster=cluster, executable=lbann2, num_nodes=1, num_processes=2, - dir_name=dir_name, + cluster=cluster, + executable=lbann2, + num_nodes=1, + num_processes=2, + ckpt_dir=ckpt_dir, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', data_reader_name='mnist', data_reader_percent=data_reader_percent, - ckpt_dir=ckpt_dir, + dir_name=dir_name, + disable_cuda=1, model_path='../../model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext', - num_epochs=2, optimizer_name='sgd', + num_epochs=2, + optimizer_name='sgd', + restart_dir=ckpt_dir, output_file_name=output_file_name, - error_file_name=error_file_name, weekly=weekly) + error_file_name=error_file_name, + weekly=weekly) return_code_ckpt_2 = os.system(command) 
tools.assert_success(return_code_ckpt_2, error_file_name) # os.system('rm lbann2_ckpt/model0-epoch*') diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py index 289af72bbb3..60bee02df2b 100644 --- a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py +++ b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py @@ -100,6 +100,8 @@ def skeleton_mnist_ridge_regression(cluster, executables, dir_name, lbann_args = '--data_reader_percent={drp}'.format(drp=data_reader_percent) if cluster == 'lassen': lbann_args += ' --data_filedir_train=/p/gpfs1/brainusr/datasets/MNIST --data_filedir_test=/p/gpfs1/brainusr/datasets/MNIST' + if cluster == 'ray': + lbann_args += ' --data_filedir_train=/p/gscratchr/brainusr/datasets/MNIST --data_filedir_test=/p/gscratchr/brainusr/datasets/MNIST' kwargs['lbann_args'] = lbann_args # Run @@ -114,6 +116,7 @@ def skeleton_mnist_ridge_regression(cluster, executables, dir_name, model=model, data_reader=data_reader_proto, optimizer=optimizer, + overwrite_script=True, job_name='lbann_ridge_regression', **kwargs) diff --git a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py index 9dca795f968..a7bf98175bc 100644 --- a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py +++ b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py @@ -7,6 +7,11 @@ def skeleton_mnist_softmax_classifier(cluster, executables, dir_name, compiler_name, weekly, data_reader_percent): + if not weekly: + e = 'test_unit_mnist_softmax_classifier: Not doing weekly testing' + print('SKIP - ' + e) + pytest.skip(e) + if compiler_name not in executables: e = 'skeleton_mnist_softmax_classifier: default_exes[%s] does not exist' % compiler_name print('Skip - ' + e) diff --git a/bamboo/unit_tests/test_unit_reconstruction_loss.py b/bamboo/unit_tests/test_unit_reconstruction_loss.py index d9fb6aa2b0c..04e8ae52718 100644 --- a/bamboo/unit_tests/test_unit_reconstruction_loss.py +++ b/bamboo/unit_tests/test_unit_reconstruction_loss.py @@ -16,12 +16,13 @@ def skeleton_jag_reconstruction_loss(cluster, executables, dir_name, compiler_na command = tools.get_command( cluster=cluster, executable=executables[compiler_name], - num_nodes=16, - num_processes=32, + num_nodes=2, + num_processes=32, + disable_cuda=1, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/10MJAG/1M_A/100K4trainers', data_reader_name='jag', - data_reader_percent=data_reader_percent, + data_reader_percent='prototext', metadata='model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext', model_folder='tests', model_name='jag_single_layer_ae', diff --git a/include/lbann/callbacks/checkpoint.hpp b/include/lbann/callbacks/checkpoint.hpp index 65ed972e168..267cf4d7413 100644 --- a/include/lbann/callbacks/checkpoint.hpp +++ b/include/lbann/callbacks/checkpoint.hpp @@ -225,7 +225,7 @@ inline bool read_latest(std::string filename, execution_mode *mode, size_t *epoc *mode = exec_mode_from_string(modeStr); // close our file closeread(fd, filename.c_str()); - if(ret != 2) { return false; } + if(ret != 3) { return false; } return true; } return false; diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index 6f8b05b382e..7dac4dad4cd 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -100,7 +100,9 @@ class generic_input_layer : public io_layer { // needs to 
interact with data readers, etc., so it needs to be // synchronized before any of them are destroyed. if (this->m_model != nullptr) { - this->m_model->get_execution_context().get_io_thread_pool().reap_threads(); + if (this->m_model->has_valid_execution_context()) { + this->m_model->get_execution_context().get_io_thread_pool().reap_threads(); + } } for (auto& io_buffer : m_io_buffers) {

diff --git a/model_zoo/lbann2.cpp b/model_zoo/lbann2.cpp index b9147096c44..f85f56cd6af 100644 --- a/model_zoo/lbann2.cpp +++ b/model_zoo/lbann2.cpp
@@ -70,11 +70,7 @@ int main(int argc, char *argv[]) { auto model_1 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[0]), comm.get(), opts, io_thread_pool, true); - std::unique_ptr<model> model_2; - if (pbs.size() > 1) { - model_2 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[1]), - comm.get(), opts, io_thread_pool, false); - } + // Load layer weights from checkpoint if checkpoint directory given if(opts->has_string("ckpt_dir")){ callback::save_model::load_model_weights(opts->get_string("ckpt_dir"),
@@ -96,6 +92,17 @@ int main(int argc, char *argv[]) { trainer->evaluate(model_1.get(), execution_mode::testing); } + + std::unique_ptr<model> model_2; + if (pbs.size() > 1) { + // Reset the RNGs + init_random(random_seed); + init_data_seq_random(random_seed); + + model_2 = build_model_from_prototext(argc, argv, pb_trainer, *(pbs[1]), + comm.get(), opts, io_thread_pool, false); + + } if (model_2 != nullptr) { const auto layers1 = model_1->get_layers(); const auto layers2 = model_2->get_layers();

diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index ca36fcc35af..fc51f9ce246 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py
@@ -15,6 +15,7 @@ def run(trainer, model, data_reader, optimizer, partition=partition(), account=account(), reservation=None, + overwrite_script=False, launcher_args=[], lbann_args=[], environment={},
@@ -69,11 +70,11 @@ def run(trainer, model, data_reader, optimizer, # Write, run, or submit batch script status = 0 if setup_only: - script.write() + script.write(overwrite=overwrite_script) elif has_allocation: - status = script.run() + status = script.run(overwrite=overwrite_script) else: - status = script.submit() + status = script.submit(overwrite=overwrite_script) return status def make_batch_script(script_file=None,

diff --git a/python/lbann/contrib/lc/systems.py b/python/lbann/contrib/lc/systems.py index 4156a979bc2..5c61e1af7b5 100644 --- a/python/lbann/contrib/lc/systems.py +++ b/python/lbann/contrib/lc/systems.py
@@ -19,10 +19,12 @@ def __init__(self, # Supported LC systems _system_params = {'catalyst': SystemParams(24, 0, 'slurm', 'pbatch', 'brain'), + 'corona': SystemParams(24, 0, 'slurm', 'pbatch', None), 'pascal': SystemParams(36, 2, 'slurm', 'pbatch', 'lc'), 'quartz': SystemParams(36, 0, 'slurm', 'pbatch', 'brain'), 'surface': SystemParams(16, 2, 'slurm', 'pbatch', 'hpclearn'), 'lassen': SystemParams(44, 4, 'lsf', 'pbatch', None), + 'ray': SystemParams(40, 4, 'lsf', 'pbatch', None), 'sierra': SystemParams(44, 4, 'lsf', 'pbatch', None)} # Detect system

diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index d5abf284347..f5fbb6cf0f4 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp
@@ -500,7 +500,7 @@ void init_data_readers( if (store != nullptr) { store->set_data_reader_ptr(reader_validation); reader_validation->get_data_store_ptr()->compact_nodes(); - } + } /// At this point clean up any unused samples from
the main data store
       if(reader->get_data_store_ptr() != nullptr) {
@@ -896,9 +896,15 @@ void print_help(std::ostream& os)
       "  --ltfb_verbose \n"
       "      Increases number of per-trainer messages that are reported\n"
       "  --ckpt_dir=\n"
-      "      Save to or reload from a specific checkpoint directory.\n"
+      "      Save to or restart from a specific checkpoint directory.\n"
       "      Additionally, sets the output directory for dumping weights.\n"
       "      Modifies callbacks: checkpoint, save_model, dump_weights\n"
+      "  --restart_dir=\n"
+      "      Restart from a checkpoint found in the given directory.\n"
+      "      If the directory doesn't exist or doesn't contain a checkpoint,\n"
+      "      an error will be thrown.\n"
+      "  --restart_dir_is_fullpath=\n"
+      "      Indicate whether the restart_dir is a full path.\n"
       "\n"
       "DataReaders:\n"
       "  --data_filedir=\n"
diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp
index 69631579b35..f9228c45139 100644
--- a/src/utils/lbann_library.cpp
+++ b/src/utils/lbann_library.cpp
@@ -305,6 +305,18 @@ std::unique_ptr<model> build_model_from_prototext(
               "--------------------------------------------------------------------------------\n";
   }
 #endif
+
+  if (opts && opts->has_string("restart_dir")) {
+    bool loaded = callback::save_model::load_model_weights(
+      opts->get_string("restart_dir"),
+      ret_model.get(),
+      opts->get_bool("restart_dir_is_fullpath"));
+    if(!loaded) {
+      LBANN_ERROR("Unable to reload model from given restart directory: ",
+                  opts->get_string("restart_dir"));
+    }
+  }
+
   return ret_model;
 }
 

From 4be68a2591fa5fa98da2facf7b69aff3ead083c0 Mon Sep 17 00:00:00 2001
From: "Thomas R. Benson"
Date: Wed, 2 Oct 2019 17:41:16 -0700
Subject: [PATCH 322/634] Quick build system change to allow Aluminum detection
 from Hydrogen rather than CMake option

---
 CMakeLists.txt                        | 6 ++++++
 bamboo/compiler_tests/build_script.sh | 1 -
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 588e32a30a2..bc8b78b7804 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -184,6 +184,12 @@ endif ()
 message(STATUS "Found Hydrogen: ${Hydrogen_DIR}")
 set(LBANN_HAS_HYDROGEN ${Hydrogen_FOUND})
 
+# Not the ideal fix, but should be fine for now.
+if (Aluminum_FOUND)
+  message(STATUS "Aluminum found in Hydrogen. Using Aluminum.")
+  set(LBANN_WITH_ALUMINUM ON CACHE BOOL "Use aluminum." FORCE)
+endif ()
+
 include(SetupOpenMP)
 include(SetupMPI)
 include(SetupProtobuf)
diff --git a/bamboo/compiler_tests/build_script.sh b/bamboo/compiler_tests/build_script.sh
index 6ef43375d60..5cd13988337 100755
--- a/bamboo/compiler_tests/build_script.sh
+++ b/bamboo/compiler_tests/build_script.sh
@@ -107,7 +107,6 @@ then
         -DLBANN_DATATYPE=float \
         -DLBANN_DETERMINISTIC=OFF \
         -DLBANN_WARNINGS_AS_ERRORS=ON \
-        -DLBANN_WITH_ALUMINUM=ON \
         -DLBANN_WITH_CONDUIT=ON \
         -DLBANN_WITH_CUDA=ON \
         -DLBANN_WITH_NVPROF=OFF \

From bc04f711094aa966c971528775872854d4ba3dd8 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Thu, 3 Oct 2019 11:14:23 -0700
Subject: [PATCH 323/634] Add layer tests to Bamboo (#1271)

* Add layer tests to Bamboo

Perform gradient checking and/or metric checking with argmax, argmin,
channel-wise scale/bias, embedding, entry-wise batchnorm, entry-wise
scale/bias, one-hot, and slice layers.
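Most of these tests verify the layer the same way: compute the expected
metric with NumPy, then require the LBANN metric to land within a few
float32 machine epsilons of it. A minimal sketch of that tolerance logic
in plain NumPy (the helper name and data here are illustrative, not part
of the LBANN API):

    import numpy as np

    def metric_bounds(samples, reference_fn, ulps=8):
        # Mean of a NumPy reference computation over all samples, with a
        # relative tolerance of a few float32 machine epsilons.
        val = np.mean([reference_fn(x) for x in samples])
        tol = ulps * val * np.finfo(np.float32).eps
        return val - tol, val + tol

    samples = np.random.normal(size=(31, 11)).astype(np.float32)
    lower, upper = metric_bounds(samples, lambda x: np.inner(x, x))

The bounds then feed CallbackCheckMetric's lower_bound/upper_bound with
error_on_failure=True, so any drift outside the window fails the test.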
* Add utility function to create PyTest tests that use Python frontend * Remove PyTest boilerplate from layer unit tests * Add PyTest utility function to create protobuf message for Python data reader * Fix slightly misleading comment * Fix typo --- bamboo/common_python/tools.py | 163 ++++++++++- .../unit_tests/test_unit_datareader_python.py | 135 ++-------- bamboo/unit_tests/test_unit_layer_argmax.py | 154 +++++++++++ bamboo/unit_tests/test_unit_layer_argmin.py | 157 +++++++++++ .../test_unit_layer_channelwise_scale_bias.py | 173 ++++++++++++ .../unit_tests/test_unit_layer_embedding.py | 149 +++++++++++ ...nit_layer_entrywise_batch_normalization.py | 180 +++++++++++++ .../test_unit_layer_entrywise_scale_bias.py | 220 +++++++++++++++ bamboo/unit_tests/test_unit_layer_one_hot.py | 142 ++++++++++ bamboo/unit_tests/test_unit_layer_slice.py | 252 ++++++++++++++++++ 10 files changed, 1616 insertions(+), 109 deletions(-) create mode 100644 bamboo/unit_tests/test_unit_layer_argmax.py create mode 100644 bamboo/unit_tests/test_unit_layer_argmin.py create mode 100644 bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py create mode 100644 bamboo/unit_tests/test_unit_layer_embedding.py create mode 100644 bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py create mode 100644 bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py create mode 100644 bamboo/unit_tests/test_unit_layer_one_hot.py create mode 100644 bamboo/unit_tests/test_unit_layer_slice.py diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index feaee4a359b..b27f6599a83 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -1,4 +1,8 @@ -import math, os, re +import math +import os +import re +import sys +import pytest def check_list(substrings, strings): @@ -471,7 +475,7 @@ def get_command(cluster, 'ckpt_dir', #'restart_dir', 'restart_dir_is_fullpath', - + # DataReaders: # 'data_filedir', # 'data_filedir_train', @@ -657,3 +661,158 @@ def assert_failure(return_code, expected_error, error_file_name): 'return_code={rc}\nFailed with error different than expected.\nactual_error={ae}\nexpected_error={ee}\nSee {efn}'.format( rc=return_code, ae=actual_error, ee=expected_error, efn=error_file_name)) + + +def create_tests(setup_func, test_name): + """Create functions that can interact with PyTest. + + This function creates tests that involve setting up and running an + LBANN experiment with the Python frontend. `setup_func` should be + a function that takes in the LBANN Python module and outputs + objects for an LBANN experiment. A test succeeds if LBANN runs and + exits with an exit code of 0, and fails otherwise. + + PyTest detects tests by loading in a Python script and looking for + functions prefixed with 'test_'. After you call this function + within a script to generate test functions, make sure to add the + test functions to the script's scope. For example: + + _test_funcs = tools.create_tests(setup_func, test_name) + for t in _test_funcs: + globals()[t.__name__] = t + + Args: + setup_func (function): Sets up an LBANN experiment using the + Python frontend. It takes in the LBANN Python module as + input and returns a `(lbann.Trainer, lbann.Model, + lbann.reader_pb2.DataReader, lbann.Optimizer)`. + test_name (str): Descriptive name. Should be prefixed with + 'test_'. + + Returns: + Iterable of function: Tests that can interact with PyTest. 
+
+    """
+
+    # Basic test function
+    def test_func(cluster, executables, dir_name, compiler_name):
+        process_executable(test_name, compiler_name, executables)
+
+        # Choose LBANN build and load Python frontend
+        if compiler_name == 'exe':
+            exe = executables[compiler_name]
+            bin_dir = os.path.dirname(exe)
+            install_dir = os.path.dirname(bin_dir)
+            build_path = '{i}/lib/python3.7/site-packages'.format(i=install_dir)
+        else:
+            if compiler_name == 'clang6':
+                path = 'clang.Release'
+            elif compiler_name == 'clang6_debug':
+                path = 'clang.Debug'
+            elif compiler_name == 'gcc7':
+                path = 'gnu.Release'
+            elif compiler_name == 'gcc7_debug':
+                path = 'gnu.Debug'
+            elif compiler_name == 'intel19':
+                path = 'intel.Release'
+            elif compiler_name == 'intel19_debug':
+                path = 'intel.Debug'
+            path = '{p}.{c}.llnl.gov'.format(p=path, c=cluster)
+            build_path = '{d}/build/{p}/install/lib/python3.7/site-packages'.format(
+                d=dir_name, p=path)
+        print('build_path={b}'.format(b=build_path))
+        sys.path.append(build_path)
+        import lbann
+        import lbann.contrib.lc.launcher
+
+        # Setup LBANN experiment
+        trainer, model, data_reader, optimizer = setup_func(lbann)
+
+        # Run LBANN experiment
+        kwargs = {
+            'nodes': 1,
+            'overwrite_script': True
+        }
+        experiment_dir = '{d}/bamboo/unit_tests/experiments/{t}_{c}'.format(
+            d=dir_name, t=test_name, c=compiler_name)
+        error_file_name = '{e}/err.log'.format(
+            e=experiment_dir, c=compiler_name)
+        return_code = lbann.contrib.lc.launcher.run(
+            trainer=trainer,
+            model=model,
+            data_reader=data_reader,
+            optimizer=optimizer,
+            experiment_dir=experiment_dir,
+            job_name='lbann_{}'.format(test_name),
+            **kwargs)
+        assert_success(return_code, error_file_name)
+
+    # Specific test functions for different build configurations
+    def test_func_exe(cluster, dirname, exe):
+        if exe is None:
+            e = 'test_{}_exe: Non-local testing'.format(test_name)
+            print('Skip - ' + e)
+            pytest.skip(e)
+        exes = {'exe': exe}
+        test_func(cluster, exes, dirname, 'exe')
+    def test_func_clang6(cluster, exes, dirname):
+        test_func(cluster, exes, dirname, 'clang6')
+    def test_func_gcc7(cluster, exes, dirname):
+        test_func(cluster, exes, dirname, 'gcc7')
+    def test_func_intel19(cluster, exes, dirname):
+        test_func(cluster, exes, dirname, 'intel19')
+    test_func_exe.__name__ = '{}_exe'.format(test_name)
+    test_func_clang6.__name__ = '{}_clang6'.format(test_name)
+    test_func_gcc7.__name__ = '{}_gcc7'.format(test_name)
+    test_func_intel19.__name__ = '{}_intel19'.format(test_name)
+
+    return (test_func_exe,
+            test_func_clang6,
+            test_func_gcc7,
+            test_func_intel19)
+
+
+def create_python_data_reader(lbann,
+                              file_name,
+                              sample_function_name,
+                              num_samples_function_name,
+                              sample_dims_function_name,
+                              execution_mode):
+    """Create protobuf message for Python data reader.
+
+    A Python data reader gets data by importing a Python module and
+    calling functions in its scope.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend.
+        file_name (str): Python file.
+        sample_function_name (str): Function to get a data sample. It
+            takes one integer argument for the sample index and
+            returns an `Iterator` of `float`s.
+        sample_dims_function_name (str): Function to get dimensions of
+            a data sample. It takes no arguments and returns a
+            `(int,)`.
+        num_samples_function_name (str): Function to get number of
+            data samples in data set. It takes no arguments and
+            returns an `int`.
+ execution_mode (str): 'train', 'validation', or 'test' + + """ + + # Extract paths + file_name = os.path.realpath(file_name) + dir_name = os.path.dirname(file_name) + module_name = os.path.splitext(os.path.basename(file_name))[0] + + # Construct protobuf message for data reader + reader = lbann.reader_pb2.Reader() + reader.name = 'python' + reader.role = execution_mode + reader.percent_of_data_to_use = 1.0 + reader.python.module = module_name + reader.python.module_dir = dir_name + reader.python.sample_function = sample_function_name + reader.python.num_samples_function = num_samples_function_name + reader.python.sample_dims_function = sample_dims_function_name + + return reader diff --git a/bamboo/unit_tests/test_unit_datareader_python.py b/bamboo/unit_tests/test_unit_datareader_python.py index 2e3f1dd9f02..e6fd5f22f24 100644 --- a/bamboo/unit_tests/test_unit_datareader_python.py +++ b/bamboo/unit_tests/test_unit_datareader_python.py @@ -2,7 +2,6 @@ import os.path import sys import numpy as np -import pytest # Local files current_file = os.path.realpath(__file__) @@ -13,10 +12,8 @@ # ============================================== # Objects for Python data reader # ============================================== -# Note: The Python data reader imports this file and calls the -# functions below to ingest data. This is the only part of the script -# that should be executed when the script is imported, or else the -# Python data reader might misbehave. +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. # Data np.random.seed(20190708) @@ -99,114 +96,38 @@ def construct_data_reader(lbann): lbann (module): Module for LBANN Python frontend """ - module_name = os.path.splitext(os.path.basename(current_file))[0] - # Base data reader message - message = lbann.reader_pb2.DataReader() - - # Training set data reader - # TODO: This can be removed once + # Note: The training data reader should be removed when # https://github.com/LLNL/lbann/issues/1098 is resolved. 
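For reference, the module-side contract these readers rely on is small:
three functions at module scope. A standalone sketch, under the
assumption that the module lives in its own file (the file name and
shapes here are hypothetical):

    # dataset_module.py -- hypothetical module loaded by the Python data reader
    import numpy as np

    _data = np.random.normal(size=(29, 105)).astype(np.float32)

    def get_sample(index):
        # One flat sample per index; any iterable of floats works.
        return _data[index, :]

    def num_samples():
        # Total number of samples in the data set.
        return _data.shape[0]

    def sample_dims():
        # Dimensions of one sample, as a tuple.
        return (_data.shape[1],)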
- data_reader = message.reader.add() - data_reader.name = 'python' - data_reader.role = 'train' - data_reader.percent_of_data_to_use = 1.0 - data_reader.python.module = module_name - data_reader.python.module_dir = current_dir - data_reader.python.sample_function = 'get_sample' - data_reader.python.num_samples_function = 'num_samples' - data_reader.python.sample_dims_function = 'sample_dims' - - # Test set data reader - data_reader = message.reader.add() - data_reader.name = 'python' - data_reader.role = 'test' - data_reader.percent_of_data_to_use = 1.0 - data_reader.python.module = module_name - data_reader.python.module_dir = current_dir - data_reader.python.sample_function = 'get_sample' - data_reader.python.num_samples_function = 'num_samples' - data_reader.python.sample_dims_function = 'sample_dims' - + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) return message # ============================================== # Setup PyTest # ============================================== -# Generate test name based on file name +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name _test_name = os.path.splitext(os.path.basename(current_file))[0] - -# Primary test method -def _test(cluster, executables, dir_name, compiler_name): - tools.process_executable(_test_name, compiler_name, executables) - - # Import LBANN Python frontend - if compiler_name == 'exe': - exe = executables[compiler_name] - bin_dir = os.path.dirname(exe) - install_dir = os.path.dirname(bin_dir) - build_path = '{i}/lib/python3.7/site-packages'.format(i=install_dir) - else: - if compiler_name == 'clang6': - path = 'clang.Release' - elif compiler_name == 'clang6_debug': - path = 'clang.Debug' - elif compiler_name == 'gcc7': - path = 'gnu.Release' - elif compiler_name == 'clang6_debug': - path = 'gnu.Debug' - elif compiler_name == 'intel19': - path = 'intel.Release' - elif compiler_name == 'intel19_debug': - path = 'intel.Debug' - path = '{p}.{c}.llnl.gov'.format(p=path, c=cluster) - build_path = '{d}/build/{p}/install/lib/python3.7/site-packages'.format( - d=dir_name, p=path) - print('build_path={b}'.format(b=build_path)) - sys.path.append(build_path) - import lbann - import lbann.contrib.lc.launcher - - # Setup LBANN experiment - trainer, model, data_reader, optimizer = setup_experiment(lbann) - - # Run LBANN experiment - kwargs = { - 'account': 'guests', - 'nodes': 1, - 'partition': 'pbatch', - 'overwrite_script': True - } - experiment_dir = '{d}/bamboo/unit_tests/experiments/{t}_{c}'.format( - d=dir_name, t=_test_name, c=compiler_name) - error_file_name = '{e}/err.log'.format( - e=experiment_dir, c=compiler_name) - return_code = lbann.contrib.lc.launcher.run( - trainer=trainer, - model=model, - data_reader=data_reader, - optimizer=optimizer, - experiment_dir=experiment_dir, - job_name='lbann_{}'.format(_test_name), - **kwargs) - tools.assert_success(return_code, error_file_name) - -# Construct methods that will be detected by PyTest -def _test_clang6(cluster, exes, dirname): - _test(cluster, exes, dirname, 'clang6') -def _test_gcc7(cluster, exes, dirname): - _test(cluster, exes, dirname, 'gcc7') -def _test_intel19(cluster, exes, dirname): - _test(cluster, exes, dirname, 
'intel19') -def _test_exe(cluster, dirname, exe): - if exe is None: - e = 'test_{}_exe: Non-local testing'.format(_test_name) - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - _test(cluster, exes, dirname, 'exe') -globals()['{}_clang6'.format(_test_name)] = _test_clang6 -globals()['{}_gcc7'.format(_test_name)] = _test_gcc7 -globals()['{}_intel19'.format(_test_name)] = _test_intel19 -globals()['{}_exe'.format(_test_name)] = _test_exe +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_argmax.py b/bamboo/unit_tests/test_unit_layer_argmax.py new file mode 100644 index 00000000000..f2b2f83bbb7 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_argmax.py @@ -0,0 +1,154 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190911) +_num_samples = 31 +_sample_dims = (11,) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_samples[1,:] = 0.5 +_samples[15,:] = -1.0 +_samples[15,3] = -0.5 +_samples[15,5] = -0.5 + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1) + return np.inner(x, x) + + # LBANN implementation + x = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + y = lbann.Argmax(x, device='cpu') + z = lbann.L2Norm2(y) + + # Objects for LBANN model + obj = z + metric = lbann.Metric(z, name='obj') + layers = list(lbann.traverse_layer_graph(z)) + callbacks = [] + + # Get expected metric value from NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims) + y = np.argmax(x) + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Construct model + mini_batch_size = 17 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=layers, + objective_function=obj, + metrics=metric, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_argmin.py b/bamboo/unit_tests/test_unit_layer_argmin.py new file mode 100644 index 00000000000..5887818e65b --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_argmin.py @@ -0,0 +1,157 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
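A detail worth noting before the data definitions just below: both this
test and the argmax test above plant rows with duplicated extrema, which
exercises tie handling. NumPy, used as the reference, resolves ties to
the first occurrence:

    import numpy as np

    x = np.array([1.0, 0.5, 2.0, 0.5], dtype=np.float32)
    assert np.argmin(x) == 1                          # first of the two minima
    assert np.argmax(np.array([2.0, 0.0, 2.0])) == 0  # first of the two maxima

The metric check therefore implicitly requires the LBANN layers to break
ties the same way.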
+ +# Data +np.random.seed(201909112) +_num_samples = 31 +_sample_dims = (11,) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_samples[1,:] = 0.5 +_samples[15,:] = 1.0 +_samples[15,3] = 0.5 +_samples[15,5] = 0.5 + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1) + return np.inner(x, x) + + # LBANN implementation + x = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + y = lbann.Argmin(x, device='cpu') + z = lbann.L2Norm2(y) + + # Objects for LBANN model + obj = z + metric = lbann.Metric(z, name='obj') + layers = list(lbann.traverse_layer_graph(z)) + callbacks = [] + + # Get expected metric value from NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims) + y = np.argmin(x) + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = 17 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=layers, + objective_function=obj, + metrics=metric, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py new file mode 100644 index 00000000000..302dc262723 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py @@ -0,0 +1,173 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190719) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_scale = np.random.normal(loc=1, size=(_sample_dims[0],1,1)).astype(np.float32) +_bias = np.random.normal(loc=0, size=(_sample_dims[0],1,1)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1) + return np.inner(x, x) + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
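The note ending the comment above describes a trick used throughout
these tests: summing the input with a zero-initialized weights tensor
leaves the forward pass unchanged but gives the graph a trainable leaf,
so CallbackCheckGradients can compare analytic gradients against finite
differences end to end. A sketch of the finite-difference side in plain
NumPy (my paraphrase of the idea, not LBANN's implementation):

    import numpy as np

    def fd_grad(f, w, eps=1e-3):
        # Central-difference approximation of the gradient of scalar f at w.
        g = np.zeros_like(w)
        for i in range(w.size):
            e = np.zeros_like(w)
            e.flat[i] = eps
            g.flat[i] = (f(w + e) - f(w - e)) / (2 * eps)
        return g

    x = np.random.normal(size=4)
    f = lambda w: np.inner(x + w, x + w)  # L2 norm squared of (x + w)
    w0 = np.zeros_like(x)                 # the zero-initialized "weights"
    assert np.allclose(fd_grad(f, w0), 2 * x, atol=1e-6)  # analytic: 2*(x + w)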
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x0 = lbann.WeightsLayer(weights=x_weights, + dims=str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + x = lbann.Sum([x0, x1]) + + # Apply channel-wise scale/bias + scale_values = str_list(np.nditer(_scale)) + bias_values = str_list(np.nditer(_bias)) + scalebias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values, + bias_values)), + name='scalebias_weights' + ) + y = lbann.ChannelwiseScaleBias(x, weights=scalebias_weights) + z = lbann.L2Norm2(y) + + # Objects for LBANN model + obj = z + metric = lbann.Metric(z, name='obj') + layers = list(lbann.traverse_layer_graph(z)) + callbacks = [] + + # Get expected metric value from NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims) + y = _scale * x + _bias + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Gradient checking + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # Construct model + mini_batch_size = 17 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=layers, + objective_function=obj, + metrics=metric, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_embedding.py b/bamboo/unit_tests/test_unit_layer_embedding.py new file mode 100644 index 00000000000..e999eb2a3ea --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_embedding.py @@ -0,0 +1,149 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
+ +# Data +dictionary_size = 7 +embedding_size = 5 +np.random.seed(4321) +embedding_array = np.random.normal(size=(dictionary_size,embedding_size)) + +# Sample access functions +def get_sample(index): + np.random.seed(1234+index) + return [np.random.randint(dictionary_size)] +def num_samples(): + return 41 +def sample_dims(): + return (1,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Construct weights for embeddings + embedding_values = ' '.join([str(i) for i in np.nditer(embedding_array)]) + init = lbann.ValueInitializer(values=embedding_values) + w = lbann.Weights(optimizer=lbann.SGD(), initializer=init) + + # Layer graph + input = lbann.Input() + embedding = lbann.Embedding(input, + weights=w, + dictionary_size=dictionary_size, + embedding_size=embedding_size, + device='cpu') + l2_norm2 = lbann.L2Norm2(embedding) + layers = list(lbann.traverse_layer_graph(input)) + metric = lbann.Metric(l2_norm2, name='L2 norm squared') + obj = lbann.ObjectiveFunction(l2_norm2) + + # Compute expected value + metric_vals = [] + for i in range(num_samples()): + input = get_sample(i) + embedding = embedding_array[int(input[0]), :] + l2_norm2 = np.inner(embedding, embedding) + metric_vals.append(l2_norm2) + expected_metric_value = np.mean(metric_vals) + tol = 8 * expected_metric_value * np.finfo(np.float32).eps + + # Initialize check metric callback + callbacks = [lbann.CallbackCheckMetric(metric='L2 norm squared', + lower_bound=expected_metric_value-tol, + upper_bound=expected_metric_value+tol, + error_on_failure=True, + execution_modes='test'), + lbann.CallbackCheckGradients(error_on_failure=True)] + + # Construct model + mini_batch_size = 17 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=layers, + objective_function=obj, + metrics=[metric], + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
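Stepping back to the model above: the embedding metric reduces to a
table lookup followed by a squared norm. The equivalent NumPy
computation, using this test's shapes:

    import numpy as np

    dictionary_size, embedding_size = 7, 5
    embeddings = np.random.normal(size=(dictionary_size, embedding_size))

    index = 3                      # one integer-valued sample
    row = embeddings[index, :]     # embedding lookup
    l2_norm2 = np.inner(row, row)  # this sample's metric contribution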
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py new file mode 100644 index 00000000000..33592ae3253 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py @@ -0,0 +1,180 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190815) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Input data + # Note: We want to use gradient checking to verify that error + # signals are correct. To do this, we zero-initialize a weights + # object, construct a zero-valued tensor, and add it to the + # input. To make sure that batchnorm is non-trivial, we multiply + # the zero-valued tensor by the mini-batch index. 
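Entry-wise batch normalization normalizes each tensor entry
independently across the mini-batch dimension, maintaining running
statistics via the decay factor. A hedged NumPy sketch of the forward
pass, under my reading of the decay/epsilon semantics used below (not
the LBANN implementation):

    import numpy as np

    def entrywise_batchnorm(batch, running_mean, running_var,
                            decay=0.9, epsilon=1e-5):
        # batch: (mini_batch_size, sample_size); statistics are computed
        # per entry, across the mini-batch dimension.
        mean = batch.mean(axis=0)
        var = batch.var(axis=0)
        running_mean[:] = decay * running_mean + (1 - decay) * mean
        running_var[:] = decay * running_var + (1 - decay) * var
        return (batch - mean) / np.sqrt(var + epsilon)

    batch = np.random.normal(size=(64, 105)).astype(np.float32)
    running_mean = np.zeros(105, dtype=np.float32)
    running_var = np.ones(105, dtype=np.float32)
    out = entrywise_batchnorm(batch, running_mean, running_var)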
+    x = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims))
+    x_weights = lbann.Weights(optimizer=lbann.SGD(),
+                              initializer=lbann.ConstantInitializer(value=0.0))
+    x0 = lbann.WeightsLayer(weights=x_weights,
+                            dims=str_list(_sample_dims))
+    x1 = lbann.Divide([lbann.MiniBatchIndex(), lbann.MiniBatchSize()])
+    x1 = lbann.Tessellate(lbann.Reshape(x1, dims='1 1 1'), dims=str_list(_sample_dims))
+    x = lbann.Sum([x, lbann.Multiply([x0, x1])])
+    x_lbann = x
+
+    # Objects for LBANN model
+    obj = []
+    metrics = []
+    callbacks = []
+
+    # ------------------------------------------
+    # Data-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    decay = 0.9
+    epsilon = 1e-5
+    x = x_lbann
+    y = lbann.EntrywiseBatchNormalization(x,
+                                          decay=decay,
+                                          epsilon=epsilon,
+                                          data_layout='data_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='data-parallel output'))
+
+    # ------------------------------------------
+    # Model-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    decay = 0.9
+    epsilon = 1e-5
+    x = x_lbann
+    y = lbann.EntrywiseBatchNormalization(x,
+                                          decay=decay,
+                                          epsilon=epsilon,
+                                          data_layout='model_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='model-parallel output'))
+
+    # ------------------------------------------
+    # Gradient checking
+    # ------------------------------------------
+
+    callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True))
+
+    # ------------------------------------------
+    # Construct model
+    # ------------------------------------------
+
+    mini_batch_size = 64
+    num_epochs = 1
+    return lbann.Model(mini_batch_size,
+                       num_epochs,
+                       layers=lbann.traverse_layer_graph(x_lbann),
+                       objective_function=obj,
+                       metrics=metrics,
+                       callbacks=callbacks)
+
+def construct_data_reader(lbann):
+    """Construct Protobuf message for Python data reader.
+
+    The Python data reader will import the current Python file to
+    access the sample access functions.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Note: The training data reader should be removed when
+    # https://github.com/LLNL/lbann/issues/1098 is resolved.
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py new file mode 100644 index 00000000000..c1816202f30 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py @@ -0,0 +1,220 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190723) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) +_scale = np.random.normal(loc=1, size=_sample_dims).astype(np.float32) +_bias = np.random.normal(loc=0, size=_sample_dims).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1) + return np.inner(x, x) + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
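(The zero-weights sum noted above is the same gradient-checking trick
described for the channel-wise test.) The only difference between the
channel-wise and entry-wise scale/bias layers is the parameter shape; a
short NumPy sketch of the two broadcasting patterns, using the (7, 5, 3)
sample shape these tests share:

    import numpy as np

    x = np.random.normal(size=(7, 5, 3)).astype(np.float32)

    # Channel-wise: one (scale, bias) pair per channel, broadcast over (5, 3).
    scale_c = np.random.normal(loc=1, size=(7, 1, 1)).astype(np.float32)
    bias_c = np.random.normal(loc=0, size=(7, 1, 1)).astype(np.float32)
    y_channelwise = scale_c * x + bias_c

    # Entry-wise: an independent (scale, bias) pair for every tensor entry.
    scale_e = np.random.normal(loc=1, size=(7, 5, 3)).astype(np.float32)
    bias_e = np.random.normal(loc=0, size=(7, 5, 3)).astype(np.float32)
    y_entrywise = scale_e * x + bias_e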
+    x_weights = lbann.Weights(optimizer=lbann.SGD(),
+                              initializer=lbann.ConstantInitializer(value=0.0))
+    x0 = lbann.WeightsLayer(weights=x_weights,
+                            dims=str_list(_sample_dims))
+    x1 = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims))
+    x = lbann.Sum([x0, x1])
+    x_lbann = x
+
+    # Objects for LBANN model
+    obj = []
+    metrics = []
+    callbacks = []
+
+    # ------------------------------------------
+    # Data-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    scale_values = str_list(np.nditer(_scale))
+    bias_values = str_list(np.nditer(_bias))
+    scalebias_weights = lbann.Weights(
+        optimizer=lbann.SGD(),
+        initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values,
+                                                                 bias_values)))
+    x = x_lbann
+    y = lbann.EntrywiseScaleBias(x,
+                                 weights=scalebias_weights,
+                                 data_layout='data_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='data-parallel output'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).reshape(_sample_dims)
+        y = _scale * x + _bias
+        z = l2_norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Model-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    scale_values = str_list(np.nditer(_scale))
+    bias_values = str_list(np.nditer(_bias))
+    scalebias_weights = lbann.Weights(
+        optimizer=lbann.SGD(),
+        initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values,
+                                                                 bias_values)))
+    x = x_lbann
+    y = lbann.EntrywiseScaleBias(x,
+                                 weights=scalebias_weights,
+                                 data_layout='model_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='model-parallel output'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).reshape(_sample_dims)
+        y = _scale * x + _bias
+        z = l2_norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Gradient checking
+    # ------------------------------------------
+
+    callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True))
+
+    # ------------------------------------------
+    # Construct model
+    # ------------------------------------------
+
+    mini_batch_size = 17
+    num_epochs = 0
+    return lbann.Model(mini_batch_size,
+                       num_epochs,
+                       layers=lbann.traverse_layer_graph(x_lbann),
+                       objective_function=obj,
+                       metrics=metrics,
+                       callbacks=callbacks)
+
+def construct_data_reader(lbann):
+    """Construct Protobuf message for Python data reader.
+
+    The Python data reader will import the current Python file to
+    access the sample access functions.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Note: The training data reader should be removed when
+    # https://github.com/LLNL/lbann/issues/1098 is resolved.
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_one_hot.py b/bamboo/unit_tests/test_unit_layer_one_hot.py new file mode 100644 index 00000000000..efc0927044f --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_one_hot.py @@ -0,0 +1,142 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +one_hot_size = 7 +seed = 201909113 + +# Sample access functions +def get_sample(index): + np.random.seed(seed+index) + return [np.random.uniform(-1, one_hot_size+1)] +def num_samples(): + return 47 +def sample_dims(): + return (1,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Layer graph + x = lbann.Input() + y1 = lbann.OneHot(x, size=one_hot_size) + y2 = lbann.Concatenation([lbann.Constant(value=i+1, num_neurons='1') + for i in range(one_hot_size)]) + y = lbann.Multiply([y1, y2]) + z = lbann.L2Norm2(y) + + # Objects for LBANN model + layers = list(lbann.traverse_layer_graph(x)) + metric = lbann.Metric(z, name='obj') + obj = lbann.ObjectiveFunction(z) + callbacks = [] + + # Compute expected metric value + vals = [] + for i in range(num_samples()): + x = get_sample(i)[0] + y = int(x) + 1 if (0 <= x and x < one_hot_size) else 0 + z = y * y + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metric.name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # Construct model + mini_batch_size = 19 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=layers, + objective_function=obj, + metrics=[metric], + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_slice.py b/bamboo/unit_tests/test_unit_layer_slice.py new file mode 100644 index 00000000000..8acb6f2cd70 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_slice.py @@ -0,0 +1,252 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20190708) +_num_samples = 29 +_sample_dims = (7,5,3) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1) + return np.inner(x, x) + + # LBANN objects + obj = [] + metrics = [] + callbacks = [] + + # -------------------------- + # LBANN input data + # -------------------------- + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
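Beyond the input trick just noted, the slice checks below all follow one
rule: slice_points are boundaries along a single axis, and each
consecutive pair delimits one output tensor. The NumPy equivalent for
the axis-0 case used below:

    import numpy as np

    x = np.random.normal(size=(7, 5, 3)).astype(np.float32)
    slice_points = (2, 3, 6, 7)  # boundaries along axis 0
    slices = [x[slice_points[j]:slice_points[j + 1], :, :]
              for j in range(len(slice_points) - 1)]
    assert [s.shape[0] for s in slices] == [1, 3, 1]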
+ w = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=w, + dims=str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + x_lbann = lbann.Sum([x0, x1]) + + # -------------------------- + # Slice along axis 0 + # -------------------------- + + # LBANN implementation + slice_points = (2, 3, 6, 7) + x = x_lbann + x_slice = lbann.Slice(x, axis=0, slice_points=str_list(slice_points)) + y = [] + for _ in range(len(slice_points)-1): + y.append(lbann.L2Norm2(x_slice)) + z = lbann.Sum(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis0')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims) + y = [] + for j in range(len(slice_points)-1): + x_slice = x[slice_points[j]:slice_points[j+1],:,:] + y.append(l2_norm2(x_slice)) + z = sum(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Slice along axis 1 + # -------------------------- + + # LBANN implementation + slice_points = (0, 2, 3, 4) + x = x_lbann + x_slice = lbann.Slice(x, axis=1, slice_points=str_list(slice_points)) + y = [] + for _ in range(len(slice_points)-1): + y.append(lbann.L2Norm2(x_slice)) + z = lbann.Sum(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis1')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims) + y = [] + for j in range(len(slice_points)-1): + x_slice = x[:,slice_points[j]:slice_points[j+1],:] + y.append(l2_norm2(x_slice)) + z = sum(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Slice along axis 2 + # -------------------------- + + # LBANN implementation + slice_points = (0, 1, 2, 3) + x = x_lbann + x_slice = lbann.Slice(x, axis=2, slice_points=str_list(slice_points)) + y = [] + for _ in range(len(slice_points)-1): + y.append(lbann.L2Norm2(x_slice)) + z = lbann.Sum(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='axis2')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims) + y = [] + for j in range(len(slice_points)-1): + x_slice = x[:,:,slice_points[j]:slice_points[j+1]] + y.append(l2_norm2(x_slice)) + z = sum(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Gradient checking + # -------------------------- + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # -------------------------- + # Construct model + # -------------------------- + + mini_batch_size = 17 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test From 4ae7197b92c63a443f22c05fe0c60de94eac7af0 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Thu, 3 Oct 2019 11:16:36 -0700 Subject: [PATCH 324/634] Replace READMEs with .gitignores in Bamboo log dirs (#1272) --- bamboo/clean.sh | 16 ++++++++-------- bamboo/compiler_tests/builds/.gitignore | 2 ++ bamboo/compiler_tests/builds/README.md | 1 - bamboo/compiler_tests/error/.gitignore | 2 ++ bamboo/compiler_tests/error/README.md | 1 - bamboo/compiler_tests/output/.gitignore | 2 ++ bamboo/compiler_tests/output/README.md | 1 - bamboo/integration_tests/error/.gitignore | 2 ++ bamboo/integration_tests/error/README.md | 1 - bamboo/integration_tests/output/.gitignore | 2 ++ bamboo/integration_tests/output/README.md | 1 - bamboo/unit_tests/.gitignore | 1 + bamboo/unit_tests/error/.gitignore | 1 - bamboo/unit_tests/error/README.md | 1 - bamboo/unit_tests/experiments/.gitignore | 2 ++ bamboo/unit_tests/experiments/README.md | 1 - bamboo/unit_tests/output/.gitignore | 1 - bamboo/unit_tests/output/README.md | 1 - 18 files changed, 21 insertions(+), 18 deletions(-) create mode 100644 bamboo/compiler_tests/builds/.gitignore delete mode 100644 bamboo/compiler_tests/builds/README.md create mode 100644 bamboo/compiler_tests/error/.gitignore delete mode 100644 bamboo/compiler_tests/error/README.md create mode 100644 bamboo/compiler_tests/output/.gitignore delete mode 100644 bamboo/compiler_tests/output/README.md create mode 100644 bamboo/integration_tests/error/.gitignore delete mode 100644 bamboo/integration_tests/error/README.md create mode 100644 bamboo/integration_tests/output/.gitignore delete mode 100644 bamboo/integration_tests/output/README.md delete mode 100644 bamboo/unit_tests/error/README.md create mode 100644 bamboo/unit_tests/experiments/.gitignore delete mode 100644 bamboo/unit_tests/experiments/README.md delete mode 100644 bamboo/unit_tests/output/README.md diff --git a/bamboo/clean.sh b/bamboo/clean.sh index 3d9d20c951f..ac408cff67e 100755 --- a/bamboo/clean.sh +++ b/bamboo/clean.sh @@ -6,10 +6,9 @@ LBANN_DIR=$(git rev-parse --show-toplevel) # Compiler Tests rm -f ${LBANN_DIR}/bamboo/compiler_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/compiler_tests/__pycache__ -rm -rf ${LBANN_DIR}/bamboo/compiler_tests/builds/*_debug -rm -rf ${LBANN_DIR}/bamboo/compiler_tests/builds/*_rel -rm -f ${LBANN_DIR}/bamboo/compiler_tests/error/*.txt -rm -f ${LBANN_DIR}/bamboo/compiler_tests/output/*.txt +rm -rf ${LBANN_DIR}/bamboo/compiler_tests/builds/* +rm -f 
${LBANN_DIR}/bamboo/compiler_tests/error/* +rm -f ${LBANN_DIR}/bamboo/compiler_tests/output/* # Integration Tests rm -f ${LBANN_DIR}/bamboo/integration_tests/*.pgm @@ -17,8 +16,8 @@ rm -f ${LBANN_DIR}/bamboo/integration_tests/*.prototext* rm -f ${LBANN_DIR}/bamboo/integration_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/integration_tests/__pycache__ rm -f ${LBANN_DIR}/bamboo/integration_tests/*.tfevents.* -rm -f ${LBANN_DIR}/bamboo/integration_tests/error/*.txt -rm -f ${LBANN_DIR}/bamboo/integration_tests/output/*.txt +rm -f ${LBANN_DIR}/bamboo/integration_tests/error/* +rm -f ${LBANN_DIR}/bamboo/integration_tests/output/* # Unit Tests rm -rf ${LBANN_DIR}/bamboo/unit_tests/ckpt* @@ -27,5 +26,6 @@ rm -f ${LBANN_DIR}/bamboo/unit_tests/*.prototext* rm -f ${LBANN_DIR}/bamboo/unit_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/unit_tests/__pycache__ rm -f ${LBANN_DIR}/bamboo/unit_tests/*.tfevents.* -rm -f ${LBANN_DIR}/bamboo/unit_tests/error/*.txt -rm -f ${LBANN_DIR}/bamboo/unit_tests/output/*.txt +rm -f ${LBANN_DIR}/bamboo/unit_tests/error/* +rm -f ${LBANN_DIR}/bamboo/unit_tests/output/* +rm -rf ${LBANN_DIR}/bamboo/unit_tests/experiments/* diff --git a/bamboo/compiler_tests/builds/.gitignore b/bamboo/compiler_tests/builds/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/compiler_tests/builds/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/compiler_tests/builds/README.md b/bamboo/compiler_tests/builds/README.md deleted file mode 100644 index 1962c6506d6..00000000000 --- a/bamboo/compiler_tests/builds/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for build directories diff --git a/bamboo/compiler_tests/error/.gitignore b/bamboo/compiler_tests/error/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/compiler_tests/error/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/compiler_tests/error/README.md b/bamboo/compiler_tests/error/README.md deleted file mode 100644 index 78712c2962b..00000000000 --- a/bamboo/compiler_tests/error/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test error diff --git a/bamboo/compiler_tests/output/.gitignore b/bamboo/compiler_tests/output/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/compiler_tests/output/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/compiler_tests/output/README.md b/bamboo/compiler_tests/output/README.md deleted file mode 100644 index 308358e3777..00000000000 --- a/bamboo/compiler_tests/output/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test output diff --git a/bamboo/integration_tests/error/.gitignore b/bamboo/integration_tests/error/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/integration_tests/error/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/integration_tests/error/README.md b/bamboo/integration_tests/error/README.md deleted file mode 100644 index 78712c2962b..00000000000 --- a/bamboo/integration_tests/error/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test error diff --git a/bamboo/integration_tests/output/.gitignore b/bamboo/integration_tests/output/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/integration_tests/output/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/integration_tests/output/README.md b/bamboo/integration_tests/output/README.md deleted file mode 100644 index 308358e3777..00000000000 --- 
a/bamboo/integration_tests/output/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test output diff --git a/bamboo/unit_tests/.gitignore b/bamboo/unit_tests/.gitignore index 16d3c4dbbfe..0cc4de789bf 100644 --- a/bamboo/unit_tests/.gitignore +++ b/bamboo/unit_tests/.gitignore @@ -1 +1,2 @@ .cache +*.prototext diff --git a/bamboo/unit_tests/error/.gitignore b/bamboo/unit_tests/error/.gitignore index 7c9d611b592..d6b7ef32c84 100644 --- a/bamboo/unit_tests/error/.gitignore +++ b/bamboo/unit_tests/error/.gitignore @@ -1,3 +1,2 @@ * !.gitignore -!README.md diff --git a/bamboo/unit_tests/error/README.md b/bamboo/unit_tests/error/README.md deleted file mode 100644 index 78712c2962b..00000000000 --- a/bamboo/unit_tests/error/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test error diff --git a/bamboo/unit_tests/experiments/.gitignore b/bamboo/unit_tests/experiments/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/unit_tests/experiments/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/unit_tests/experiments/README.md b/bamboo/unit_tests/experiments/README.md deleted file mode 100644 index 0c210a7e6e8..00000000000 --- a/bamboo/unit_tests/experiments/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test experiments diff --git a/bamboo/unit_tests/output/.gitignore b/bamboo/unit_tests/output/.gitignore index 7c9d611b592..d6b7ef32c84 100644 --- a/bamboo/unit_tests/output/.gitignore +++ b/bamboo/unit_tests/output/.gitignore @@ -1,3 +1,2 @@ * !.gitignore -!README.md diff --git a/bamboo/unit_tests/output/README.md b/bamboo/unit_tests/output/README.md deleted file mode 100644 index 308358e3777..00000000000 --- a/bamboo/unit_tests/output/README.md +++ /dev/null @@ -1 +0,0 @@ -Subdirectory for test output From a85f9e977139c24e0a5f33e5d4e185f32126564d Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 4 Oct 2019 08:37:27 -0700 Subject: [PATCH 325/634] fix a typo (that @timmoon10 caught but I missed) --- python/lbann/contrib/lc/systems.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lbann/contrib/lc/systems.py b/python/lbann/contrib/lc/systems.py index 5c61e1af7b5..3b2c6a6571d 100644 --- a/python/lbann/contrib/lc/systems.py +++ b/python/lbann/contrib/lc/systems.py @@ -19,7 +19,7 @@ def __init__(self, # Supported LC systems _system_params = {'catalyst': SystemParams(24, 0, 'slurm', 'pbatch', 'brain'), - 'coronal': SystemParams(24, 0, 'slurm', 'pbatch', None), + 'corona': SystemParams(24, 0, 'slurm', 'pbatch', None), 'pascal': SystemParams(36, 2, 'slurm', 'pbatch', 'lc'), 'quartz': SystemParams(36, 0, 'slurm', 'pbatch', 'brain'), 'surface': SystemParams(16, 2, 'slurm', 'pbatch', 'hpclearn'), From 32ed1d5af8072f2116039854bb3d41ca7e09832e Mon Sep 17 00:00:00 2001 From: "Thomas R. 
Benson" Date: Fri, 4 Oct 2019 09:39:44 -0700 Subject: [PATCH 326/634] fixes for ray --- bamboo/unit_tests/test_unit_lbann_invocation.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bamboo/unit_tests/test_unit_lbann_invocation.py b/bamboo/unit_tests/test_unit_lbann_invocation.py index 51985037dec..9a4748f62ef 100644 --- a/bamboo/unit_tests/test_unit_lbann_invocation.py +++ b/bamboo/unit_tests/test_unit_lbann_invocation.py @@ -35,6 +35,7 @@ def test_unit_no_params_bad(cluster, dirname, exes): command = tools.get_command( cluster=cluster, executable=exe, exit_after_setup=True, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -57,6 +58,7 @@ def test_unit_one_model_bad(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -79,6 +81,7 @@ def test_unit_two_models_bad(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -102,6 +105,7 @@ def test_unit_two_models_bad2(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -125,6 +129,7 @@ def test_unit_missing_optimizer(cluster, dirname, exes): data_reader_path=data_reader_path, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', exit_after_setup=True, model_path=model_path, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -147,6 +152,7 @@ def test_unit_missing_reader(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, optimizer_path=optimizer_path, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -167,6 +173,7 @@ def test_unit_bad_params(cluster, dirname, exes): dirname) (command_allocate, command_run, _, _) = tools.get_command( cluster=cluster, executable=exe, + num_processes=1, return_tuple=True) (output_file_name, error_file_name) = get_file_names(dirname, 'bad_params') command_string = '{ca}{cr} {e} -exit_after_setup --reader={d} --model={m} --optimizer={o} > {ofn} 2> {efn}'.format( @@ -195,6 +202,7 @@ def test_unit_should_work(cluster, dirname, exes): data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', exit_after_setup=True, model_path=model_path, optimizer_path=optimizer_path, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) From f18602f954d16dabeb0a2da162ed60735cddfd9d Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Fri, 4 Oct 2019 09:40:54 -0700 Subject: [PATCH 327/634] Revert "fixes for ray"; was accidentally on the wrong branch This reverts commit 32ed1d5af8072f2116039854bb3d41ca7e09832e. 
--- bamboo/unit_tests/test_unit_lbann_invocation.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/bamboo/unit_tests/test_unit_lbann_invocation.py b/bamboo/unit_tests/test_unit_lbann_invocation.py index 9a4748f62ef..51985037dec 100644 --- a/bamboo/unit_tests/test_unit_lbann_invocation.py +++ b/bamboo/unit_tests/test_unit_lbann_invocation.py @@ -35,7 +35,6 @@ def test_unit_no_params_bad(cluster, dirname, exes): command = tools.get_command( cluster=cluster, executable=exe, exit_after_setup=True, - num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -58,7 +57,6 @@ def test_unit_one_model_bad(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, - num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -81,7 +79,6 @@ def test_unit_two_models_bad(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, - num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -105,7 +102,6 @@ def test_unit_two_models_bad2(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, - num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -129,7 +125,6 @@ def test_unit_missing_optimizer(cluster, dirname, exes): data_reader_path=data_reader_path, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', exit_after_setup=True, model_path=model_path, - num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -152,7 +147,6 @@ def test_unit_missing_reader(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, optimizer_path=optimizer_path, - num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -173,7 +167,6 @@ def test_unit_bad_params(cluster, dirname, exes): dirname) (command_allocate, command_run, _, _) = tools.get_command( cluster=cluster, executable=exe, - num_processes=1, return_tuple=True) (output_file_name, error_file_name) = get_file_names(dirname, 'bad_params') command_string = '{ca}{cr} {e} -exit_after_setup --reader={d} --model={m} --optimizer={o} > {ofn} 2> {efn}'.format( @@ -202,7 +195,6 @@ def test_unit_should_work(cluster, dirname, exes): data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', exit_after_setup=True, model_path=model_path, optimizer_path=optimizer_path, - num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) From e9e1640b6fc161bf7a9857680fcf4132c2bdb837 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Mon, 7 Oct 2019 08:17:18 -0700 Subject: [PATCH 328/634] initial checkin for threading. Not ready yet. 
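The intended pattern: each rank collects the sample indices it owns, deals them out round-robin into one bucket per I/O thread, submits every bucket but one to the pool with submit_job_to_work_group(), and loads the remaining bucket on the calling thread before finish_work_group(). As a minimal self-contained sketch of that partitioning scheme, with a hypothetical preload_in_parallel() helper, plain std::thread in place of the io_thread_pool plumbing, and a hypothetical load_samples callback standing in for load_numpy_npz_from_file():

#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

// Sketch only, not the LBANN implementation: deal the locally owned
// sample indices round-robin across num_threads partitions, then run a
// caller-supplied loader on each partition in parallel.
void preload_in_parallel(
    const std::vector<std::size_t>& owned_indices,
    unsigned num_threads,
    const std::function<void(const std::vector<std::size_t>&)>& load_samples) {
  if (num_threads == 0) { num_threads = 1; }
  std::vector<std::vector<std::size_t>> partitions(num_threads);
  for (std::size_t i = 0; i < owned_indices.size(); ++i) {
    partitions[i % num_threads].push_back(owned_indices[i]);
  }
  std::vector<std::thread> workers;
  for (unsigned t = 1; t < num_threads; ++t) {
    workers.emplace_back(load_samples, std::cref(partitions[t]));
  }
  load_samples(partitions[0]);  // the calling thread takes a share too
  for (auto& w : workers) { w.join(); }
}

Round-robin assignment keeps the per-thread buckets roughly balanced and disjoint, so the workers share no mutable state and need no locking while reading files.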
--- .../data_reader_numpy_npz_conduit.cpp | 69 +++++++++++++++++-- 1 file changed, 65 insertions(+), 4 deletions(-) diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp b/src/data_readers/data_reader_numpy_npz_conduit.cpp index 24a98e4e4b2..c27336982cb 100644 --- a/src/data_readers/data_reader_numpy_npz_conduit.cpp +++ b/src/data_readers/data_reader_numpy_npz_conduit.cpp @@ -132,22 +132,61 @@ void numpy_npz_conduit_reader::load() { } void numpy_npz_conduit_reader::preload_data_store() { + double tm1 = get_time(); + + if (is_master()) std::cout << "Starting numpy_npz_conduit_reader::preload_data_store; num indices: " << m_shuffled_indices.size() << std::endl; + size_t count = get_absolute_sample_count(); double use_percent = get_use_percent(); if (count != 0 || use_percent != 1) { LBANN_ERROR("numpy_npz_conduit_reader currently assumes you are using 100% of the data set; you specified get_absolute_sample_count() = ", count, " and get_use_percent() = ", use_percent, "; please ask Dave Hysom to modify the code, if you want to use less than 100%"); } - double tm1 = get_time(); m_data_store->set_preload(); int rank = m_comm->get_rank_in_trainer(); std::unordered_set label_classes; - for (size_t data_id=0; data_idget_index_owner(data_id) != rank) { - continue; + + bool threaded = options::get()->get_bool("data_store_threaded"); + + //threaded mode + if (threaded) { + if (is_master()) { + std::cout << "mode: data_store_thread\n"; + } + std::shared_ptr io_thread_pool = construct_io_thread_pool(m_comm, opts); + int num_threads = static_cast(io_thread_pool->get_num_threads()); + + //collect the set of indices that belong to this rank + std::vector> data_ids(num_threads); + int j = 0; + for (size_t data_id=0; data_idget_index_owner(index) != rank) { + continue; + } + data_ids[j++].insert(index); + if (j == num_threads) { + j = 0; + } + } + + //load the samples + for (int t = 0; t < num_threads; t++) { + if(t == io_thread_pool->get_local_thread_id()) { + continue; + } else { + io_thread_pool->submit_job_to_work_group(std::bind(&numpy_npz_conduit_reader::load_numpy_npz_from_file, this, data_ids[t])); + } } + load_numpy_npz_from_file(data_ids[io_thread_pool->get_local_thread_id()]); + io_thread_pool->finish_work_group(); + } //end: threaded mode + +bool numpy_npz_conduit_reader::load_numpy_npz_from_file(const std::unordered_set &data_ids) { + conduit::Node node; + for (auto t : data_ids) { conduit::Node node; numpy_conduit_converter::load_conduit_node(m_filenames[data_id], data_id, node); const char *char_ptr = node[LBANN_DATA_ID_STR(data_id) + "/frm/data"].value(); @@ -155,7 +194,27 @@ void numpy_npz_conduit_reader::preload_data_store() { label_classes.insert(*label_ptr); m_data_store->set_conduit_node(data_id, node); } +} + //non-threaded mode + else { + for (size_t data_id=0; data_idget_index_owner(data_id) != rank) { + continue; + } + + conduit::Node node; + numpy_conduit_converter::load_conduit_node(m_filenames[data_id], data_id, node); + const char *char_ptr = node[LBANN_DATA_ID_STR(data_id) + "/frm/data"].value(); + const int* label_ptr = reinterpret_cast(char_ptr); + label_classes.insert(*label_ptr); + m_data_store->set_conduit_node(data_id, node); + } + } //end: non-threaded mode + + // Nikoli says we're not using labels, so I'm commenting this section out + // (this section is a mess, anyway) + #if 0 if (m_has_labels) { // get max element. 
Yes, I know you can do this with, e.g, lambda @@ -204,6 +263,8 @@ void numpy_npz_conduit_reader::preload_data_store() { m_num_labels = label_classes.size(); #endif } + #endif + double tm2 = get_time(); if (is_master()) { std::cout << "time to preload: " << tm2 - tm1 << " for role: " << get_role() << "\n"; From d7bef475a749c782837933cac9327ff9f2f213fd Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Mon, 7 Oct 2019 09:36:25 -0700 Subject: [PATCH 329/634] working version --- .../data_reader_numpy_npz_conduit.hpp | 2 ++ .../data_reader_numpy_npz_conduit.cpp | 33 +++++++++---------- src/data_readers/numpy_conduit_converter.cpp | 2 +- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp index 414e177b6c9..09aeca29498 100644 --- a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp +++ b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp @@ -105,6 +105,8 @@ namespace lbann { void fill_in_metadata(); std::vector m_filenames; + + bool load_numpy_npz_from_file(const std::unordered_set &data_ids, std::unordered_set& label_classes); }; } // namespace lbann diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp b/src/data_readers/data_reader_numpy_npz_conduit.cpp index c27336982cb..bbedcd31cc6 100644 --- a/src/data_readers/data_reader_numpy_npz_conduit.cpp +++ b/src/data_readers/data_reader_numpy_npz_conduit.cpp @@ -33,7 +33,7 @@ #include "lbann/utils/jag_utils.hpp" // read_filelist(..) TODO should be move to file_utils #include "lbann/utils/timer.hpp" #include "lbann/models/model.hpp" - +#include "lbann/utils/lbann_library.hpp" namespace lbann { @@ -154,7 +154,7 @@ void numpy_npz_conduit_reader::preload_data_store() { if (is_master()) { std::cout << "mode: data_store_thread\n"; } - std::shared_ptr io_thread_pool = construct_io_thread_pool(m_comm, opts); + std::shared_ptr io_thread_pool = construct_io_thread_pool(m_comm, options::get()); int num_threads = static_cast(io_thread_pool->get_num_threads()); //collect the set of indices that belong to this rank @@ -176,26 +176,13 @@ void numpy_npz_conduit_reader::preload_data_store() { if(t == io_thread_pool->get_local_thread_id()) { continue; } else { - io_thread_pool->submit_job_to_work_group(std::bind(&numpy_npz_conduit_reader::load_numpy_npz_from_file, this, data_ids[t])); + io_thread_pool->submit_job_to_work_group(std::bind(&numpy_npz_conduit_reader::load_numpy_npz_from_file, this, data_ids[t], label_classes)); } } - load_numpy_npz_from_file(data_ids[io_thread_pool->get_local_thread_id()]); + load_numpy_npz_from_file(data_ids[io_thread_pool->get_local_thread_id()], label_classes); io_thread_pool->finish_work_group(); } //end: threaded mode - -bool numpy_npz_conduit_reader::load_numpy_npz_from_file(const std::unordered_set &data_ids) { - conduit::Node node; - for (auto t : data_ids) { - conduit::Node node; - numpy_conduit_converter::load_conduit_node(m_filenames[data_id], data_id, node); - const char *char_ptr = node[LBANN_DATA_ID_STR(data_id) + "/frm/data"].value(); - const int* label_ptr = reinterpret_cast(char_ptr); - label_classes.insert(*label_ptr); - m_data_store->set_conduit_node(data_id, node); - } -} - //non-threaded mode else { for (size_t data_id=0; data_id &data_ids, std::unordered_set &label_classes) { + for (auto data_id : data_ids) { + conduit::Node node; + numpy_conduit_converter::load_conduit_node(m_filenames[data_id], data_id, node); + const char *char_ptr = 
node[LBANN_DATA_ID_STR(data_id) + "/frm/data"].value(); + const int* label_ptr = reinterpret_cast(char_ptr); + label_classes.insert(*label_ptr); + m_data_store->set_conduit_node(data_id, node); + } + return true; +} + bool numpy_npz_conduit_reader::fetch_datum(Mat& X, int data_id, int mb_idx) { Mat X_v = El::View(X, El::IR(0, X.Height()), El::IR(mb_idx, mb_idx+1)); conduit::Node node; diff --git a/src/data_readers/numpy_conduit_converter.cpp b/src/data_readers/numpy_conduit_converter.cpp index 25ddf445689..bc3e1dedff6 100644 --- a/src/data_readers/numpy_conduit_converter.cpp +++ b/src/data_readers/numpy_conduit_converter.cpp @@ -46,7 +46,7 @@ void numpy_conduit_converter::load_conduit_node(const std::string filename, int for (auto &&t : a) { cnpy::NpyArray &b = t.second; if (b.shape[0] != 1) { - LBANN_ERROR("lbann currently only supports one sample per npz file; this file appears to contain " + std::to_string(b.shape[0]) + " samples"); + LBANN_ERROR("lbann currently only supports one sample per npz file; this file appears to contain " + std::to_string(b.shape[0]) + " samples; (", filename); } output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/word_size"] = b.word_size; output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/fortran_order"] = b.fortran_order; From 3c5c239a780e29c9a10deb2b7fef0171aaba45d2 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Mon, 7 Oct 2019 13:38:34 -0700 Subject: [PATCH 330/634] initial commit --- .../python/build_inclusive_from_exclusive.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 model_zoo/jag_utils/python/build_inclusive_from_exclusive.py diff --git a/model_zoo/jag_utils/python/build_inclusive_from_exclusive.py b/model_zoo/jag_utils/python/build_inclusive_from_exclusive.py new file mode 100644 index 00000000000..8826279e51c --- /dev/null +++ b/model_zoo/jag_utils/python/build_inclusive_from_exclusive.py @@ -0,0 +1,42 @@ +import sys + +if len(sys.argv) != 4 : + print 'usage:', sys.argv[0], 'index_fn id_mapping_fn output_fn' + exit(9) + +a = open(sys.argv[1]) +a.readline() +header = a.readline() +dir = a.readline() + +#build map: filename -> set of bad samples +mp = {} +for line in a : + t = line.split() + mp[t[0]] = set() + for id in t[1:] : + mp[t[0]].add(id) + +a.close() + +out = open(sys.argv[3], 'w') +out.write('CONDUIT_HDF5_INCLUSION\n') +out.write(header) +out.write(dir) + +a = open(sys.argv[2]) +bad = 0 +for line in a : + t = line.split() + fn = t[0] + out.write(fn + ' ') + for id in t[1:] : + if id not in mp[fn] : + out.write(id + ' ') + else : + bad += 1 + out.write('\n') + +out.close() +print header +print 'num found bad:', bad From f45413925bcc13d7dd67981f74ebe403cbae4e69 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Mon, 7 Oct 2019 15:04:02 -0700 Subject: [PATCH 331/634] bug fix. 
forgot to print the 'include/exclude' fields for each file from line 3 - end --- .../jag_utils/python/build_inclusive_from_exclusive.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/model_zoo/jag_utils/python/build_inclusive_from_exclusive.py b/model_zoo/jag_utils/python/build_inclusive_from_exclusive.py index 8826279e51c..35da15160be 100644 --- a/model_zoo/jag_utils/python/build_inclusive_from_exclusive.py +++ b/model_zoo/jag_utils/python/build_inclusive_from_exclusive.py @@ -11,10 +11,14 @@ #build map: filename -> set of bad samples mp = {} +mp_good = {} +mp_bad = {} for line in a : t = line.split() mp[t[0]] = set() - for id in t[1:] : + mp_good[t[0]] = t[1] + mp_bad[t[0]] = t[2] + for id in t[3:] : mp[t[0]].add(id) a.close() @@ -29,7 +33,7 @@ for line in a : t = line.split() fn = t[0] - out.write(fn + ' ') + out.write(fn + ' ' + mp_good[fn] + ' ' + mp_bad[fn] + ' ') for id in t[1:] : if id not in mp[fn] : out.write(id + ' ') From 56c8eaa94fa4d18fd79fd823a97e1b655de1c941 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Mon, 7 Oct 2019 22:47:29 -0700 Subject: [PATCH 332/634] fix Python linkage --- CMakeLists.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bc8b78b7804..41e2cde1704 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -494,10 +494,6 @@ target_include_directories(lbann PUBLIC $ $) -if (LBANN_HAS_PYTHON) - target_include_directories(lbann PUBLIC ${Python_INCLUDE_DIRS}) -endif () - # Use the IMPORTED targets when possible. target_link_libraries(lbann PUBLIC LbannProto) target_link_libraries(lbann PUBLIC Threads::Threads) @@ -530,7 +526,7 @@ if (LBANN_HAS_VTUNE) endif () if (LBANN_HAS_PYTHON) - target_link_libraries(lbann PUBLIC ${Python_LIBRARIES}) + target_link_libraries(lbann PUBLIC Python::Python) endif () if (TARGET LBANN_CXX_FLAGS_werror) From 743e04201b214c8f79179a86d4dd9950ca8a3abc Mon Sep 17 00:00:00 2001 From: "Brian C. Van Essen" Date: Mon, 7 Oct 2019 23:33:06 -0700 Subject: [PATCH 333/634] Updated the open file sample list to not verify or dynamically extract the list of samples from a file bundle each time when using explicit list formats. This dramatically improves the runtime for reading large number of HDF5 files. 
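Concretely, read_inclusive_list() now just opens a handle for each bundle and records the listed sample names as-is, rather than enumerating the bundle's full contents to cross-check the list; the per-file counts stored in m_file_map come from the list header (included + excluded samples), and the old cross-check survives behind the normally-disabled VALIDATE_SAMPLE_LIST define. A rough sketch of that opt-in validation pattern, with a hypothetical check_listed_samples() helper and enumerate_names() standing in for the real obtain_sample_names() call on an open bundle:

#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>

// Sketch of the opt-in check: trust the explicit sample list by
// default, and only pay for walking the bundle's full contents
// (expensive for large HDF5 files) when VALIDATE_SAMPLE_LIST is
// compiled in.
void check_listed_samples(
    const std::vector<std::string>& listed_names,
    std::vector<std::string> (*enumerate_names)()) {
#ifdef VALIDATE_SAMPLE_LIST
  const std::vector<std::string> all = enumerate_names();
  const std::unordered_set<std::string> valid(all.begin(), all.end());
  for (const auto& name : listed_names) {
    if (valid.count(name) == 0) {
      throw std::runtime_error("sample not found in bundle: " + name);
    }
  }
#else
  (void) listed_names;    // validation compiled out: the list is trusted
  (void) enumerate_names;
#endif
}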
--- .../data_readers/sample_list_open_files.hpp | 5 + .../sample_list_open_files_impl.hpp | 108 ++++++++++++------ 2 files changed, 77 insertions(+), 36 deletions(-) diff --git a/include/lbann/data_readers/sample_list_open_files.hpp b/include/lbann/data_readers/sample_list_open_files.hpp index a7b4b81a278..57bfb89980e 100644 --- a/include/lbann/data_readers/sample_list_open_files.hpp +++ b/include/lbann/data_readers/sample_list_open_files.hpp @@ -88,9 +88,14 @@ class sample_list_open_files : public sample_list { /// Get the list of samples from a specific type of bundle file virtual void obtain_sample_names(file_handle_t& h, std::vector& sample_names) const = 0; + file_handle_t open_file_handle(std::string file_path); + /// Get the list of samples that exist in a bundle file file_handle_t get_bundled_sample_names(std::string file_path, std::vector& sample_names, size_t included_samples, size_t excluded_samples); + /// Check that the list of samples given actually exist in a bundle file + void validate_implicit_bundles_sample_names(std::string file_path, std::string filename, std::vector& sample_names, size_t included_samples, size_t excluded_samples); + /// read the body of exclusive sample list void read_exclusive_list(std::istream& istrm, size_t stride=1, size_t offset=0); diff --git a/include/lbann/data_readers/sample_list_open_files_impl.hpp b/include/lbann/data_readers/sample_list_open_files_impl.hpp index 4ad36be3b3b..479ecd40287 100644 --- a/include/lbann/data_readers/sample_list_open_files_impl.hpp +++ b/include/lbann/data_readers/sample_list_open_files_impl.hpp @@ -219,40 +219,27 @@ ::read_inclusive_list(std::istream& istrm, + " :: data file '" + filename + "' does not exist."); } - std::vector sample_names; - file_handle_t file_hnd = get_bundled_sample_names(file_path, sample_names, included_samples, excluded_samples); + file_handle_t file_hnd = open_file_handle(file_path); if (!is_file_handle_valid(file_hnd)) { continue; // skipping the file } - if(m_file_map.count(filename) > 0) { - if(sample_names.size() != m_file_map[filename]) { - LBANN_ERROR(std::string("The same file ") - + filename - + " was opened multiple times and reported different sizes: " - + std::to_string(sample_names.size()) - + " and " - + std::to_string(m_file_map[filename])); - } - }else { - m_file_map[filename] = sample_names.size(); - } - - std::unordered_set set_of_samples(sample_names.begin(), sample_names.end()); - sample_file_id_t index = m_file_id_stats_map.size(); m_file_id_stats_map.emplace_back(std::make_tuple(filename, uninitialized_file_handle(), std::deque>{})); set_files_handle(filename, file_hnd); size_t valid_sample_count = 0u; + //#define VALIDATE_SAMPLE_LIST +#ifdef VALIDATE_SAMPLE_LIST + std::vector sample_names; +#endif while(!sstr.eof()) { std::string sample_name_str; sstr >> sample_name_str; - std::unordered_set::const_iterator found = set_of_samples.find(sample_name_str); - if (found == set_of_samples.cend()) { - LBANN_ERROR(std::string("Illegal request for a data ID that does not exist: ") + sample_name_str); - } m_sample_list.emplace_back(index, to_sample_name_t(sample_name_str)); +#ifdef VALIDATE_SAMPLE_LIST + sample_names.emplace_back(sample_name_str); +#endif valid_sample_count++; } if(valid_sample_count != included_samples) { @@ -261,6 +248,22 @@ ::read_inclusive_list(std::istream& istrm, + std::string(" samples, but found ") + std::to_string(valid_sample_count)); } + + if(m_file_map.count(filename) > 0) { + if(valid_sample_count != m_file_map[filename]) { + 
LBANN_ERROR(std::string("The same file ") + + filename + + " was opened multiple times and reported different sizes: " + + std::to_string(valid_sample_count) + + " and " + + std::to_string(m_file_map[filename])); + } + }else { + m_file_map[filename] = /*valid_sample_count*/ included_samples + excluded_samples; + } +#ifdef VALIDATE_SAMPLE_LIST + validate_implicit_bundles_sample_names(file_path, filename, sample_names, included_samples, excluded_samples); +#endif } if (m_header.get_num_files() != cnt_files) { @@ -420,10 +423,7 @@ ::obtain_sample_names(file_handle_t& h, std::vector& sample_names) template inline file_handle_t sample_list_open_files -::get_bundled_sample_names(std::string file_path, - std::vector& sample_names, - size_t included_samples, - size_t excluded_samples) { +::open_file_handle(std::string file_path) { file_handle_t file_hnd; clear_file_handle(file_hnd); bool retry = false; @@ -438,6 +438,17 @@ ::get_bundled_sample_names(std::string file_path, } }while(retry && retry_cnt < LBANN_MAX_OPEN_FILE_RETRY); + return file_hnd; +} + +template +inline file_handle_t sample_list_open_files +::get_bundled_sample_names(std::string file_path, + std::vector& sample_names, + size_t included_samples, + size_t excluded_samples) { + file_handle_t file_hnd = open_file_handle(file_path); + if (!is_file_handle_valid(file_hnd)) { std::cout << "Opening the file didn't work" << std::endl; return file_hnd; @@ -457,6 +468,40 @@ ::get_bundled_sample_names(std::string file_path, return file_hnd; } +template +inline void sample_list_open_files +::validate_implicit_bundles_sample_names(std::string file_path, + std::string filename, + std::vector& sample_names, + size_t included_samples, + size_t excluded_samples) { + std::vector all_sample_names; + file_handle_t file_hnd = get_bundled_sample_names(file_path, all_sample_names, included_samples, excluded_samples); + if (!is_file_handle_valid(file_hnd)) { + return; // skipping the file + } + if(m_file_map.count(filename) > 0) { + if(all_sample_names.size() != m_file_map[filename]) { + LBANN_ERROR(std::string("The same file ") + + filename + + " was opened multiple times and reported different sizes: " + + std::to_string(all_sample_names.size()) + + " and " + + std::to_string(m_file_map[filename])); + } + }else { + m_file_map[filename] = all_sample_names.size(); + } + std::unordered_set set_of_samples(all_sample_names.begin(), all_sample_names.end()); + for(auto&& sample_name : sample_names) { + std::unordered_set::const_iterator found = set_of_samples.find(sample_name); + if (found == set_of_samples.cend()) { + LBANN_ERROR(std::string("Illegal request for a data ID that does not exist: ") + sample_name); + } + } + return; +} + template inline void sample_list_open_files ::all_gather_packed_lists(lbann_comm& comm) { @@ -612,17 +657,8 @@ ::open_samples_file_handle(const size_t i, bool pre_open_fd) { if (file_name.empty() || !check_if_file_exists(file_path)) { LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' does not exist."); } - bool retry = false; - int retry_cnt = 0; - do { - try { - h = open_file_handle_for_read( file_path ); - }catch (conduit::Error const& e) { - LBANN_WARNING(" :: trying to open the file " + file_path + " and got " + e.what()); - retry = true; - retry_cnt++; - } - }while(retry && retry_cnt < 3); + + h = open_file_handle(file_path); if (!is_file_handle_valid(h)) { LBANN_ERROR(std::string{} + " :: data file '" + file_path + "' could not be opened."); From 8a015245925316ccb7763f531e91fec51cc740d8 Mon Sep 17 00:00:00 2001 
From: "David A. Hysom" Date: Tue, 8 Oct 2019 09:38:32 -0700 Subject: [PATCH 334/634] made threaded preload the default. To use unthreaded use the cmd line flag: --data_store_no_thread --- src/data_readers/data_reader_numpy_npz_conduit.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp b/src/data_readers/data_reader_numpy_npz_conduit.cpp index bbedcd31cc6..39699da06df 100644 --- a/src/data_readers/data_reader_numpy_npz_conduit.cpp +++ b/src/data_readers/data_reader_numpy_npz_conduit.cpp @@ -147,7 +147,7 @@ void numpy_npz_conduit_reader::preload_data_store() { std::unordered_set label_classes; - bool threaded = options::get()->get_bool("data_store_threaded"); + bool threaded = ! options::get()->get_bool("data_store_no_thread"); //threaded mode if (threaded) { From e85fc4f0f98f55de7e4c85c62c4fbbc9d3fa0403 Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Tue, 8 Oct 2019 13:26:02 -0700 Subject: [PATCH 335/634] Clean up Ray and Corona testing time (#1282) * fixes for ray * skip reconstruction test on ray since the data doesn't exist * limit OMP threads --- bamboo/run.sh | 2 +- bamboo/unit_tests/test_unit_lbann_invocation.py | 8 ++++++++ bamboo/unit_tests/test_unit_reconstruction_loss.py | 12 ++++++++---- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/bamboo/run.sh b/bamboo/run.sh index da1e96968a9..45908a04f8e 100755 --- a/bamboo/run.sh +++ b/bamboo/run.sh @@ -51,7 +51,7 @@ cd .. echo "Task: Unit Tests" cd unit_tests module load python/3.6.4 -$PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml +OMP_NUM_THREADS=10 $PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml cd .. echo "Task: Finished" diff --git a/bamboo/unit_tests/test_unit_lbann_invocation.py b/bamboo/unit_tests/test_unit_lbann_invocation.py index 51985037dec..9a4748f62ef 100644 --- a/bamboo/unit_tests/test_unit_lbann_invocation.py +++ b/bamboo/unit_tests/test_unit_lbann_invocation.py @@ -35,6 +35,7 @@ def test_unit_no_params_bad(cluster, dirname, exes): command = tools.get_command( cluster=cluster, executable=exe, exit_after_setup=True, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -57,6 +58,7 @@ def test_unit_one_model_bad(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -79,6 +81,7 @@ def test_unit_two_models_bad(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -102,6 +105,7 @@ def test_unit_two_models_bad2(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -125,6 +129,7 @@ def test_unit_missing_optimizer(cluster, dirname, exes): data_reader_path=data_reader_path, data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', exit_after_setup=True, model_path=model_path, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name ) @@ -147,6 +152,7 @@ def test_unit_missing_reader(cluster, dirname, exes): cluster=cluster, executable=exe, exit_after_setup=True, model_path=model_path, optimizer_path=optimizer_path, + num_processes=1, output_file_name=output_file_name, 
error_file_name=error_file_name ) @@ -167,6 +173,7 @@ def test_unit_bad_params(cluster, dirname, exes): dirname) (command_allocate, command_run, _, _) = tools.get_command( cluster=cluster, executable=exe, + num_processes=1, return_tuple=True) (output_file_name, error_file_name) = get_file_names(dirname, 'bad_params') command_string = '{ca}{cr} {e} -exit_after_setup --reader={d} --model={m} --optimizer={o} > {ofn} 2> {efn}'.format( @@ -195,6 +202,7 @@ def test_unit_should_work(cluster, dirname, exes): data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', exit_after_setup=True, model_path=model_path, optimizer_path=optimizer_path, + num_processes=1, output_file_name=output_file_name, error_file_name=error_file_name) return_code = os.system(command) diff --git a/bamboo/unit_tests/test_unit_reconstruction_loss.py b/bamboo/unit_tests/test_unit_reconstruction_loss.py index 04e8ae52718..a7c9b4823ae 100644 --- a/bamboo/unit_tests/test_unit_reconstruction_loss.py +++ b/bamboo/unit_tests/test_unit_reconstruction_loss.py @@ -8,16 +8,20 @@ def skeleton_jag_reconstruction_loss(cluster, executables, dir_name, compiler_name, weekly, data_reader_percent): if compiler_name not in executables: - e = 'skeleton_jag_reconstruction_loss: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) + e = 'skeleton_jag_reconstruction_loss: default_exes[%s] does not exist' % compiler_name + print('Skip - ' + e) + pytest.skip(e) + if cluster == 'ray': + e = 'skeleton_jag_reconstruction_loss: dataset does not exist on %s' % cluster + print('Skip - ' + e) + pytest.skip(e) output_file_name = '%s/bamboo/unit_tests/output/jag_reconstruction_loss_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/jag_reconstruction_loss_%s_error.txt' % (dir_name, compiler_name) command = tools.get_command( cluster=cluster, executable=executables[compiler_name], num_nodes=2, - num_processes=32, + num_processes=32, disable_cuda=1, dir_name=dir_name, data_filedir_default='/p/lscratchh/brainusr/datasets/10MJAG/1M_A/100K4trainers', From be63f8259db2241507bc54085e70f32075970c04 Mon Sep 17 00:00:00 2001 From: "David A. 
Hysom"
Date: Wed, 9 Oct 2019 10:47:16 -0700
Subject: [PATCH 336/634] Modified linear search through a vector to a map lookup

---
 .../lbann/data_readers/sample_list_open_files_impl.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/lbann/data_readers/sample_list_open_files_impl.hpp b/include/lbann/data_readers/sample_list_open_files_impl.hpp
index 479ecd40287..ae869170b15 100644
--- a/include/lbann/data_readers/sample_list_open_files_impl.hpp
+++ b/include/lbann/data_readers/sample_list_open_files_impl.hpp
@@ -533,6 +533,7 @@ ::all_gather_packed_lists(lbann_comm& comm) {
   m_file_id_stats_map.reserve(num_ids);
   m_file_map.reserve(num_files);

+  std::unordered_map<std::string, sample_file_id_t> mp;
   for(int r = 0; r < num_ranks; r++) {
     const samples_t& s_list = per_rank_samples[r];
     const auto& files = per_rank_files[r];
@@ -548,13 +549,12 @@ ::all_gather_packed_lists(lbann_comm& comm) {
       if(m_file_map.count(filename) == 0) {
         m_file_map[filename] = file_map.at(filename);
       }
+      mp[filename] = index;
     }else {
-      for(size_t i = 0; i < m_file_id_stats_map.size(); i++) {
-        if(filename == get_samples_filename(i)) {
-          index = i;
-          break;
-        }
+      if (mp.find(filename) == mp.end()) {
+        LBANN_ERROR("mp.find(filename) == mp.end()");
       }
+      index = mp[filename];
     }
     m_sample_list.emplace_back(std::make_pair(index, s.second));
   }

From aa4e4bd860b8afaca6f8a34d4cb2ef01dc0c0980 Mon Sep 17 00:00:00 2001
From: "Brian C. Van Essen"
Date: Wed, 9 Oct 2019 12:16:06 -0700
Subject: [PATCH 337/634] Updated to remove second lookup in table.

---
 include/lbann/data_readers/sample_list_open_files_impl.hpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/lbann/data_readers/sample_list_open_files_impl.hpp b/include/lbann/data_readers/sample_list_open_files_impl.hpp
index ae869170b15..3839ca2a425 100644
--- a/include/lbann/data_readers/sample_list_open_files_impl.hpp
+++ b/include/lbann/data_readers/sample_list_open_files_impl.hpp
@@ -551,10 +551,11 @@ ::all_gather_packed_lists(lbann_comm& comm) {
       }
       mp[filename] = index;
     }else {
-      if (mp.find(filename) == mp.end()) {
+      auto search_result = mp.find(filename);
+      if (search_result == mp.end()) {
         LBANN_ERROR("mp.find(filename) == mp.end()");
       }
-      index = mp[filename];
+      index = search_result->second;
     }
     m_sample_list.emplace_back(std::make_pair(index, s.second));
   }

From 3a1381fe478ddc7648db78a4ea75baedc532e5c9 Mon Sep 17 00:00:00 2001
From: "David A.
Hysom" Date: Wed, 9 Oct 2019 14:09:04 -0700 Subject: [PATCH 338/634] added a couple of timing statements --- src/data_readers/data_reader_jag_conduit.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index bf603c02331..9b277cc1d9d 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -833,12 +833,12 @@ void data_reader_jag_conduit::load() { /// The use of these flags need to be updated to properly separate /// how index lists are used between trainers and models /// @todo m_list_per_trainer || m_list_per_model + double tm2 = get_time(); load_list_of_samples(sample_list_file, m_comm->get_procs_per_trainer(), m_comm->get_rank_in_trainer()); if(is_master()) { - if (check_data) { - std::cout << "Finished sample list, check data" << std::endl; - } else { - std::cout << "Finished sample list, skipping check data" << std::endl; + std::cout << "Finished loadingsample list; time: " << get_time() - tm2 << std::endl; + if (!check_data) { + std::cout << "Skipping check data" << std::endl; } } @@ -874,6 +874,7 @@ void data_reader_jag_conduit::load() { } /// Merge all of the sample lists + tm2 = get_time(); m_sample_list.all_gather_packed_lists(*m_comm); if (opts->has_string("write_sample_list") && m_comm->am_trainer_master()) { { @@ -886,6 +887,9 @@ void data_reader_jag_conduit::load() { s << basename << "." << ext; m_sample_list.write(s.str()); } + if (is_master()) { + std::cout << "time for all_gather_packed_lists: " << get_time() - tm2 << std::endl; + } m_shuffled_indices.resize(m_sample_list.size()); std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); From 68003a3eeb18caa69ad4dd30494b5d2c8059d0a1 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 9 Oct 2019 14:19:31 -0700 Subject: [PATCH 339/634] Add Catch2 tests for embedded Python session (#1286) * Add Catch2 tests for embedded Python utility functions Add convenience functions for Python int, float, and str objects. * Test Python initialization and finalization with Catch2 * Embedded Python session can recover gracefully from an error * Add Catch2 test for Python object constructor with null pointer * Make section names more descriptinve in Python util Catch2 tests Adding some tests for Python float convenience functions --- include/lbann/utils/python.hpp | 12 +- src/utils/python.cpp | 36 ++++ src/utils/unit_test/CMakeLists.txt | 1 + src/utils/unit_test/python_test.cpp | 269 ++++++++++++++++++++++++++++ 4 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 src/utils/unit_test/python_test.cpp diff --git a/include/lbann/utils/python.hpp b/include/lbann/utils/python.hpp index 0bcbaf34ff0..e762ae3f842 100644 --- a/include/lbann/utils/python.hpp +++ b/include/lbann/utils/python.hpp @@ -61,8 +61,9 @@ bool is_active(); /** @brief Check if a Python error has occurred. * - * Throws an exception if a Python error is detected. The GIL is - * acquired internally. + * If a Python error is detected, then the Python error indicator is + * cleared and a C++ exception is thrown. The GIL is acquired + * internally. * * @param force_error Whether to force an exception to be thrown. */ @@ -148,6 +149,13 @@ class object { */ PyObject* release() noexcept; + /** Convert Python @c str to C++ @c std::string. */ + operator std::string(); + /** Convert Python @c int to C++ @c long. */ + operator long(); + /** Convert Python @c float to C++ @c double. 
*/ + operator double(); + private: /** Python object pointer. */ diff --git a/src/utils/python.cpp b/src/utils/python.cpp index ad73ed992e1..c248ca6715f 100644 --- a/src/utils/python.cpp +++ b/src/utils/python.cpp @@ -136,6 +136,7 @@ void check_error(bool force_error) { // Clean up and throw exception PyErr_Restore(type.release(), value.release(), traceback.release()); + PyErr_Clear(); LBANN_ERROR(err.str()); } @@ -221,6 +222,41 @@ PyObject* object::release() noexcept { return old_ptr; } +object::operator std::string() { + global_interpreter_lock gil; + if (m_ptr == nullptr) { + LBANN_ERROR("Attempted to convert Python object to std::string, " + "but it has not been set"); + } + object python_str = PyObject_Str(m_ptr); + std::ostringstream ss; + ss << PyUnicode_AsUTF8(python_str); + check_error(); + return ss.str(); +} + +object::operator long() { + global_interpreter_lock gil; + if (m_ptr == nullptr) { + LBANN_ERROR("Attempted to convert Python object to long, " + "but it has not been set"); + } + auto val = PyLong_AsLong(m_ptr); + check_error(); + return val; +} + +object::operator double() { + global_interpreter_lock gil; + if (m_ptr == nullptr) { + LBANN_ERROR("Attempted to convert Python object to double, " + "but it has not been set"); + } + auto val = PyFloat_AsDouble(m_ptr); + check_error(); + return val; +} + } // namespace python } // namespace lbann diff --git a/src/utils/unit_test/CMakeLists.txt b/src/utils/unit_test/CMakeLists.txt index 35aba73754b..343bcbbb753 100644 --- a/src/utils/unit_test/CMakeLists.txt +++ b/src/utils/unit_test/CMakeLists.txt @@ -3,6 +3,7 @@ set_full_path(_DIR_LBANN_CATCH2_TEST_FILES beta_distribution_test.cpp factory_test.cpp image_test.cpp + python_test.cpp random_test.cpp type_erased_matrix_test.cpp ) diff --git a/src/utils/unit_test/python_test.cpp b/src/utils/unit_test/python_test.cpp new file mode 100644 index 00000000000..595c070b74b --- /dev/null +++ b/src/utils/unit_test/python_test.cpp @@ -0,0 +1,269 @@ +// MUST include this +#include + +// File being tested +#include + +#ifdef LBANN_HAS_PYTHON +TEST_CASE ("Testing the embedded Python session", "[python][utilities]") { + + SECTION ("Initializing and finalizing the Python session") { + REQUIRE_NOTHROW(lbann::python::initialize()); + REQUIRE(lbann::python::is_active()); + REQUIRE_NOTHROW(lbann::python::initialize()); + REQUIRE(lbann::python::is_active()); + REQUIRE_NOTHROW(lbann::python::finalize()); + REQUIRE_FALSE(lbann::python::is_active()); + REQUIRE_NOTHROW(lbann::python::finalize()); + REQUIRE_FALSE(lbann::python::is_active()); + REQUIRE_NOTHROW(lbann::python::initialize()); + REQUIRE(lbann::python::is_active()); + } + + SECTION ("Acquiring the global interpreter lock") { + SECTION ("Acquiring GIL once") { + std::unique_ptr gil; + REQUIRE_NOTHROW(gil.reset(new lbann::python::global_interpreter_lock())); + REQUIRE_NOTHROW(gil.reset()); + } + SECTION ("Acquiring GIL recursively") { + std::unique_ptr gil1, gil2, gil3; + REQUIRE_NOTHROW(gil1.reset(new lbann::python::global_interpreter_lock())); + REQUIRE_NOTHROW(gil2.reset(new lbann::python::global_interpreter_lock())); + REQUIRE_NOTHROW(gil3.reset(new lbann::python::global_interpreter_lock())); + REQUIRE_NOTHROW(gil3.reset()); + REQUIRE_NOTHROW(gil2.reset()); + REQUIRE_NOTHROW(gil1.reset()); + } + } + + SECTION ("Python error checking") { + lbann::python::global_interpreter_lock gil; + REQUIRE_NOTHROW(lbann::python::check_error()); + REQUIRE_THROWS(lbann::python::check_error(true)); + REQUIRE_NOTHROW(lbann::python::check_error()); + + SECTION 
("Raising Python exception") { + PyObject* main = PyImport_ImportModule("__main__"); + std::string func_def = R"( +def throw_exception(): + raise RuntimeError('This error is expected') +)"; + PyRun_SimpleString(func_def.c_str()); + PyObject_CallMethod(main, "throw_exception", "()"); + REQUIRE_THROWS(lbann::python::check_error()); + Py_DECREF(main); + REQUIRE_NOTHROW(lbann::python::check_error()); + } + + SECTION ("Making syntax error") { + PyObject* main = PyImport_ImportModule("__main__"); + std::string func_def = R"( +def make_syntax_error(): + this should throw a NameError +)"; + PyRun_SimpleString(func_def.c_str()); + PyObject_CallMethod(main, "make_syntax_error", "()"); + REQUIRE_THROWS(lbann::python::check_error()); + Py_DECREF(main); + REQUIRE_NOTHROW(lbann::python::check_error()); + } + + SECTION ("Passing bad arguments into Python/C API") { + PyLong_AsLong(nullptr); + REQUIRE_THROWS(lbann::python::check_error()); + REQUIRE_NOTHROW(lbann::python::check_error()); + } + + } + + SECTION ("Python object wrapper") { + lbann::python::global_interpreter_lock gil; + + SECTION ("Default constructor") { + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object())); + REQUIRE(*obj == nullptr); + REQUIRE_NOTHROW(obj.reset()); + } + + SECTION ("Constructor with raw Python object pointer") { + PyObject* ptr = Py_BuildValue("(i,d,s)", 987, 6.54, "321"); + REQUIRE(ptr != nullptr); + Py_INCREF(ptr); + REQUIRE(Py_REFCNT(ptr) == 2); + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(ptr))); + REQUIRE(Py_REFCNT(ptr) == 2); + REQUIRE_NOTHROW(obj.reset()); + REQUIRE(Py_REFCNT(ptr) == 1); + Py_DECREF(ptr); + REQUIRE_NOTHROW(lbann::python::check_error()); + } + + SECTION ("Constructor with null pointer") { + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(nullptr))); + REQUIRE_NOTHROW(obj.reset()); + REQUIRE_NOTHROW(lbann::python::check_error()); + } + + SECTION ("Access functions to raw Python object pointer") { + PyObject* ptr = Py_BuildValue("(i,d,s)", 12, 3.4, "56"); + REQUIRE(ptr != nullptr); + Py_INCREF(ptr); + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(ptr))); + REQUIRE(Py_REFCNT(ptr) == 2); + REQUIRE(obj->get() == ptr); + REQUIRE(Py_REFCNT(ptr) == 2); + REQUIRE(const_cast(*obj).get() == ptr); + REQUIRE(Py_REFCNT(ptr) == 2); + REQUIRE(*obj == ptr); + REQUIRE(Py_REFCNT(ptr) == 2); + REQUIRE(const_cast(*obj) == ptr); + REQUIRE(Py_REFCNT(ptr) == 2); + REQUIRE(obj->release() == ptr); + REQUIRE(obj->get() == nullptr); + REQUIRE(Py_REFCNT(ptr) == 2); + REQUIRE_NOTHROW(obj.reset()); + REQUIRE(Py_REFCNT(ptr) == 2); + Py_DECREF(ptr); + Py_DECREF(ptr); + REQUIRE_NOTHROW(lbann::python::check_error()); + } + + SECTION ("Copy constructor") { + PyObject* ptr = Py_BuildValue("(i,d,s)", 98, 7.6, "54"); + std::unique_ptr obj1(new lbann::python::object(ptr)); + std::unique_ptr obj2; + REQUIRE_NOTHROW(obj2.reset(new lbann::python::object(*obj1))); + REQUIRE(*obj1 == ptr); + REQUIRE(*obj2 == ptr); + REQUIRE(Py_REFCNT(ptr) == 2); + obj1.reset(); + REQUIRE(Py_REFCNT(ptr) == 1); + obj2.reset(); + REQUIRE_NOTHROW(lbann::python::check_error()); + } + + SECTION ("Copy assignment operator") { + PyObject* ptr1 = Py_BuildValue("(i,d,s)", 1, 2., "3"); + PyObject* ptr2 = Py_BuildValue("(i,d,s)", 4, 5., "6"); + Py_INCREF(ptr1); + Py_INCREF(ptr2); + REQUIRE((Py_REFCNT(ptr1) == 2 && Py_REFCNT(ptr2) == 2)); + std::unique_ptr obj1(new lbann::python::object(ptr1)); + std::unique_ptr obj2(new lbann::python::object(ptr2)); + 
REQUIRE_NOTHROW(*obj2 = *obj1); + REQUIRE(*obj1 == ptr1); + REQUIRE(*obj2 == ptr1); + REQUIRE((Py_REFCNT(ptr1) == 3 && Py_REFCNT(ptr2) == 1)); + obj1.reset(); + REQUIRE((Py_REFCNT(ptr1) == 2 && Py_REFCNT(ptr2) == 1)); + obj2.reset(); + Py_DECREF(ptr1); + Py_DECREF(ptr2); + REQUIRE_NOTHROW(lbann::python::check_error()); + } + + SECTION ("Move constructor") { + PyObject* ptr = Py_BuildValue("(i,d,s)", 987, 65.4, "three two one"); + std::unique_ptr obj1(new lbann::python::object(ptr)); + std::unique_ptr obj2; + REQUIRE_NOTHROW(obj2.reset(new lbann::python::object(std::move(*obj1)))); + REQUIRE(*obj1 == nullptr); + REQUIRE(*obj2 == ptr); + REQUIRE(Py_REFCNT(ptr) == 1); + obj1.reset(); + REQUIRE(Py_REFCNT(ptr) == 1); + obj2.reset(); + REQUIRE_NOTHROW(lbann::python::check_error()); + } + + SECTION ("Move assignment operator") { + PyObject* ptr1 = Py_BuildValue("(i,d,s)", 9, 8., "7"); + PyObject* ptr2 = Py_BuildValue("(i,d,s)", 6, 5., "4"); + Py_INCREF(ptr1); + Py_INCREF(ptr2); + REQUIRE((Py_REFCNT(ptr1) == 2 && Py_REFCNT(ptr2) == 2)); + std::unique_ptr obj1(new lbann::python::object(ptr1)); + std::unique_ptr obj2(new lbann::python::object(ptr2)); + REQUIRE_NOTHROW(*obj2 = std::move(*obj1)); + REQUIRE(*obj1 == nullptr); + REQUIRE(*obj2 == ptr1); + REQUIRE((Py_REFCNT(ptr1) == 2 && Py_REFCNT(ptr2) == 1)); + obj1.reset(); + REQUIRE((Py_REFCNT(ptr1) == 2 && Py_REFCNT(ptr2) == 1)); + obj2.reset(); + Py_DECREF(ptr1); + Py_DECREF(ptr2); + REQUIRE_NOTHROW(lbann::python::check_error()); + } + + SECTION ("Convenience functions for Python str") { + SECTION ("Empty string"){ + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(""))); + REQUIRE(static_cast(*obj).empty()); + } + SECTION ("Non-empty string"){ + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object("one two three"))); + REQUIRE(static_cast(*obj) == "one two three"); + } + } + + SECTION ("Convenience functions for Python int") { + SECTION ("Zero value") { + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(0l))); + REQUIRE(static_cast(*obj) == 0l); + } + SECTION ("Positive value") { + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(123l))); + REQUIRE(static_cast(*obj) == 123l); + } + SECTION ("Negative value") { + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(-321l))); + REQUIRE(static_cast(*obj) == -321l); + } + } + + SECTION ("Convenience functions for Python float") { + SECTION ("Zero value") { + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(0.0))); + REQUIRE(static_cast(*obj) == 0.0); + } + SECTION ("Positive value") { + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(3.21))); + REQUIRE(static_cast(*obj) == 3.21); + } + SECTION ("Negative value") { + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(-12.3))); + REQUIRE(static_cast(*obj) == -12.3); + } + SECTION ("Infinite value") { + constexpr double inf = std::numeric_limits::infinity(); + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(inf))); + REQUIRE(static_cast(*obj) == inf); + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(-inf))); + REQUIRE(static_cast(*obj) == -inf); + } + SECTION ("NaN value") { + std::unique_ptr obj; + REQUIRE_NOTHROW(obj.reset(new lbann::python::object(std::nan("")))); + REQUIRE(std::isnan(static_cast(*obj))); + } + } + + } + +} +#endif // LBANN_HAS_PYTHON From 89b300db5ae7737f6ff31322e7d8b90b03529089 Mon Sep 17 00:00:00 2001 
From: Tim Moon Date: Mon, 14 Oct 2019 09:23:44 -0700 Subject: [PATCH 340/634] Cleanup protobuf message for trainer (#1255) * Remove num_gpus field from Trainer protobuf message It was unused * Use default Hydrogen block size --- bamboo/common_python/test_tools.py | 14 ++++----- bamboo/common_python/tools.py | 3 +- .../models/alexnet/model_alexnet.prototext | 3 -- .../model_autoencoder_chem_ecfp.prototext | 2 -- ...er_chem_ecfp_200x150x100x100x100.prototext | 2 -- ...utoencoder_chem_ecfp_500x250x100.prototext | 2 -- .../model_autoencoder_chem_sigmoid.prototext | 2 -- .../model_dnn_chem_ecfp.prototext | 2 -- .../model_autoencoder_cifar10.prototext | 3 -- .../model_conv_autoencoder_cifar10.prototext | 2 -- .../model_conv_autoencoder_imagenet.prototext | 2 -- .../model_autoencoder_mnist.prototext | 3 -- .../model_conv_autoencoder_mnist.prototext | 2 -- .../autoencoder_mnist/vae_mnist.prototext | 1 - .../candle/pilot1/ae_nodeselect_gdc.prototext | 3 -- .../models/candle/pilot1/combo.prototext | 3 -- .../cosmoflow/model_cosmoflow.prototext | 3 -- .../densenet/generated_densenet.prototext | 1 - .../gan/jags/cycle_gan/cycgan_m1.prototext | 1 - .../cycle_gan/cycgan_m1_template.prototext | 3 -- .../gan/jags/cycle_gan/cycgan_m2.prototext | 1 - .../cycle_gan/cycgan_m2_template.prototext | 3 -- .../gan/jags/cycle_gan/cycgan_m3.prototext | 1 - .../cycle_gan/cycgan_m3_template.prototext | 3 -- .../gan/mnist/adversarial_model.prototext | 3 -- .../gan/mnist/discriminator_model.prototext | 3 -- .../jag/gan/cyclic/cyclic_gan_model.prototext | 2 -- .../jag/gan/cyclic/model_template.prototext | 3 -- .../models/jag/gan/vanilla/gan.prototext | 2 -- .../jag/gan/vanilla/gan_template.prototext | 3 -- model_zoo/models/jag/vae_fcn.prototext | 3 -- model_zoo/models/jag/wae.prototext | 2 -- .../jag/wae_cycle_gan/cycle_gan.prototext | 2 -- .../wae_cycle_gan/cycle_gan_only.prototext | 2 -- .../models/jag/wae_cycle_gan/wae.prototext | 2 -- .../jag/wae_cycle_gan/wae_fw_inv.prototext | 3 -- .../jag/wae_cycle_gan/wae_nobn.prototext | 1 - .../lenet_mnist/model_lenet_mnist.prototext | 3 -- ...onv_molecular_autoencoder_pilot2.prototext | 2 -- ...olecular_bead_autoencoder_pilot2.prototext | 2 -- ...del_molecular_autoencoder_pilot2.prototext | 2 -- .../models/python/keras/mnist_cnn.prototext | 1 - .../models/resnet50/model_resnet50.prototext | 3 -- .../siamese/finetune-cub/model_cub.prototext | 3 -- .../model_cub_batchnorm.prototext | 3 -- ...batchnorm_transferred_and_frozen.prototext | 3 -- ..._alexnet_batchnorm_dag_frozen_bn.prototext | 3 -- .../model_mnist_simple_1.prototext | 3 -- .../model_mnist_simple_2.prototext | 3 -- .../jag_single_layer_ae.prototext | 3 -- .../model_channelwise_mean.prototext | 3 -- .../tests/layer_tests/model_clamp.prototext | 3 -- .../layer_tests/model_covariance.prototext | 3 -- .../tests/layer_tests/model_elu.prototext | 3 -- .../layer_tests/model_identity.prototext | 3 -- .../tests/layer_tests/model_l1_norm.prototext | 3 -- .../layer_tests/model_l2_norm2.prototext | 3 -- .../layer_tests/model_leaky_relu.prototext | 3 -- .../layer_tests/model_log_sigmoid.prototext | 3 -- .../layer_tests/model_log_softmax.prototext | 3 -- .../model_mean_absolute_error.prototext | 3 -- .../tests/layer_tests/model_relu.prototext | 3 -- .../tests/layer_tests/model_selu.prototext | 3 -- .../tests/layer_tests/model_sigmoid.prototext | 3 -- .../tests/layer_tests/model_softmax.prototext | 3 -- .../layer_tests/model_softplus.prototext | 3 -- .../layer_tests/model_softsign.prototext | 3 -- 
.../model_squared_difference.prototext | 1 - .../layer_tests/model_tessellate.prototext | 1 - .../layer_tests/model_variance.prototext | 3 -- .../tests/model_jag_single_layer_ae.prototext | 3 -- .../tests/model_lenet_mnist_ckpt.prototext | 3 -- .../model_lenet_mnist_dist_ckpt.prototext | 3 -- .../model_lenet_mnist_lbann2ckpt.prototext | 3 -- .../tests/model_mnist_conv_graph.prototext | 2 +- .../model_mnist_ridge_regression.prototext | 4 +-- .../model_mnist_softmax_classifier.prototext | 4 +-- python/lbann/trainer.py | 27 +++++++++------- src/proto/proto_common.cpp | 9 +++--- src/proto/trainer.proto | 31 ++++++++++++++++--- src/utils/lbann_library.cpp | 8 ++--- 81 files changed, 60 insertions(+), 223 deletions(-) diff --git a/bamboo/common_python/test_tools.py b/bamboo/common_python/test_tools.py index c15bc5f36d5..2146ba05b3b 100644 --- a/bamboo/common_python/test_tools.py +++ b/bamboo/common_python/test_tools.py @@ -22,7 +22,7 @@ num_epochs=7, optimizer_name='adagrad', processes_per_model=10, - extra_lbann_flags={'block_size': 4, 'print_affinity': None}, + extra_lbann_flags={'print_affinity': None}, output_file_name='output_file', error_file_name='error_file', check_executable_existence=False) @@ -30,31 +30,31 @@ def test_command_catalyst(): actual = tools.get_command(cluster='catalyst', **d) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' + expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected def test_command_corona(): actual = tools.get_command(cluster='corona', **d) - expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' + expected = 'salloc --nodes=20 --partition=pdebug --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected def test_command_lassen(): actual = tools.get_command(cluster='lassen', **d) - expected = 'bsub -G guests -Is -q pdebug -nnodes 20 -W 30 jsrun -b "packed:10" -c 40 -g 4 -d packed -n 16 -r 1 -a 4 exe --data_filedir=gpfs1/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext 
--data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' + expected = 'bsub -G guests -Is -q pdebug -nnodes 20 -W 30 jsrun -b "packed:10" -c 40 -g 4 -d packed -n 16 -r 1 -a 4 exe --data_filedir=gpfs1/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected def test_command_pascal(): actual = tools.get_command(cluster='pascal', **d) - expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' + expected = 'salloc --nodes=20 --partition=pbatch --time=30 srun --mpibind=off --time=30 --ntasks=40 exe --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected def test_command_ray(): actual = tools.get_command(cluster='ray', **d) - expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun --timeout=30 -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --block_size=4 --print_affinity > output_file 2> error_file' + expected = 'bsub -x -G guests -Is -n 40 -q pdebug -R "span[ptile=2]" -W 30 mpirun --timeout=30 -np 40 -N 2 exe --data_filedir=gscratchr/filedir --reader=dir/model_zoo/data_readers/data_reader_mnist.prototext --data_reader_percent=0.100000 --exit_after_setup --mini_batch_size=15 --model=dir/model_zoo/models/folder/model_lenet.prototext --num_epochs=7 --optimizer=dir/model_zoo/optimizers/opt_adagrad.prototext --procs_per_model=10 --print_affinity > output_file 2> error_file' assert actual == expected @@ -433,7 +433,7 @@ def test_bad_extra_lbann_flags_invalid_flag(): actual = str(e) expected = ("Invalid Usage: extra_lbann_flags includes invalid" " flag=invalid_flag. 
Flags must" - " be in ['block_size', 'procs_per_trainer', 'num_gpus'," + " be in ['hydrogen_block_size', 'procs_per_trainer'," " 'num_parallel_readers', 'num_io_threads', 'serialize_io'," " 'disable_background_io_activity', 'disable_cuda'," " 'random_seed', 'objective_function', 'data_layout'," diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index b27f6599a83..11bf9722bf5 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -455,9 +455,8 @@ def get_command(cluster, # General: # 'mini_batch_size', # 'num_epochs', - 'block_size', + 'hydrogen_block_size', 'procs_per_trainer', - 'num_gpus', 'num_parallel_readers', 'num_io_threads', 'serialize_io', diff --git a/model_zoo/models/alexnet/model_alexnet.prototext b/model_zoo/models/alexnet/model_alexnet.prototext index a620616b698..40c20addb55 100644 --- a/model_zoo/models/alexnet/model_alexnet.prototext +++ b/model_zoo/models/alexnet/model_alexnet.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext index 7774b0ffbb3..21026e35f17 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 num_parallel_readers: 1 } model { diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_200x150x100x100x100.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_200x150x100x100x100.prototext index e322dccfa01..2c3633c89a5 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_200x150x100x100x100.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_200x150x100x100x100.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 num_parallel_readers: 1 } model { diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext index 715ac13b195..8d0e41430c4 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_ecfp_500x250x100.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 num_parallel_readers: 1 } model { diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext index 62bb76d84d1..8122e4ceff3 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext +++ b/model_zoo/models/autoencoder_candle_pilot1/model_autoencoder_chem_sigmoid.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 num_parallel_readers: 1 } model { diff --git a/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext b/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext index 9c8dc907072..2d0305e249d 100644 --- a/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext +++ 
b/model_zoo/models/autoencoder_candle_pilot1/model_dnn_chem_ecfp.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 num_parallel_readers: 1 } model { diff --git a/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext b/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext index 29dbbbcc6ba..4ee76fadfe4 100644 --- a/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext +++ b/model_zoo/models/autoencoder_cifar10/model_autoencoder_cifar10.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "model_parallel" diff --git a/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext b/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext index 463b9f50484..1083a28df0b 100644 --- a/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext +++ b/model_zoo/models/autoencoder_cifar10/model_conv_autoencoder_cifar10.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - #procs_per_trainer: 12 num_parallel_readers: 1 } model { diff --git a/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext b/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext index 95acabd67f7..21c05ea8f63 100644 --- a/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext +++ b/model_zoo/models/autoencoder_imagenet/model_conv_autoencoder_imagenet.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 num_parallel_readers: 1 } model { diff --git a/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext b/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext index f8321142088..87d2d8f6585 100644 --- a/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext +++ b/model_zoo/models/autoencoder_mnist/model_autoencoder_mnist.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "model_parallel" diff --git a/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext b/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext index a7f0b9ba513..d2d162c74b0 100644 --- a/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext +++ b/model_zoo/models/autoencoder_mnist/model_conv_autoencoder_mnist.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 num_parallel_readers: 1 } model { diff --git a/model_zoo/models/autoencoder_mnist/vae_mnist.prototext b/model_zoo/models/autoencoder_mnist/vae_mnist.prototext index 9f22ed365a8..a5c26f2ae7f 100644 --- a/model_zoo/models/autoencoder_mnist/vae_mnist.prototext +++ b/model_zoo/models/autoencoder_mnist/vae_mnist.prototext @@ -1,7 +1,6 @@ # LBANN implementation of MNIST VAE in Doersch's autoencoder tutorial # See https://github.com/cdoersch/vae_tutorial/blob/master/mnist_vae.prototxt trainer { - block_size: 256 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext b/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext index b1c68492b9e..815924a0e86 100644 --- a/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext +++ b/model_zoo/models/candle/pilot1/ae_nodeselect_gdc.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "model_parallel" diff --git 
a/model_zoo/models/candle/pilot1/combo.prototext b/model_zoo/models/candle/pilot1/combo.prototext index dcb39f8031e..25807cdec5c 100644 --- a/model_zoo/models/candle/pilot1/combo.prototext +++ b/model_zoo/models/candle/pilot1/combo.prototext @@ -1,9 +1,6 @@ #Example taken from:https://github.com/ECP-CANDLE/Benchmarks/tree/frameworks/Pilot1/Combo #Timestamp 03/07/2018 8:30PM trainer{ - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "model_parallel" diff --git a/model_zoo/models/cosmoflow/model_cosmoflow.prototext b/model_zoo/models/cosmoflow/model_cosmoflow.prototext index 782682f97bf..eab8673523a 100644 --- a/model_zoo/models/cosmoflow/model_cosmoflow.prototext +++ b/model_zoo/models/cosmoflow/model_cosmoflow.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { type: "directed_acyclic_graph_model" diff --git a/model_zoo/models/densenet/generated_densenet.prototext b/model_zoo/models/densenet/generated_densenet.prototext index 2342c1bf3e9..bdb2ed9b015 100644 --- a/model_zoo/models/densenet/generated_densenet.prototext +++ b/model_zoo/models/densenet/generated_densenet.prototext @@ -83,7 +83,6 @@ model { } mini_batch_size: 256 num_epochs: 90 - block_size: 256 layer { name: "layer1" children: "layer2 layer3" diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext index f2a919a628c..015816ca877 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1.prototext @@ -1,5 +1,4 @@ trainer { - block_size: 256 } model { objective_function { diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext index b77befee088..2fbcf3df83c 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m1_template.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext index 053dae3b448..2f2148b32aa 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2.prototext @@ -1,5 +1,4 @@ trainer { - block_size: 256 } model { objective_function { diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext index 758d7d353e1..521269aa275 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m2_template.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext index 81560f62300..59d45326a8f 100644 --- a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3.prototext @@ -1,5 +1,4 @@ trainer { - block_size: 256 } model { objective_function { diff --git a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext index d6d8ed35499..a8c24c84878 100644 --- 
a/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext +++ b/model_zoo/models/gan/jags/cycle_gan/cycgan_m3_template.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/gan/mnist/adversarial_model.prototext b/model_zoo/models/gan/mnist/adversarial_model.prototext index 348e618a66e..8d15ed42da7 100644 --- a/model_zoo/models/gan/mnist/adversarial_model.prototext +++ b/model_zoo/models/gan/mnist/adversarial_model.prototext @@ -1,8 +1,5 @@ #Adversarial Model trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "model_parallel" diff --git a/model_zoo/models/gan/mnist/discriminator_model.prototext b/model_zoo/models/gan/mnist/discriminator_model.prototext index fde26792721..db66dc80fa8 100644 --- a/model_zoo/models/gan/mnist/discriminator_model.prototext +++ b/model_zoo/models/gan/mnist/discriminator_model.prototext @@ -1,8 +1,5 @@ #Discriminator Model trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "model_parallel" diff --git a/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext b/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext index 59f04954428..72a6721ef7a 100644 --- a/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext +++ b/model_zoo/models/jag/gan/cyclic/cyclic_gan_model.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer:0 } model { objective_function { diff --git a/model_zoo/models/jag/gan/cyclic/model_template.prototext b/model_zoo/models/jag/gan/cyclic/model_template.prototext index d41b06eecab..e490348cdd6 100644 --- a/model_zoo/models/jag/gan/cyclic/model_template.prototext +++ b/model_zoo/models/jag/gan/cyclic/model_template.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/jag/gan/vanilla/gan.prototext b/model_zoo/models/jag/gan/vanilla/gan.prototext index 8a4a408fabf..ef720e0b0d2 100644 --- a/model_zoo/models/jag/gan/vanilla/gan.prototext +++ b/model_zoo/models/jag/gan/vanilla/gan.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer:0 } model { random_init_models_differently: true diff --git a/model_zoo/models/jag/gan/vanilla/gan_template.prototext b/model_zoo/models/jag/gan/vanilla/gan_template.prototext index f3cfbe4cb55..af3ccc98014 100644 --- a/model_zoo/models/jag/gan/vanilla/gan_template.prototext +++ b/model_zoo/models/jag/gan/vanilla/gan_template.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/jag/vae_fcn.prototext b/model_zoo/models/jag/vae_fcn.prototext index ed560d473f9..bbf9b38e349 100644 --- a/model_zoo/models/jag/vae_fcn.prototext +++ b/model_zoo/models/jag/vae_fcn.prototext @@ -2,9 +2,6 @@ #https://lc.llnl.gov/bitbucket/users/jjayaram/repos/deep-latent-spaces/browse/codes/dev/VAE-FCN/run_vae.py #Timestamp 02/26/2018 8:45AM trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "model_parallel" diff --git a/model_zoo/models/jag/wae.prototext b/model_zoo/models/jag/wae.prototext index 8c7a125c64a..d14a2b56b8b 100644 --- a/model_zoo/models/jag/wae.prototext +++ b/model_zoo/models/jag/wae.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer:0 } model { 
random_init_models_differently: true diff --git a/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext b/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext index 408daf91122..70464a0e4b0 100644 --- a/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/cycle_gan.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer:0 } model { name: "cycgan_model" diff --git a/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext b/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext index d0d79a4244f..ba5673289a5 100644 --- a/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/cycle_gan_only.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer:0 } model { name: "cycgan_model" diff --git a/model_zoo/models/jag/wae_cycle_gan/wae.prototext b/model_zoo/models/jag/wae_cycle_gan/wae.prototext index ebefbe75fa5..c36f9460080 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer:0 } model { random_init_models_differently: true diff --git a/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext b/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext index b8e70ace800..391b844f29b 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae_fw_inv.prototext @@ -1,8 +1,5 @@ #Augumented version of ae_cyc.prototext so we can we ae_loss, fw_latent_loss and fw_out_loss all in the same file instead of 3 files, a request from MLSI ML team. This augmentation involves replicating blocks for fw_model from cycle gan and encode from autoencoder. trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { name: "wae_fw_inv_model" diff --git a/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext b/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext index 4dfbf4d52f8..71ad7e0b3a3 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext @@ -1,5 +1,4 @@ trainer { - block_size: 256 } model { random_init_models_differently: true diff --git a/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext b/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext index 9d2c4bcf368..39aa4bd6eb1 100644 --- a/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext +++ b/model_zoo/models/lenet_mnist/model_lenet_mnist.prototext @@ -1,7 +1,4 @@ trainer{ - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext index a6e7b0f6442..872b81db215 100644 --- a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_autoencoder_pilot2.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 num_parallel_readers: 1 } model { diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext index 8603f2bd397..316dc5c077c 100644 --- 
a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_conv_molecular_bead_autoencoder_pilot2.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 num_parallel_readers: 1 } model { diff --git a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext index 99a392e63d8..3280af90967 100644 --- a/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext +++ b/model_zoo/models/molecular_autoencoder_candle_pilot2/model_molecular_autoencoder_pilot2.prototext @@ -1,6 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 num_parallel_readers: 1 } model { diff --git a/model_zoo/models/python/keras/mnist_cnn.prototext b/model_zoo/models/python/keras/mnist_cnn.prototext index f32940773dc..f06df78d39e 100644 --- a/model_zoo/models/python/keras/mnist_cnn.prototext +++ b/model_zoo/models/python/keras/mnist_cnn.prototext @@ -132,5 +132,4 @@ model { print { } } - block_size: 256 } diff --git a/model_zoo/models/resnet50/model_resnet50.prototext b/model_zoo/models/resnet50/model_resnet50.prototext index e617beef9b5..e4aed42fa18 100644 --- a/model_zoo/models/resnet50/model_resnet50.prototext +++ b/model_zoo/models/resnet50/model_resnet50.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/siamese/finetune-cub/model_cub.prototext b/model_zoo/models/siamese/finetune-cub/model_cub.prototext index af345911707..fc633acef11 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext index 1e2b87a0753..04758c0b85e 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext index 04d0a8c8efe..0fe7b36f50c 100644 --- a/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext +++ b/model_zoo/models/siamese/finetune-cub/model_cub_batchnorm_transferred_and_frozen.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext index 555b2736cec..ea13206c441 100644 --- a/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext +++ b/model_zoo/models/siamese/triplet/model_triplet_alexnet_batchnorm_dag_frozen_bn.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 
0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext b/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext index 1727cb235d8..c4c4d6e0370 100644 --- a/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext +++ b/model_zoo/models/simple_mnist/model_mnist_simple_1.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext b/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext index 4a7b090d43a..4ed6dc1c269 100644 --- a/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext +++ b/model_zoo/models/simple_mnist/model_mnist_simple_2.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext b/model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext index 5f1684c27f8..b4aa6efff8b 100644 --- a/model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext +++ b/model_zoo/tests/data_reader_tests/jag_single_layer_ae.prototext @@ -4,10 +4,7 @@ model { serialize_io: true data_layout: "data_parallel" mini_batch_size: 128 - block_size: 256 num_epochs: 4 - num_parallel_readers: 0 - procs_per_trainer: 0 ################################################### # Objective function diff --git a/model_zoo/tests/layer_tests/model_channelwise_mean.prototext b/model_zoo/tests/layer_tests/model_channelwise_mean.prototext index 01e489066ce..fe8e1de1f1e 100644 --- a/model_zoo/tests/layer_tests/model_channelwise_mean.prototext +++ b/model_zoo/tests/layer_tests/model_channelwise_mean.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_clamp.prototext b/model_zoo/tests/layer_tests/model_clamp.prototext index f0120282b07..5d6aaf40fb1 100644 --- a/model_zoo/tests/layer_tests/model_clamp.prototext +++ b/model_zoo/tests/layer_tests/model_clamp.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_covariance.prototext b/model_zoo/tests/layer_tests/model_covariance.prototext index ad224af6739..18076486724 100644 --- a/model_zoo/tests/layer_tests/model_covariance.prototext +++ b/model_zoo/tests/layer_tests/model_covariance.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_elu.prototext b/model_zoo/tests/layer_tests/model_elu.prototext index e045dea1d11..88de9a6d908 100644 --- a/model_zoo/tests/layer_tests/model_elu.prototext +++ b/model_zoo/tests/layer_tests/model_elu.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_identity.prototext b/model_zoo/tests/layer_tests/model_identity.prototext index 606803d8721..aa26c7c46ea 100644 --- a/model_zoo/tests/layer_tests/model_identity.prototext +++ b/model_zoo/tests/layer_tests/model_identity.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: 
"data_parallel" diff --git a/model_zoo/tests/layer_tests/model_l1_norm.prototext b/model_zoo/tests/layer_tests/model_l1_norm.prototext index c2175bcb852..fd87729973d 100644 --- a/model_zoo/tests/layer_tests/model_l1_norm.prototext +++ b/model_zoo/tests/layer_tests/model_l1_norm.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_l2_norm2.prototext b/model_zoo/tests/layer_tests/model_l2_norm2.prototext index 694d374536e..e327e05846a 100644 --- a/model_zoo/tests/layer_tests/model_l2_norm2.prototext +++ b/model_zoo/tests/layer_tests/model_l2_norm2.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_leaky_relu.prototext b/model_zoo/tests/layer_tests/model_leaky_relu.prototext index 126c2962870..a5fccee5a42 100644 --- a/model_zoo/tests/layer_tests/model_leaky_relu.prototext +++ b/model_zoo/tests/layer_tests/model_leaky_relu.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_log_sigmoid.prototext b/model_zoo/tests/layer_tests/model_log_sigmoid.prototext index 8b559f9766d..af3b0526eb2 100644 --- a/model_zoo/tests/layer_tests/model_log_sigmoid.prototext +++ b/model_zoo/tests/layer_tests/model_log_sigmoid.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_log_softmax.prototext b/model_zoo/tests/layer_tests/model_log_softmax.prototext index 1de02b0342d..4645fe727d2 100644 --- a/model_zoo/tests/layer_tests/model_log_softmax.prototext +++ b/model_zoo/tests/layer_tests/model_log_softmax.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext b/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext index 8728d2bcb80..26521501938 100644 --- a/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext +++ b/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_relu.prototext b/model_zoo/tests/layer_tests/model_relu.prototext index 4acdbe6aaf1..edfb9ab5e89 100644 --- a/model_zoo/tests/layer_tests/model_relu.prototext +++ b/model_zoo/tests/layer_tests/model_relu.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_selu.prototext b/model_zoo/tests/layer_tests/model_selu.prototext index 007859a7bac..2b76d8f003b 100644 --- a/model_zoo/tests/layer_tests/model_selu.prototext +++ b/model_zoo/tests/layer_tests/model_selu.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_sigmoid.prototext b/model_zoo/tests/layer_tests/model_sigmoid.prototext index 13323653add..08f16f063ca 100644 --- a/model_zoo/tests/layer_tests/model_sigmoid.prototext +++ 
b/model_zoo/tests/layer_tests/model_sigmoid.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_softmax.prototext b/model_zoo/tests/layer_tests/model_softmax.prototext index 71ed61145fe..5e6891cb2ef 100644 --- a/model_zoo/tests/layer_tests/model_softmax.prototext +++ b/model_zoo/tests/layer_tests/model_softmax.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_softplus.prototext b/model_zoo/tests/layer_tests/model_softplus.prototext index c2543bcc1a4..7e5f31df652 100644 --- a/model_zoo/tests/layer_tests/model_softplus.prototext +++ b/model_zoo/tests/layer_tests/model_softplus.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_softsign.prototext b/model_zoo/tests/layer_tests/model_softsign.prototext index 3c83855f991..61979cb03a0 100644 --- a/model_zoo/tests/layer_tests/model_softsign.prototext +++ b/model_zoo/tests/layer_tests/model_softsign.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/layer_tests/model_squared_difference.prototext b/model_zoo/tests/layer_tests/model_squared_difference.prototext index 2142f7e5144..73de72545d9 100644 --- a/model_zoo/tests/layer_tests/model_squared_difference.prototext +++ b/model_zoo/tests/layer_tests/model_squared_difference.prototext @@ -1,5 +1,4 @@ trainer { - block_size: 256 } model { mini_batch_size: 11 diff --git a/model_zoo/tests/layer_tests/model_tessellate.prototext b/model_zoo/tests/layer_tests/model_tessellate.prototext index 5da0b5fa989..e48fd2a5005 100644 --- a/model_zoo/tests/layer_tests/model_tessellate.prototext +++ b/model_zoo/tests/layer_tests/model_tessellate.prototext @@ -1,5 +1,4 @@ trainer { - block_size: 256 } model { mini_batch_size: 11 diff --git a/model_zoo/tests/layer_tests/model_variance.prototext b/model_zoo/tests/layer_tests/model_variance.prototext index d01a9e9ce68..71bbc8f948e 100644 --- a/model_zoo/tests/layer_tests/model_variance.prototext +++ b/model_zoo/tests/layer_tests/model_variance.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/model_jag_single_layer_ae.prototext b/model_zoo/tests/model_jag_single_layer_ae.prototext index dfcb5501d72..4c8b42578a2 100644 --- a/model_zoo/tests/model_jag_single_layer_ae.prototext +++ b/model_zoo/tests/model_jag_single_layer_ae.prototext @@ -3,9 +3,6 @@ # Example on how to run: # srun --nodes=16 --ntasks=32 build/gnu.Release.catalyst.llnl.gov/lbann/build/model_zoo/lbann --model=model_zoo/tests/model_jag_single_layer_ae.prototext --optimizer=model_zoo/optimizers/opt_adam.prototext --reader=model_zoo/data_readers/data_reader_jag.prototext --metadata=model_zoo/models/jag/wae_cycle_gan/jag_100M_metadata.prototext trainer { - block_size: 256 - procs_per_trainer:0 - num_parallel_readers: 0 } model { name: "ae_model" diff --git a/model_zoo/tests/model_lenet_mnist_ckpt.prototext b/model_zoo/tests/model_lenet_mnist_ckpt.prototext index a0aed7f7c83..cd194143551 100644 --- a/model_zoo/tests/model_lenet_mnist_ckpt.prototext +++ 
b/model_zoo/tests/model_lenet_mnist_ckpt.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext b/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext index 0c325acd036..76611120369 100644 --- a/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext +++ b/model_zoo/tests/model_lenet_mnist_dist_ckpt.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext b/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext index b6c8b635ebb..09a11f5d9f6 100644 --- a/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext +++ b/model_zoo/tests/model_lenet_mnist_lbann2ckpt.prototext @@ -1,7 +1,4 @@ trainer { - block_size: 256 - procs_per_trainer: 0 - num_parallel_readers: 0 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/model_mnist_conv_graph.prototext b/model_zoo/tests/model_mnist_conv_graph.prototext index 61ecd70e0dd..9ef468864c3 100644 --- a/model_zoo/tests/model_mnist_conv_graph.prototext +++ b/model_zoo/tests/model_mnist_conv_graph.prototext @@ -1,5 +1,5 @@ trainer { - block_size: 257 + hydrogen_block_size: 257 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/model_mnist_ridge_regression.prototext b/model_zoo/tests/model_mnist_ridge_regression.prototext index 0edf0a32028..8670847404b 100644 --- a/model_zoo/tests/model_mnist_ridge_regression.prototext +++ b/model_zoo/tests/model_mnist_ridge_regression.prototext @@ -1,7 +1,5 @@ trainer { - block_size: 257 - procs_per_trainer: 0 - num_parallel_readers: 0 + hydrogen_block_size: 257 } model { data_layout: "data_parallel" diff --git a/model_zoo/tests/model_mnist_softmax_classifier.prototext b/model_zoo/tests/model_mnist_softmax_classifier.prototext index f776f5e89fb..941705fbbca 100644 --- a/model_zoo/tests/model_mnist_softmax_classifier.prototext +++ b/model_zoo/tests/model_mnist_softmax_classifier.prototext @@ -1,7 +1,5 @@ trainer { - block_size: 199 - procs_per_trainer: 0 - num_parallel_readers: 0 + hydrogen_block_size: 199 } model { data_layout: "data_parallel" diff --git a/python/lbann/trainer.py b/python/lbann/trainer.py index 75b0ff994e1..6e917add9ab 100644 --- a/python/lbann/trainer.py +++ b/python/lbann/trainer.py @@ -6,20 +6,25 @@ class Trainer: """LBANN Trainer.""" - def __init__(self): - # Scalar fields - self.block_size = 256 # TODO: Make configurable - self.procs_per_trainer = 0 # TODO: Make configurable - self.num_parallel_readers = 0 # TODO: Make configurable - self.num_gpus = 1 # TODO: Make configurable + def __init__(self, + name=None, + procs_per_trainer=None, + num_parallel_readers=None): + self.name = name + self.procs_per_trainer = procs_per_trainer + self.num_parallel_readers = num_parallel_readers + self.hydrogen_block_size = None def export_proto(self): """Construct and return a protobuf message.""" # Initialize protobuf message trainer = trainer_pb2.Trainer() - trainer.block_size = self.block_size - trainer.procs_per_trainer = self.procs_per_trainer - trainer.num_parallel_readers = self.num_parallel_readers - trainer.num_gpus = self.num_gpus - + if self.name is not None: + trainer.name = self.name + if self.procs_per_trainer is not None: + trainer.procs_per_trainer = self.procs_per_trainer + if self.num_parallel_readers is not None: + trainer.num_parallel_readers = 
self.num_parallel_readers
+        if self.hydrogen_block_size is not None:
+            trainer.hydrogen_block_size = self.hydrogen_block_size
         return trainer
diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp
index f5fbb6cf0f4..f71ef31542f 100644
--- a/src/proto/proto_common.cpp
+++ b/src/proto/proto_common.cpp
@@ -780,8 +780,8 @@ void get_cmdline_overrides(const lbann_comm& comm, lbann_data::LbannPB& p)
   if (opts->has_int("num_epochs")) {
     model->set_num_epochs(opts->get_int("num_epochs"));
   }
-  if (opts->has_int("block_size")) {
-    trainer->set_block_size(opts->get_int("block_size"));
+  if (opts->has_int("hydrogen_block_size")) {
+    trainer->set_hydrogen_block_size(opts->get_int("hydrogen_block_size"));
   }
   if (opts->has_int("procs_per_trainer")) {
     trainer->set_procs_per_trainer(opts->get_int("procs_per_trainer"));
@@ -825,7 +825,7 @@ void print_parameters(const lbann_comm& comm, lbann_data::LbannPB& p)
     << " datatype size: " << sizeof(DataType) << std::endl
     << " mini_batch_size: " << m.mini_batch_size() << std::endl
     << " num_epochs: " << m.num_epochs() << std::endl
-    << " block_size: " << t.block_size() << std::endl
+    << " hydrogen_block_size: " << t.hydrogen_block_size() << std::endl
     << " procs_per_trainer: " << t.procs_per_trainer() << std::endl
     << " num_parallel_readers: " << t.num_parallel_readers() << std::endl
     << " serialize_io: " << m.serialize_io() << std::endl
@@ -864,9 +864,8 @@ void print_help(std::ostream& os)
     "General:\n"
     "  --mini_batch_size=<int>\n"
     "  --num_epochs=<int>\n"
-    "  --block_size=<int>\n"
+    "  --hydrogen_block_size=<int>\n"
     "  --procs_per_trainer=<int>\n"
-    "  --num_gpus=<int>\n"
     "  --num_parallel_readers=<int>\n"
     "  --num_io_threads=<int>\n"
     "  # of threads used for I/O by the data readers\n"
diff --git a/src/proto/trainer.proto b/src/proto/trainer.proto
index b45384b84bb..b16a0acc615 100644
--- a/src/proto/trainer.proto
+++ b/src/proto/trainer.proto
@@ -29,10 +29,31 @@ syntax = "proto3";
 
 package lbann_data;
 
 message Trainer {
-  string name = 3;
-  int64 block_size = 50;
-  int64 procs_per_trainer = 51;
-  int64 num_gpus = 53; //has no effect
-  int64 num_parallel_readers = 100;
+  // Unique identifier
+  string name = 1;
+
+  // Parallel processes per trainer
+  //
+  // The number of processes per trainer must evenly divide the total
+  // number of MPI ranks. The number of resulting trainers is
+  // num_procs / procs_per_trainer.
+  //
+  // If procs_per_trainer is not provided, then all MPI ranks are
+  // assigned to one trainer.
+  int64 procs_per_trainer = 2;
+
+  // I/O threads per parallel process
+  //
+  // These threads are typically used to perform data ingestion in the
+  // background.
+  int64 num_parallel_readers = 3;
+
+  // -------------------------------
+  // Advanced options
+  // -------------------------------
+
+  // Algorithmic block size for Hydrogen
+  int64 hydrogen_block_size = 100;
+
 }
diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp
index f9228c45139..3617de3c6a5 100644
--- a/src/utils/lbann_library.cpp
+++ b/src/utils/lbann_library.cpp
@@ -42,7 +42,6 @@ namespace lbann {
 std::unique_ptr<trainer> construct_trainer(lbann_comm *comm,
                                            lbann_data::Trainer* pb_trainer,
                                            options *opts) {
-  bool master = comm->am_world_master();
   try {
     int procs_per_trainer = 0;
     if(pb_trainer->procs_per_trainer() > 0) {
@@ -69,11 +68,10 @@ std::unique_ptr<trainer> construct_trainer(lbann_comm *comm,
     auto io_threads_per_process = io_thread_pool->get_num_threads();
     auto io_threads_offset = io_thread_pool->get_threads_offset();
 
-    // Set algorithmic blocksize
-    if (pb_trainer->block_size() == 0 and master) {
-      LBANN_ERROR("model does not provide a valid block size (", pb_trainer->block_size(), ")");
+    // Set algorithmic blocksize in Hydrogen
+    if (pb_trainer->hydrogen_block_size() > 0) {
+      El::SetBlocksize(pb_trainer->hydrogen_block_size());
     }
-    El::SetBlocksize(pb_trainer->block_size());
 
     // Set up the communicator and get the grid based on the trainers' spec.
     // We do not currently support splitting different trainers in different ways,
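A note on the new trainer message above: trainer.proto uses proto3, where an unset scalar field reads back as its zero value, so construct_trainer() can treat zero as "not provided" and fall back to Hydrogen's built-in default block size. The sketch below illustrates that convention in isolation; TrainerMsg is a hypothetical stand-in for the generated protobuf class, not LBANN's actual API:

    // Minimal sketch of the "zero means unset" convention, assuming only
    // proto3's zero-default rule. TrainerMsg is a hypothetical stand-in
    // for the generated class, not LBANN code.
    #include <iostream>

    struct TrainerMsg {
      long hydrogen_block_size = 0; // proto3 scalars default to zero
    };

    void apply_block_size(const TrainerMsg& t) {
      if (t.hydrogen_block_size > 0) {
        // LBANN would call El::SetBlocksize(t.hydrogen_block_size) here.
        std::cout << "override Hydrogen block size: "
                  << t.hydrogen_block_size << "\n";
      } else {
        std::cout << "keep Hydrogen's default block size\n";
      }
    }

    int main() {
      apply_block_size(TrainerMsg{});    // field left unset
      apply_block_size(TrainerMsg{257}); // explicit override, as in the tests
      return 0;
    }

This is also why the test prototexts above can simply delete their block_size entries rather than set a sentinel value.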
From 8ab3e70e746a410f7f97f25956ebcf3a9386f08c Mon Sep 17 00:00:00 2001
From: davidHysom
Date: Mon, 14 Oct 2019 15:06:47 -0700
Subject: [PATCH 341/634] Fix for case where global archive size is > INT_MAX
 (#1288)

* Modified sample_list::all_gather_archive() to deal with the case where
  the size of the global archive is > INT_MAX.

* Changed INT_MAX to c++ idiom, per Tom's suggestion

* avoid a second lookup for find(...) per change suggested by Tom (there
  are many similar places in the codebase that should be changed ...
  though I expect there would be a very, very minimal effect on runtime.)

* Made "broadcast" version of sample_list::all_gather_archive() the
  default. The version that uses all_gather can be selected via the cmd
  line flag: --all_gather_old; previously the "broadcast" version was
  selected via the cmd line flag --all_gather_new.
---
 include/lbann/data_readers/sample_list.hpp    |  2 +
 .../lbann/data_readers/sample_list_impl.hpp   | 94 +++++++++++++++++++
 .../sample_list_open_files_impl.hpp           |  2 +-
 3 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/include/lbann/data_readers/sample_list.hpp b/include/lbann/data_readers/sample_list.hpp
index 45fbdb39e92..6d4aa5e051f 100644
--- a/include/lbann/data_readers/sample_list.hpp
+++ b/include/lbann/data_readers/sample_list.hpp
@@ -105,6 +105,8 @@ class sample_list {
   const std::string& get_samples_dirname() const;
 
   void all_gather_archive(const std::string &archive, std::vector<std::string>& gathered_archive, lbann_comm& comm);
+  void all_gather_archive_new(const std::string &archive, std::vector<std::string>& gathered_archive, lbann_comm& comm);
+
   template<typename T> size_t all_gather_field(T data, std::vector<T>& gathered_data, lbann_comm& comm);
   virtual void all_gather_packed_lists(lbann_comm& comm);
diff --git a/include/lbann/data_readers/sample_list_impl.hpp b/include/lbann/data_readers/sample_list_impl.hpp
index 364de74bb27..3fc06406a12 100644
--- a/include/lbann/data_readers/sample_list_impl.hpp
+++ b/include/lbann/data_readers/sample_list_impl.hpp
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <limits>
 #include
 #include
@@ -337,6 +338,11 @@ inline void sample_list<sample_name_t>
 ::all_gather_archive(const std::string &archive,
                      std::vector<std::string>& gathered_archive,
                      lbann_comm& comm) {
+  if (!options::get()->get_bool("all_gather_old")) {
+    all_gather_archive_new(archive, gathered_archive, comm);
+    return;
+  }
+
   int size_of_list_archive = archive.size();
 
   std::vector<int> packed_sizes(comm.get_procs_per_trainer());
@@ -378,6 +384,94 @@ ::all_gather_archive(const std::string &archive,
   return;
 }
 
+template <typename sample_name_t>
+inline void sample_list<sample_name_t>
+::all_gather_archive_new(const std::string &archive,
+                         std::vector<std::string>& gathered_archive,
+                         lbann_comm& comm) {
+
+  // there's commented out code below to deal with the case where
+  // archive.size() > INT_MAX; but for now let's assume we won't
+  // encounter that (which is true for the 100M JAG set)
+  int constexpr max_int = std::numeric_limits<int>::max();
+  size_t n = archive.size();
+  if (n > max_int) {
+    LBANN_ERROR("n > max_int");
+  }
+
+  // change int to size_t for case where n > max_int (see commented out
+  // code block below)
+  int size_of_my_archive= archive.size();
+  std::vector<int> packed_sizes(comm.get_procs_per_trainer());
+  comm.trainer_all_gather(size_of_my_archive, packed_sizes);
+
+  int me = comm.get_rank_in_trainer();
+  int np = comm.get_procs_per_trainer();
+
+  size_t g = 0;
+  for (auto t : packed_sizes) {
+    g += t;
+  }
+  if (!me) {
+    std::cout << "global archive size: " << g << std::endl;
+  }
+
+  for (int p=0; p<np; p++) {
+    gathered_archive[p].resize(packed_sizes[p]);
+    if (me == p) {
+      gathered_archive[p] = archive;
+    }
+    int sz = packed_sizes[p];
+    char *data = const_cast<char*>(gathered_archive[p].data());
+    comm.trainer_broadcast(p, data, sz);
+  }
+
+#if 0
+  std::vector rounds;
+  for (int p=0; p(archive.data() + offset);
+        comm.trainer_broadcast(p, data, rounds[k]);
+      } else {
+        char *data = const_cast<char*>(gathered_archive[p].data() + offset);
+        comm.trainer_broadcast(p, data, rounds[k]);
+      }
+      offset += rounds[k];
+if (me == p) {
+std::cout << "XX finished round" << std::endl;
+}
+    }
+  }
+#endif
+
+  return;
+}
+
 template <typename sample_name_t>
 template <typename T>
 inline size_t
 sample_list<sample_name_t>
diff --git a/include/lbann/data_readers/sample_list_open_files_impl.hpp b/include/lbann/data_readers/sample_list_open_files_impl.hpp
index 3839ca2a425..565b016bd22 100644
--- a/include/lbann/data_readers/sample_list_open_files_impl.hpp
+++ b/include/lbann/data_readers/sample_list_open_files_impl.hpp
@@ -556,7 +556,7 @@ ::all_gather_packed_lists(lbann_comm& comm) {
         LBANN_ERROR("mp.find(filename) == mp.end()");
       }
       index = search_result->second;
-      }
+    }
     m_sample_list.emplace_back(std::make_pair(index, s.second));
   }
 }
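The broadcast-based gather added in this patch is easier to follow stripped of LBANN's comm wrapper. Below is a minimal sketch of the same pattern in plain MPI; bcast_gather and the toy sizes in main are illustrative assumptions, not LBANN code. The key point is that each MPI_Bcast count is a single rank's archive size, which fits in an int even when the concatenated total does not:

    // Gather one variable-sized string per rank via per-rank broadcasts,
    // avoiding a single collective whose total count could exceed INT_MAX.
    #include <mpi.h>
    #include <cstdio>
    #include <string>
    #include <vector>

    std::vector<std::string> bcast_gather(const std::string& mine,
                                          MPI_Comm comm) {
      int np, me;
      MPI_Comm_size(comm, &np);
      MPI_Comm_rank(comm, &me);

      // Share per-rank sizes first; one int per rank, always small.
      int my_size = static_cast<int>(mine.size());
      std::vector<int> sizes(np);
      MPI_Allgather(&my_size, 1, MPI_INT, sizes.data(), 1, MPI_INT, comm);

      // One broadcast per rank, mirroring the loop in the patch:
      // resize, let the owner copy its archive in, then broadcast.
      std::vector<std::string> gathered(np);
      for (int p = 0; p < np; ++p) {
        gathered[p].resize(sizes[p]);
        if (me == p) { gathered[p] = mine; }
        if (sizes[p] > 0) {
          MPI_Bcast(&gathered[p][0], sizes[p], MPI_CHAR, p, comm);
        }
      }
      return gathered;
    }

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int me;
      MPI_Comm_rank(MPI_COMM_WORLD, &me);
      // Toy per-rank "archive" whose size differs by rank.
      std::string mine(16 + me, static_cast<char>('a' + me % 26));
      std::vector<std::string> all = bcast_gather(mine, MPI_COMM_WORLD);
      if (me == 0) {
        for (std::size_t p = 0; p < all.size(); ++p) {
          std::printf("rank %zu contributed %zu bytes\n", p, all[p].size());
        }
      }
      MPI_Finalize();
      return 0;
    }

As in the patch itself, a guard against any single archive exceeding std::numeric_limits<int>::max() is still needed before trusting the int cast of the per-rank size.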
From 5e0d68e928be2fdc092afc707236dabb769b7b0a Mon Sep 17 00:00:00 2001
From: Tom Benson <30674819+benson31@users.noreply.github.com>
Date: Tue, 15 Oct 2019 09:26:32 -0700
Subject: [PATCH 342/634] Revert "Fix for case where global archive size is >
 INT_MAX (#1288)" (#1296)

This reverts commit 8ab3e70e746a410f7f97f25956ebcf3a9386f08c.
---
 include/lbann/data_readers/sample_list.hpp    |  2 -
 .../lbann/data_readers/sample_list_impl.hpp   | 94 ------------------
 .../sample_list_open_files_impl.hpp           |  2 +-
 3 files changed, 1 insertion(+), 97 deletions(-)

diff --git a/include/lbann/data_readers/sample_list.hpp b/include/lbann/data_readers/sample_list.hpp
index 6d4aa5e051f..45fbdb39e92 100644
--- a/include/lbann/data_readers/sample_list.hpp
+++ b/include/lbann/data_readers/sample_list.hpp
@@ -105,8 +105,6 @@ class sample_list {
   const std::string& get_samples_dirname() const;
 
   void all_gather_archive(const std::string &archive, std::vector<std::string>& gathered_archive, lbann_comm& comm);
-  void all_gather_archive_new(const std::string &archive, std::vector<std::string>& gathered_archive, lbann_comm& comm);
-
   template<typename T> size_t all_gather_field(T data, std::vector<T>& gathered_data, lbann_comm& comm);
   virtual void all_gather_packed_lists(lbann_comm& comm);
diff --git a/include/lbann/data_readers/sample_list_impl.hpp b/include/lbann/data_readers/sample_list_impl.hpp
index 3fc06406a12..364de74bb27 100644
--- a/include/lbann/data_readers/sample_list_impl.hpp
+++ b/include/lbann/data_readers/sample_list_impl.hpp
@@ -12,7 +12,6 @@
 #include
 #include
 #include
-#include <limits>
 #include
 #include
@@ -338,11 +337,6 @@ inline void sample_list<sample_name_t>
 ::all_gather_archive(const std::string &archive,
                      std::vector<std::string>& gathered_archive,
                      lbann_comm& comm) {
-  if (!options::get()->get_bool("all_gather_old")) {
-    all_gather_archive_new(archive, gathered_archive, comm);
-    return;
-  }
-
   int size_of_list_archive = archive.size();
 
   std::vector<int> packed_sizes(comm.get_procs_per_trainer());
@@ -384,94 +378,6 @@ ::all_gather_archive(const std::string &archive,
   return;
 }
 
-template <typename sample_name_t>
-inline void sample_list<sample_name_t>
-::all_gather_archive_new(const std::string &archive,
-                         std::vector<std::string>& gathered_archive,
-                         lbann_comm& comm) {
-
-  // there's commented out code below to deal with the case where
-  // archive.size() > INT_MAX; but for now let's assume we won't
-  // encounter that (which is true for the 100M JAG set)
-  int constexpr max_int = std::numeric_limits<int>::max();
-  size_t n = archive.size();
-  if (n > max_int) {
-    LBANN_ERROR("n > max_int");
-  }
-
-  // change int to size_t for case where n > max_int (see commented out
-  // code block below)
-  int size_of_my_archive= archive.size();
-  std::vector<int> packed_sizes(comm.get_procs_per_trainer());
-  comm.trainer_all_gather(size_of_my_archive, packed_sizes);
-
-  int me = comm.get_rank_in_trainer();
-  int np = comm.get_procs_per_trainer();
-
-  size_t g = 0;
-  for (auto t : packed_sizes) {
-    g += t;
-  }
-  if (!me) {
-    std::cout << "global archive size: " << g << std::endl;
-  }
-
-  for (int p=0; p<np; p++) {
-    gathered_archive[p].resize(packed_sizes[p]);
-    if (me == p) {
-      gathered_archive[p] = archive;
-    }
-    int sz = packed_sizes[p];
-    char *data = const_cast<char*>(gathered_archive[p].data());
-    comm.trainer_broadcast(p, data, sz);
-  }
-
-#if 0
-  std::vector rounds;
-  for (int p=0; p(archive.data() + offset);
-        comm.trainer_broadcast(p, data, rounds[k]);
-      } else {
-        char *data = const_cast<char*>(gathered_archive[p].data() + offset);
-        comm.trainer_broadcast(p, data, rounds[k]);
-      }
-      offset += rounds[k];
-if (me == p) {
-std::cout << "XX finished round" << std::endl;
-}
-    }
-  }
-#endif
-
-  return;
-}
-
 template <typename sample_name_t>
 template <typename T>
 inline size_t
 sample_list<sample_name_t>
diff --git a/include/lbann/data_readers/sample_list_open_files_impl.hpp b/include/lbann/data_readers/sample_list_open_files_impl.hpp
index 565b016bd22..3839ca2a425 100644
--- a/include/lbann/data_readers/sample_list_open_files_impl.hpp
+++ b/include/lbann/data_readers/sample_list_open_files_impl.hpp
@@ -556,7 +556,7 @@ ::all_gather_packed_lists(lbann_comm& comm) {
         LBANN_ERROR("mp.find(filename) == mp.end()");
       }
       index = search_result->second;
-    }
+      }
     m_sample_list.emplace_back(std::make_pair(index, s.second));
   }
 }
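The revert above puts the original int-based all_gather path back in place, so the limitation that motivated #1288 returns: gathered sizes and displacements are plain ints, and the running offset overflows once the concatenated archive passes INT_MAX bytes. A standalone sketch of the failure mode follows; the per-rank sizes are made up for illustration and are not LBANN measurements:

    // Why one big gather breaks: int displacements overflow past INT_MAX.
    #include <climits>
    #include <cstdio>
    #include <vector>

    int main() {
      // Four ranks, roughly 800 MB of serialized sample list each
      // (illustrative numbers only).
      std::vector<long long> sizes(4, 800LL * 1000 * 1000);
      long long offset = 0;
      for (int p = 0; p < static_cast<int>(sizes.size()); ++p) {
        if (offset > INT_MAX) {
          std::printf("rank %d: displacement %lld no longer fits in an int\n",
                      p, offset);
        }
        offset += sizes[p];
      }
      std::printf("total: %lld bytes (INT_MAX = %d)\n", offset, INT_MAX);
      return 0;
    }

Already at four such ranks the last displacement exceeds INT_MAX, which is exactly the regime the reverted patch targeted.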
From 05cbcbe0b13a71a61b77870fe8bb234c641d01b6 Mon Sep 17 00:00:00 2001
From: Tom Benson <30674819+benson31@users.noreply.github.com>
Date: Tue, 15 Oct 2019 13:36:39 -0700
Subject: [PATCH 343/634] cleanup entrywise layer macros to facilitate
 templating (#1298)

---
 .../lbann/layers/activations/activations.hpp |  13 +--
 include/lbann/layers/loss/entrywise.hpp      |  15 +--
 include/lbann/layers/math/binary.hpp         | 106 +++++-----------
 include/lbann/layers/math/unary.hpp          |  62 +++++-----
 4 files changed, 75 insertions(+), 121 deletions(-)

diff --git a/include/lbann/layers/activations/activations.hpp b/include/lbann/layers/activations/activations.hpp
index 65e79ae3acb..35895f3dcc4 100644
--- a/include/lbann/layers/activations/activations.hpp
+++ b/include/lbann/layers/activations/activations.hpp
@@ -35,10 +35,8 @@ namespace lbann {
 
 #ifndef LBANN_ACTIVATIONS_LAYER_INSTANTIATE
 #define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \
-  extern template class entrywise_unary_layer< \
-    data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \
-  extern template class entrywise_unary_layer< \
-    data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct>
+  extern template class LAYER_NAME<data_layout::DATA_PARALLEL, DEVICE>; \
+  extern template class LAYER_NAME<data_layout::MODEL_PARALLEL, DEVICE>
 #else
 #define UNARY_ETI_DECL_MACRO_DEV(...)
 #endif // LBANN_UNARY_LAYER_INSTANTIATE
@@ -54,12 +52,7 @@
 
 // Convenience macro to define an entry-wise unary layer class
 #define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \
-  struct layer_name##_name_struct { \
-    inline operator std::string() { return layer_string; } \
-  }; \
-  template <data_layout Layout, El::Device Device> \
-  using layer_name \
-    = entrywise_unary_layer<Layout, Device, layer_name##_name_struct>; \
+  LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string); \
   UNARY_ETI_DECL_MACRO(layer_name)
 
 /** @class lbann::log_sigmoid_layer
diff --git a/include/lbann/layers/loss/entrywise.hpp b/include/lbann/layers/loss/entrywise.hpp
index f4fc640869d..33cb7c9262d 100644
--- a/include/lbann/layers/loss/entrywise.hpp
+++ b/include/lbann/layers/loss/entrywise.hpp
@@ -32,11 +32,9 @@ namespace lbann {
 
 #ifndef LBANN_ENTRYWISE_LAYER_INSTANTIATE
-#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \
-  extern template class entrywise_binary_layer< \
-    data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \
-  extern template class entrywise_binary_layer< \
-    data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct>
+#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \
+  extern template class LAYER_NAME<data_layout::DATA_PARALLEL, DEVICE>; \
+  extern template class LAYER_NAME<data_layout::MODEL_PARALLEL, DEVICE>
 #else
 #define BINARY_ETI_DECL_MACRO_DEV(...)
#endif // LBANN_BINARY_LAYER_INSTANTIATE @@ -52,12 +50,7 @@ namespace lbann { // Convenience macro to define an entry-wise binary layer class #define DEFINE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string) \ - struct layer_name##_name_struct { \ - inline operator std::string() { return layer_string; } \ - }; \ - template \ - using layer_name \ - = entrywise_binary_layer; \ + LBANN_DECLARE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string); \ BINARY_ETI_DECL_MACRO(layer_name) // Cross entropy loss diff --git a/include/lbann/layers/math/binary.hpp b/include/lbann/layers/math/binary.hpp index 1b7d5cadbd5..e95df55fd97 100644 --- a/include/lbann/layers/math/binary.hpp +++ b/include/lbann/layers/math/binary.hpp @@ -31,73 +31,58 @@ namespace lbann { -/** @brief Templated class for entry-wise binary layers. - * @param Layout Parallelism scheme. - * @param Device Device allocation. - * @param Name Type that can be converted into a string. - */ -template -class entrywise_binary_layer : public Layer { -public: - - entrywise_binary_layer(lbann_comm *comm) : Layer(comm) { - this->m_expected_num_parent_layers = 2; - } - entrywise_binary_layer* copy() const override { - return new entrywise_binary_layer(*this); +#define LBANN_DECLARE_ENTRYWISE_BINARY_LAYER(LAYER_NAME, LAYER_STRING) \ + template \ + class LAYER_NAME : public Layer { \ + public: \ + LAYER_NAME(lbann_comm *comm) : Layer(comm) { \ + this->m_expected_num_parent_layers = 2; \ + } \ + LAYER_NAME* copy() const override { \ + return new LAYER_NAME(*this); \ + } \ + std::string get_type() const override { return LAYER_STRING; } \ + data_layout get_data_layout() const override { return Layout; } \ + El::Device get_device_allocation() const override { return Device; } \ + protected: \ + void setup_dims() override { \ + Layer::setup_dims(); \ + set_output_dims(get_input_dims()); \ + /* Check that input dimensions match */ \ + if (get_input_dims(0) != get_input_dims(1)) { \ + const auto& parents = get_parent_layers(); \ + std::stringstream err; \ + err << get_type() << " layer \"" << get_name() << "\" " \ + << "has input tensors with different dimensions ("; \ + for (int i = 0; i < get_num_parents(); ++i) { \ + const auto& dims = get_input_dims(i); \ + err << (i > 0 ? ", " : "") \ + << "layer \"" << parents[i]->get_name() << "\" outputs "; \ + for (size_t j = 0; j < dims.size(); ++j) { \ + err << (j > 0 ? " x " : "") << dims[j]; \ + } \ + } \ + err << ")"; \ + LBANN_ERROR(err.str()); \ + } \ + } \ + void fp_compute() override; \ + void bp_compute() override; \ } - std::string get_type() const override { return Name(); } - data_layout get_data_layout() const override { return Layout; } - El::Device get_device_allocation() const override { return Device; } - -protected: - - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); - - // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); - std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); - err << (i > 0 ? ", " : "") - << "layer \"" << parents[i]->get_name() << "\" outputs "; - for (size_t j = 0; j < dims.size(); ++j) { - err << (j > 0 ? 
" x " : "") << dims[j]; - } - } - err << ")"; - LBANN_ERROR(err.str()); - } - - } - - void fp_compute() override; - void bp_compute() override; - -}; // Convenience macros for ETI decls for binary layers #ifndef LBANN_BINARY_LAYER_INSTANTIATE -#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ - extern template class entrywise_binary_layer< \ - data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \ - extern template class entrywise_binary_layer< \ - data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct> +#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME #else #define BINARY_ETI_DECL_MACRO_DEV(...) #endif // LBANN_BINARY_LAYER_INSTANTIATE #define BINARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ - template class entrywise_binary_layer< \ - data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \ - template class entrywise_binary_layer< \ - data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct> + template class LAYER_NAME; \ + template class LAYER_NAME #ifdef LBANN_HAS_GPU #define BINARY_ETI_DECL_MACRO(LAYER_NAME) \ @@ -110,12 +95,7 @@ class entrywise_binary_layer : public Layer { // Convenience macro to define an entry-wise binary layer class #define DEFINE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string) \ - struct layer_name##_name_struct { \ - inline operator std::string() { return layer_string; } \ - }; \ - template \ - using layer_name \ - = entrywise_binary_layer; \ + LBANN_DECLARE_ENTRYWISE_BINARY_LAYER(layer_name, layer_string); \ BINARY_ETI_DECL_MACRO(layer_name) // Arithmetic operations diff --git a/include/lbann/layers/math/unary.hpp b/include/lbann/layers/math/unary.hpp index f73d61c7977..b4fd0c82d4c 100644 --- a/include/lbann/layers/math/unary.hpp +++ b/include/lbann/layers/math/unary.hpp @@ -31,47 +31,40 @@ namespace lbann { -/** @brief Templated class for entry-wise unary layers. - * @param Layout Parallelism scheme. - * @param Device Device allocation. - * @param Name Type that can be converted into a string. 
- */ -template -class entrywise_unary_layer : public Layer { -public: - entrywise_unary_layer(lbann_comm *comm) : Layer(comm) {} - entrywise_unary_layer* copy() const override { - return new entrywise_unary_layer(*this); - } - std::string get_type() const override { return Name(); } - data_layout get_data_layout() const override { return Layout; } - El::Device get_device_allocation() const override { return Device; } -protected: - void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + +#define LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(LAYER_NAME, LAYER_STRING) \ + template \ + class LAYER_NAME : public Layer { \ + public: \ + LAYER_NAME(lbann_comm *comm) : Layer(comm) {} \ + LAYER_NAME* copy() const override { \ + return new LAYER_NAME(*this); \ + } \ + std::string get_type() const override { return LAYER_STRING; } \ + data_layout get_data_layout() const override { return Layout; } \ + El::Device get_device_allocation() const override { return Device; } \ + protected: \ + void setup_dims() override { \ + Layer::setup_dims(); \ + set_output_dims(get_input_dims()); \ + } \ + void fp_compute() override; \ + void bp_compute() override; \ } - void fp_compute() override; - void bp_compute() override; -}; // Convenience macros for ETI decls for unary layers #ifndef LBANN_UNARY_LAYER_INSTANTIATE -#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ - extern template class entrywise_unary_layer< \ - data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \ - extern template class entrywise_unary_layer< \ - data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct> +#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME #else #define UNARY_ETI_DECL_MACRO_DEV(...) #endif // LBANN_UNARY_LAYER_INSTANTIATE #define UNARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ - template class entrywise_unary_layer< \ - data_layout::DATA_PARALLEL, DEVICE, LAYER_NAME##_name_struct>; \ - template class entrywise_unary_layer< \ - data_layout::MODEL_PARALLEL, DEVICE, LAYER_NAME##_name_struct> + template class LAYER_NAME; \ + template class LAYER_NAME #ifdef LBANN_HAS_GPU #define UNARY_ETI_DECL_MACRO(LAYER_NAME) \ @@ -84,12 +77,7 @@ class entrywise_unary_layer : public Layer { // Convenience macro to define an entry-wise unary layer class #define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ - struct layer_name##_name_struct { \ - inline operator std::string() { return layer_string; } \ - }; \ - template \ - using layer_name \ - = entrywise_unary_layer; \ + LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string); \ UNARY_ETI_DECL_MACRO(layer_name) // Logical operations From 8f307116107faea61bb8349b8f1af98d0dd85f41 Mon Sep 17 00:00:00 2001 From: davidHysom Date: Tue, 15 Oct 2019 18:09:02 -0700 Subject: [PATCH 344/634] Sample list bug (#1299) * Modified sample_list::all_gather_archive() to deal with the case where the size of the global archive is > INT_MAX. * Changed INT_MAX to c++ idiom * avoid a second lookup; change suggested by Tom * Made "broadcast" version of sample_list::all_gather_archive() the default. The version that uses all_gather can be selected via the cmd line flag: --all_gather_old * bug fix; all_gather_archive_new was accidentally changed to all_gather_archive. 
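The broadcast-based exchange this message describes can be sketched in isolation. Below is a minimal, self-contained MPI program, not LBANN's implementation (which routes through lbann_comm and reads the --all_gather_old flag from its options interface); the archive contents and sizes are made up for illustration:

```cpp
// Illustrative sketch only -- names and payloads are hypothetical.
// Each rank contributes a variable-length "archive"; per-rank sizes are
// allgathered, then every slot is filled by a broadcast from its owner,
// so only each individual size (not the global total) must fit in an int.
#include <mpi.h>
#include <iostream>
#include <string>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int me = 0, np = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  // Stand-in for this rank's serialized sample list
  std::string archive(1000u * (me + 1), static_cast<char>('a' + me % 26));

  // Step 1: exchange per-rank archive sizes
  int my_size = static_cast<int>(archive.size());
  std::vector<int> sizes(np);
  MPI_Allgather(&my_size, 1, MPI_INT, sizes.data(), 1, MPI_INT,
                MPI_COMM_WORLD);

  // Step 2: one broadcast per rank, instead of a single concatenated
  // allgather whose total byte count could overflow a 32-bit int
  std::vector<std::string> gathered(np);
  for (int p = 0; p < np; ++p) {
    gathered[p].resize(sizes[p]);
    if (me == p) {
      gathered[p] = archive;
    }
    MPI_Bcast(&gathered[p][0], sizes[p], MPI_CHAR, p, MPI_COMM_WORLD);
  }

  if (me == 0) {
    std::cout << "rank 0 gathered " << np << " archives" << std::endl;
  }
  MPI_Finalize();
  return 0;
}
```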
--- include/lbann/data_readers/sample_list.hpp | 2 + .../lbann/data_readers/sample_list_impl.hpp | 93 +++++++++++++++++++ .../sample_list_open_files_impl.hpp | 2 +- 3 files changed, 96 insertions(+), 1 deletion(-) diff --git a/include/lbann/data_readers/sample_list.hpp b/include/lbann/data_readers/sample_list.hpp index 45fbdb39e92..6d4aa5e051f 100644 --- a/include/lbann/data_readers/sample_list.hpp +++ b/include/lbann/data_readers/sample_list.hpp @@ -105,6 +105,8 @@ class sample_list { const std::string& get_samples_dirname() const; void all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); + void all_gather_archive_new(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm); + template size_t all_gather_field(T data, std::vector& gathered_data, lbann_comm& comm); virtual void all_gather_packed_lists(lbann_comm& comm); diff --git a/include/lbann/data_readers/sample_list_impl.hpp b/include/lbann/data_readers/sample_list_impl.hpp index 364de74bb27..0f161bed61f 100644 --- a/include/lbann/data_readers/sample_list_impl.hpp +++ b/include/lbann/data_readers/sample_list_impl.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -337,6 +338,11 @@ inline void sample_list ::all_gather_archive(const std::string &archive, std::vector& gathered_archive, lbann_comm& comm) { + if (!options::get()->get_bool("all_gather_old")) { + all_gather_archive_new(archive, gathered_archive, comm); + return; + } + int size_of_list_archive = archive.size(); std::vector packed_sizes(comm.get_procs_per_trainer()); @@ -378,6 +384,93 @@ ::all_gather_archive(const std::string &archive, return; } +template +inline void sample_list +::all_gather_archive_new(const std::string &archive, + std::vector& gathered_archive, + lbann_comm& comm) { + + // there's commented out code below to deal with the case where + // archive.size() > INT_MAX; but for now let's assume we won't + // encounter that (which is true for the 100M JAG set) + int constexpr max_int = std::numeric_limits::max(); + size_t n = archive.size(); + if (n > max_int) { + LBANN_ERROR("(n > max_int"); + } + + // change int to size_t for case where n > max_int (see commented out + // code block below) + int size_of_my_archive= archive.size(); + std::vector packed_sizes(comm.get_procs_per_trainer()); + comm.trainer_all_gather(size_of_my_archive, packed_sizes); + + int me = comm.get_rank_in_trainer(); + int np = comm.get_procs_per_trainer(); + + size_t g = 0; + for (auto t : packed_sizes) { + g += t; + } + if (!me) { + std::cout << "global archive size: " << g << std::endl; + } + + for (int p=0; p(gathered_archive[p].data()); + comm.trainer_broadcast(p, data, sz); + } + +#if 0 + std::vector rounds; + for (int p=0; p(archive.data() + offset); + comm.trainer_broadcast(p, data, rounds[k]); + } else { + char *data = const_cast(gathered_archive[p].data() + offset); + comm.trainer_broadcast(p, data, rounds[k]); + } + offset += rounds[k]; +if (me == p) { +std::cout << "XX finished round" << std::endl; +} + } + } +#endif + + return; +} + template template inline size_t sample_list diff --git a/include/lbann/data_readers/sample_list_open_files_impl.hpp b/include/lbann/data_readers/sample_list_open_files_impl.hpp index 3839ca2a425..565b016bd22 100644 --- a/include/lbann/data_readers/sample_list_open_files_impl.hpp +++ b/include/lbann/data_readers/sample_list_open_files_impl.hpp @@ -556,7 +556,7 @@ ::all_gather_packed_lists(lbann_comm& comm) { LBANN_ERROR("mp.find(filename) == mp.end()"); } index 
= search_result->second; - } + } m_sample_list.emplace_back(std::make_pair(index, s.second)); } } From d2a1b908811cc776d9bf351e697b018f940b49d5 Mon Sep 17 00:00:00 2001 From: davidHysom Date: Wed, 16 Oct 2019 09:54:08 -0700 Subject: [PATCH 345/634] Removed code associated with the super_node mode. Super_node mode has been found to be slower than multi-message modes, so there seems no reason to keep it. (#1301) --- .../lbann/data_store/data_store_conduit.hpp | 9 - src/data_store/data_store_conduit.cpp | 221 ++---------------- 2 files changed, 25 insertions(+), 205 deletions(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index d12421d0782..dc0ca7e7c1d 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -127,10 +127,6 @@ class data_store_conduit { void exchange_mini_batch_data(size_t current_pos, size_t mb_size); - void set_super_node_mode() { - m_super_node = true; - } - void set_node_sizes_vary() { m_node_sizes_vary = true; } bool has_conduit_node(int data_id) const; @@ -152,7 +148,6 @@ protected : double m_exchange_time = 0; double m_rebuild_time = 0; - double m_super_node_packaging_time = 0; int m_cur_epoch = 0; @@ -171,10 +166,6 @@ protected : /// and received. int m_owner_map_mb_size = 0; - /// if true, use exchange_data_by_super_node, else use - /// exchange_data_by_sample; default if false - bool m_super_node = false; - /// size of a compacted conduit::Node that contains a single sample int m_compacted_sample_size = 0; diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index e1a0e754ce5..ae21fa47e01 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -58,7 +58,6 @@ data_store_conduit::data_store_conduit( m_np_in_trainer = m_comm->get_procs_per_trainer(); options *opts = options::get(); - m_super_node = opts->get_bool("super_node"); if (opts->get_bool("debug")) { std::stringstream ss; @@ -79,8 +78,6 @@ data_store_conduit::data_store_conduit( if (m_world_master) { if (m_is_local_cache) { std::cerr << "data_store_conduit is running in local_cache mode\n"; - } else if (m_super_node) { - std::cerr << "data_store_conduit is running in super_node mode\n"; } else { std::cerr << "data_store_conduit is running in multi-message mode\n"; } @@ -138,7 +135,6 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: m_preload = rhs.m_preload; m_explicit_loading = rhs.m_explicit_loading; m_owner_map_mb_size = rhs.m_owner_map_mb_size; - m_super_node = rhs.m_super_node; m_compacted_sample_size = rhs.m_compacted_sample_size; m_is_local_cache = rhs.m_is_local_cache; m_node_sizes_vary = rhs.m_node_sizes_vary; @@ -169,25 +165,21 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: for(auto&& i : ds_sample_move_list) { if(rhs.m_data.find(i) != rhs.m_data.end()){ - if (!m_super_node) { - /// Repack the nodes because they don't seem to copy correctly - // - //dah - previously this code block only contained the line: - // build_node_for_sending(rhs.m_data[i]["data"], m_data[i]); - //However, this resulted in errors in the schema; not sure why, - //as it used to work; some change in the conduit library?
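The repacking idiom referenced in the comment above — copying the children of a node's "data" subtree into a fresh node so that the rebuilt node carries a complete, self-describing schema — looks roughly like the following standalone sketch before the diff resumes below (illustrative only; it assumes a Conduit installation, and the sample id and fields are hypothetical):

```cpp
#include <iostream>
#include <string>
#include <vector>
#include "conduit/conduit.hpp" // header layout varies by install; often just "conduit.hpp"

int main() {
  // A node holding one sample under a "data" subtree, loosely mimicking
  // the layout handled in copy_members() (the id "000042" is made up)
  conduit::Node sample;
  sample["data/000042/label"] = 7;
  sample["data/000042/filename"] = "img_000042.jpg";

  // Deep-copy each child into a fresh node: Node::operator= copies both
  // data and schema, so "repacked" is fully self-describing and can be
  // compacted and sent without referencing the source node
  conduit::Node repacked;
  const std::vector<std::string> &names = sample["data"].child_names();
  for (const auto &t : sample["data"][names[0]].child_names()) {
    repacked[names[0]][t] = sample["data"][names[0]][t];
  }

  std::cout << repacked.to_json() << std::endl;
  return 0;
}
```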
- conduit::Node n2; - const std::vector &names = rhs.m_data[i]["data"].child_names(); - const std::vector &names2 = rhs.m_data[i]["data"][names[0]].child_names(); - for (auto t : names2) { - n2[names[0]][t] = rhs.m_data[i]["data"][names[0]][t]; - } - build_node_for_sending(n2, m_data[i]); - } else { - m_data[i] = rhs.m_data[i]; + /// Repack the nodes because they don't seem to copy correctly + // + //dah - previously this code block only contained the line: + // build_node_for_sending(rhs.m_data[i]["data"], m_data[i]); + //However, this resulted in errors in the schema; not sure why, + //as it used to work; some change in the conduit library? + conduit::Node n2; + const std::vector &names = rhs.m_data[i]["data"].child_names(); + const std::vector &names2 = rhs.m_data[i]["data"][names[0]].child_names(); + for (auto t : names2) { + n2[names[0]][t] = rhs.m_data[i]["data"][names[0]][t]; } - rhs.m_data.erase(i); + build_node_for_sending(n2, m_data[i]); } + rhs.m_data.erase(i); /// Removed migrated nodes from the original data store's owner list if(rhs.m_owner.find(i) != rhs.m_owner.end()) { @@ -222,8 +214,6 @@ void data_store_conduit::setup(int mini_batch_size) { std::cerr << "starting data_store_conduit::setup() for role: " << m_reader->get_role() << "\n"; if (m_is_local_cache) { std::cerr << "data store mode: local cache\n"; - } else if (m_super_node) { - std::cerr << "data store mode: exchange_data via super nodes\n"; } else { std::cerr << "data store mode: exchange_data via individual samples\n"; } @@ -255,155 +245,20 @@ void data_store_conduit::setup_data_store_buffers() { m_reconstituted.resize(m_np_in_trainer); } -// Note: conduit has a very nice interface for communicating nodes -// in blocking scenarios. Unf, for non-blocking we need to -// handle things ourselves. TODO: possibly modify conduit to -// handle non-blocking comms -void data_store_conduit::exchange_data_by_super_node(size_t current_pos, size_t mb_size) { - if (! 
m_is_setup) { - LBANN_ERROR("setup(mb_size) has not been called"); - } - - if (m_output) { - (*m_output) << "starting data_store_conduit::exchange_data_by_super_node; mb_size: " << mb_size << std::endl; - } - - if (m_send_buffer.size() == 0) { - setup_data_store_buffers(); - } - - //======================================================================== - //part 1: construct the super_nodes - - build_indices_i_will_send(current_pos, mb_size); - build_indices_i_will_recv(current_pos, mb_size); - - // construct a super node for each processor; the super node - // contains all samples this proc owns that other procs need - if (m_send_buffer.size() != (size_t)m_np_in_trainer) { - LBANN_ERROR("m_send_buffer.size() != m_np_in_trainer; m_send_buffer.size: ", m_send_buffer.size()); - } - - double tm3 = get_time(); - for (int p=0; p(&m_outgoing_msg_sizes[p]); - m_comm->nb_send(s, sizeof(int), m_comm->get_trainer_rank(), p, m_send_requests[p]); - } - - for (int p=0; p(&m_incoming_msg_sizes[p]); - m_comm->nb_recv(s, sizeof(int), m_comm->get_trainer_rank(), p, m_recv_requests[p]); - } - m_comm->wait_all(m_send_requests); - m_comm->wait_all(m_recv_requests); - - //======================================================================== - //part 2: exchange the actual data - - // start sends for outgoing data - for (int p=0; p(m_send_buffer_2[p].data_ptr()); - m_comm->nb_send(s, m_outgoing_msg_sizes[p], m_comm->get_trainer_rank(), p, m_send_requests[p]); - } - - // start recvs for incoming data - for (int p=0; pnb_recv((El::byte*)m_recv_buffer[p].data_ptr(), m_incoming_msg_sizes[p], m_comm->get_trainer_rank(), p, m_recv_requests[p]); - } - - // wait for all msgs to complete - m_comm->wait_all(m_send_requests); - m_comm->wait_all(m_recv_requests); - - //======================================================================== - //part 3: construct the Nodes needed by me for the current minibatch - - m_minibatch_data.clear(); - for (int p=0; p &names = m_reconstituted[p].child_names(); - - for (auto &t : names) { - if (m_output) { - (*m_output) << "next name: " << t << std::endl; - } - m_minibatch_data[atoi(t.c_str())][t].update_external(m_reconstituted[p][t]); - } - } - - if (m_output) { - (*m_output) << "m_minibatch_data.size(): " << m_minibatch_data.size() << "; indices: "; - for (auto t : m_minibatch_data) { - (*m_output) << t.first << " "; - } - (*m_output) << std::endl; - } -} - void data_store_conduit::set_preloaded_conduit_node(int data_id, conduit::Node &node) { // note: at this point m_data[data_id] = node - // note: if running in super_node mode, nothing to do - // note2: this may depend on the particular data reader - if (!m_super_node) { - if (m_output) { - (*m_output) << "set_preloaded_conduit_node: " << data_id << " for non-super_node mode\n"; - } - conduit::Node n2 = node; - m_mutex.lock(); - build_node_for_sending(n2, m_data[data_id]); - m_mutex.unlock(); - if (!m_node_sizes_vary) { - error_check_compacted_node(m_data[data_id], data_id); - } else { - m_mutex.lock(); - m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); - m_mutex.unlock(); - } + if (m_output) { + (*m_output) << "set_preloaded_conduit_node: " << data_id << std::endl; + } + conduit::Node n2 = node; + m_mutex.lock(); + build_node_for_sending(n2, m_data[data_id]); + m_mutex.unlock(); + if (!m_node_sizes_vary) { + error_check_compacted_node(m_data[data_id], data_id); } else { m_mutex.lock(); - if (m_data.find(data_id) == m_data.end()) { - m_data[data_id] = node; - if (m_output) { - (*m_output) << 
"set_preloaded_conduit_node: " << data_id << " for super_node mode\n"; - } - } else { - if (m_output) { - (*m_output) << "set_preloaded_conduit_node: " << data_id << " is already in m_data\n"; - } - } + m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); m_mutex.unlock(); } } @@ -461,15 +316,7 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool m_mutex.unlock(); } - #if 0 - else if (m_owner[data_id] != m_rank_in_trainer) { - LBANN_ERROR("set_conduit_node error for data id: ", data_id, " m_owner: ", - m_owner[data_id], " me: ", m_rank_in_trainer, - "; data reader role: ", m_reader->get_role()); - } - #endif - - else if (! m_super_node) { + else { m_mutex.lock(); m_owner[data_id] = m_rank_in_trainer; build_node_for_sending(node, m_data[data_id]); @@ -477,13 +324,6 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); m_mutex.unlock(); } - - else { - m_mutex.lock(); - m_owner[data_id] = m_rank_in_trainer; - m_data[data_id] = node; - m_mutex.unlock(); - } } const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { @@ -818,9 +658,6 @@ void data_store_conduit::purge_unused_samples(const std::vector& indices) { } void data_store_conduit::compact_nodes() { - if (m_super_node) { - return; - } for(auto&& j : *m_shuffled_indices) { if(m_data.find(j) != m_data.end()){ if(! (m_data[j].is_contiguous() && m_data[j].is_compact()) ) { @@ -1394,13 +1231,9 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ << m_exchange_time << std::endl << "time for constructing conduit Nodes: " << m_rebuild_time << std::endl; - if (m_super_node) { - std::cout << "time for constructing super_nodes: " << m_super_node_packaging_time; - } std::cout << std::endl; m_exchange_time = 0.; m_rebuild_time = 0.; - m_super_node_packaging_time = 0.; } ++m_cur_epoch; } @@ -1409,11 +1242,7 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ exchange_owner_maps(); } - if (m_super_node) { - exchange_data_by_super_node(current_pos, mb_size); - } else { - exchange_data_by_sample(current_pos, mb_size); - } + exchange_data_by_sample(current_pos, mb_size); m_exchange_time += (get_time() - tm1); } From 88c6ccdbed0adeca18a974b2ad90788d6faca5f3 Mon Sep 17 00:00:00 2001 From: davidHysom Date: Wed, 16 Oct 2019 10:12:38 -0700 Subject: [PATCH 346/634] Data store supernode (#1302) * Removed code associated with the super_node mode. Super_node mode has been found to be slower than multi-message modes, so there seems no reason the keep it. * removed exhange_by_super_node() --- include/lbann/data_store/data_store_conduit.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index dc0ca7e7c1d..63e56e31585 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -198,7 +198,6 @@ protected : /// convenience handle const std::vector *m_shuffled_indices; - void exchange_data_by_super_node(size_t current_pos, size_t mb_size); void exchange_data_by_sample(size_t current_pos, size_t mb_size); /// Contains the list of data IDs that will be received From f1a25d690fed37f220b8d0b0008bf9a2b9a981b5 Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Thu, 17 Oct 2019 10:49:34 -0700 Subject: [PATCH 347/634] Creating a new application directory hierarchy. 
(#1304) * Creating a new application directory hierarchy. This will eventually replace the model zoo since models should be free range and not kept in cages. --- applications/.gitignore | 21 ++++++++++++++++ applications/ATOM/README.md | 3 +++ applications/CONTRIBUTING.md | 49 ++++++++++++++++++++++++++++++++++++ applications/README.md | 12 +++++++++ 4 files changed, 85 insertions(+) create mode 100644 applications/.gitignore create mode 100644 applications/ATOM/README.md create mode 100644 applications/CONTRIBUTING.md create mode 100644 applications/README.md diff --git a/applications/.gitignore b/applications/.gitignore new file mode 100644 index 00000000000..aa6a015fd1e --- /dev/null +++ b/applications/.gitignore @@ -0,0 +1,21 @@ +# Setup standard ignores to keep the applications directory hierarchy clean + +# Building in source tree garbage +.cproject +.project +*.o +*.a + +# Emacs backup garbage +.backup/ + +# Other standard ignores +*~ +*.pyc +\#*# +.#* +.*.swp +.DS_Store + +# Python garbage +__pycache__/ diff --git a/applications/ATOM/README.md b/applications/ATOM/README.md new file mode 100644 index 00000000000..a752d534160 --- /dev/null +++ b/applications/ATOM/README.md @@ -0,0 +1,3 @@ +## Accelerating Therapeutics for Opportunities in Medicine (ATOM) + +Models for training neural networks to support the [ATOM](https://atomscience.org) project diff --git a/applications/CONTRIBUTING.md b/applications/CONTRIBUTING.md new file mode 100644 index 00000000000..72ff8591e3f --- /dev/null +++ b/applications/CONTRIBUTING.md @@ -0,0 +1,49 @@ +## Contributing Applications: + +The application directory contains the user-facing code for projects +to use LBANN. Each project directory should contain the Python code +to instantiate the model, run both training and inference, an +experiments directory, as well as utility / helper code to pre- or +post-process data. In addition to project-specific directories, the +directory hierarchy groups together similar projects into broader +categories, such as vision-based networks. + +### Directory Structure: + +``` +applications +└─── ATOM +``` + +The applications directory has primary __projects__ directories as well +as __categories__ that contain related __projects__. + +### Project Directory Structure: + +The general structure of a project directory should be: + +``` + +└─── README.md +└─── .py +└─── lib_.py +└─── experiments + └─── run_.py +└─── utils + +``` + +* README.md + * Describe the project, how to run it, etc. +* `.py` + * Python code that builds the model's compute graph +* `lib_.py` + * Common Python code that builds common substructures used by the + application +* experiments + * Directory to run an experiment. Should include launcher scripts, + etc. + * `run_.py` + * Launcher script to run the model in train or inference mode +* utils + * Directory for holding pre- and post-processing scripts diff --git a/applications/README.md b/applications/README.md new file mode 100644 index 00000000000..602eec0cd3f --- /dev/null +++ b/applications/README.md @@ -0,0 +1,12 @@ +## Applications: + +The application directory contains the user-facing code for projects +to use LBANN. Each project directory should contain the Python code +to instantiate the model, run both training and inference, an +experiments directory, as well as utility / helper code to pre- or +post-process data.
+ +These are some of applications that leverage LBANN: +- [Atom](atom/README.md): Accelerating Therapeutics for Opportunities + in Medicine (ATOM) - Networks for predicting molecular compounds + that are optimized for multiple objectives From 5ec550436907769d070d7bb35cf1e74298ad9afb Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Thu, 17 Oct 2019 18:16:19 -0700 Subject: [PATCH 348/634] Make jag-utils optional in the build/install (#1263) * make jag-utils optional in the build/install The JAG utils will still be built and installed as part of the superbuild (and, by extension, by build_lbann_lc.sh). * update docs to include a comment on the JAG utilities --- docs/build_with_cmake.rst | 22 ++++ model_zoo/jag_utils/CMakeLists.txt | 163 +++++++++++++++++------------ superbuild/lbann/CMakeLists.txt | 9 ++ 3 files changed, 126 insertions(+), 68 deletions(-) diff --git a/docs/build_with_cmake.rst b/docs/build_with_cmake.rst index c6722fa956a..dbecc85bb2d 100644 --- a/docs/build_with_cmake.rst +++ b/docs/build_with_cmake.rst @@ -224,6 +224,28 @@ documentation of the packages that are causing the issues as they may require additional CMake/environment flags to be set before properly resolving. +------------------------------ +Building JAG utilities +------------------------------ +The JAG utility executables are not part of the `all` target. In order +to use or install them, they must be built using the `jag-utils` +target. In order to install them, this must be done before installing. + +.. code-block:: bash + + # Configure LBANN + cmake /path/to/lbann + + # Build main LBANN library and front-ends + cmake --build . + + # If JAG utilities are required, build them + cmake --build . --target jag-utils + + # Install all (built) targets + cmake --build . 
--target install + + ------------------------------ Example CMake invocation ------------------------------ diff --git a/model_zoo/jag_utils/CMakeLists.txt b/model_zoo/jag_utils/CMakeLists.txt index 794ee8e61b8..8f459718193 100644 --- a/model_zoo/jag_utils/CMakeLists.txt +++ b/model_zoo/jag_utils/CMakeLists.txt @@ -1,77 +1,104 @@ - add_executable( build_index-bin build_index.cpp ) - target_link_libraries(build_index-bin lbann ) - set_target_properties(build_index-bin PROPERTIES OUTPUT_NAME build_index) - - add_executable( extract_random_samples-bin extract_random_samples.cpp ) - target_link_libraries(extract_random_samples-bin lbann ) - set_target_properties(extract_random_samples-bin PROPERTIES OUTPUT_NAME extract_random_samples) - - add_executable( dump_bundle-bin dump_bundle.cpp ) - target_link_libraries(dump_bundle-bin lbann ) - set_target_properties(dump_bundle-bin PROPERTIES OUTPUT_NAME dump_bundle) - - add_executable( check_images-bin check_images.cpp ) - target_link_libraries(check_images-bin lbann ) - set_target_properties(check_images-bin PROPERTIES OUTPUT_NAME check_images) - - add_executable( detect_corruption-bin detect_corruption.cpp ) - target_link_libraries(detect_corruption-bin lbann ) - set_target_properties(detect_corruption-bin PROPERTIES OUTPUT_NAME detect_corruption) - - add_executable( load_bundle2raw-bin load_bundle2raw.cpp ) - target_link_libraries(load_bundle2raw-bin lbann ) - set_target_properties(load_bundle2raw-bin PROPERTIES OUTPUT_NAME load_bundle2raw) - - add_executable( compute_min_max_images-bin compute_min_max_images.cpp ) - target_link_libraries(compute_min_max_images-bin lbann ) - set_target_properties(compute_min_max_images-bin PROPERTIES OUTPUT_NAME compute_min_max_images) - - add_executable( compute_per_channel_image_avg_min_max-bin compute_per_channel_image_avg_min_max.cpp ) - target_link_libraries(compute_per_channel_image_avg_min_max-bin lbann ) - set_target_properties(compute_per_channel_image_avg_min_max-bin PROPERTIES OUTPUT_NAME compute_per_channel_image_avg_min_max) - - add_executable( load_balance-bin load_balance.cpp ) - target_link_libraries(load_balance-bin lbann ) - set_target_properties(load_balance-bin PROPERTIES OUTPUT_NAME load_balance) - - add_executable( check_for_duplicate_samples-bin check_for_duplicate_samples.cpp ) - target_link_libraries(check_for_duplicate_samples-bin lbann ) - set_target_properties(check_for_duplicate_samples-bin PROPERTIES OUTPUT_NAME check_for_duplicate_samples) - - add_executable( test_conduit_hdf5-bin test_conduit_hdf5.cpp ) - target_link_libraries(test_conduit_hdf5-bin lbann ) - set_target_properties(test_conduit_hdf5-bin PROPERTIES OUTPUT_NAME test_conduit_hdf5) - - add_executable( select_samples-bin select_samples.cpp ) - target_link_libraries(select_samples-bin lbann ) - set_target_properties(select_samples-bin PROPERTIES OUTPUT_NAME select_samples) - - add_executable( build_sample_id_mapping-bin build_sample_id_mapping.cpp ) - target_link_libraries(build_sample_id_mapping-bin lbann ) - set_target_properties(build_sample_id_mapping-bin PROPERTIES OUTPUT_NAME build_sample_id_mapping) - - add_executable( generate_corrupt_samples-bin generate_corrupt_samples.cpp ) - target_link_libraries(generate_corrupt_samples-bin lbann ) - set_target_properties(generate_corrupt_samples-bin PROPERTIES OUTPUT_NAME generate_corrupt_samples) - - add_executable( compute_hydra_normalization-bin compute_hydra_normalization.cpp ) - target_link_libraries(compute_hydra_normalization-bin lbann ) - 
set_target_properties(compute_hydra_normalization-bin PROPERTIES OUTPUT_NAME compute_hydra_normalization) - - add_executable( test_reading_speed-bin test_reading_speed.cpp ) - target_link_libraries(test_reading_speed-bin lbann ) - set_target_properties(test_reading_speed-bin PROPERTIES OUTPUT_NAME test_reading_speed) - - add_executable( convert-bin convert.cpp ) - target_link_libraries(convert-bin lbann ) - set_target_properties(convert-bin PROPERTIES OUTPUT_NAME convert) +# Add a target to control building all the utilities +add_custom_target(jag-utils) + +add_executable(build_index + EXCLUDE_FROM_ALL build_index.cpp) +target_link_libraries(build_index lbann) +add_dependencies(jag-utils build_index) + +add_executable(extract_random_samples + EXCLUDE_FROM_ALL extract_random_samples.cpp) +target_link_libraries(extract_random_samples lbann) +add_dependencies(jag-utils extract_random_samples) + +add_executable(dump_bundle + EXCLUDE_FROM_ALL dump_bundle.cpp) +target_link_libraries(dump_bundle lbann) +add_dependencies(jag-utils dump_bundle) + +add_executable(check_images + EXCLUDE_FROM_ALL check_images.cpp) +target_link_libraries(check_images lbann) +add_dependencies(jag-utils check_images) + +add_executable(detect_corruption + EXCLUDE_FROM_ALL detect_corruption.cpp) +target_link_libraries(detect_corruption lbann) +add_dependencies(jag-utils detect_corruption) + +add_executable(load_bundle2raw + EXCLUDE_FROM_ALL load_bundle2raw.cpp) +target_link_libraries(load_bundle2raw lbann) +add_dependencies(jag-utils load_bundle2raw) + +add_executable(compute_min_max_images + EXCLUDE_FROM_ALL compute_min_max_images.cpp) +target_link_libraries(compute_min_max_images lbann) +add_dependencies(jag-utils compute_min_max_images) + +add_executable(compute_per_channel_image_avg_min_max + EXCLUDE_FROM_ALL compute_per_channel_image_avg_min_max.cpp) +target_link_libraries(compute_per_channel_image_avg_min_max lbann) +add_dependencies(jag-utils compute_per_channel_image_avg_min_max) + +add_executable(load_balance + EXCLUDE_FROM_ALL load_balance.cpp) +target_link_libraries(load_balance lbann) +add_dependencies(jag-utils load_balance) + +add_executable(check_for_duplicate_samples + EXCLUDE_FROM_ALL check_for_duplicate_samples.cpp) +target_link_libraries(check_for_duplicate_samples lbann) +add_dependencies(jag-utils extract_random_samples) + +add_executable(test_conduit_hdf5 + EXCLUDE_FROM_ALL test_conduit_hdf5.cpp) +target_link_libraries(test_conduit_hdf5 lbann) +add_dependencies(jag-utils test_conduit_hdf5) + +add_executable(select_samples + EXCLUDE_FROM_ALL select_samples.cpp) +target_link_libraries(select_samples lbann) +add_dependencies(jag-utils select_samples) + +add_executable(build_sample_id_mapping + EXCLUDE_FROM_ALL build_sample_id_mapping.cpp) +target_link_libraries(build_sample_id_mapping lbann) +add_dependencies(jag-utils build_sample_id_mapping) + +add_executable(generate_corrupt_samples + EXCLUDE_FROM_ALL generate_corrupt_samples.cpp) +target_link_libraries(generate_corrupt_samples lbann) +add_dependencies(jag-utils generate_corrupt_samples) + +add_executable(compute_hydra_normalization + EXCLUDE_FROM_ALL compute_hydra_normalization.cpp) +target_link_libraries(compute_hydra_normalization lbann) +add_dependencies(jag-utils compute_hydra_normalization) + +add_executable(test_reading_speed + EXCLUDE_FROM_ALL test_reading_speed.cpp) +target_link_libraries(test_reading_speed lbann) +add_dependencies(jag-utils test_reading_speed) + +add_executable(convert + EXCLUDE_FROM_ALL convert.cpp) 
+target_link_libraries(convert lbann) +add_dependencies(jag-utils convert) # Install the binaries install( - TARGETS select_samples-bin build_sample_id_mapping-bin build_index-bin + TARGETS select_samples build_sample_id_mapping build_index EXPORT LBANNTargets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + OPTIONAL ) + +# The use of `OPTIONAL` here will trigger CMake warnings. These can +# safely be ignored and tests confirm that. See these for more info: +# +# https://gitlab.kitware.com/cmake/cmake/issues/18258 +# https://cmake.org/pipermail/cmake/2011-August/046014.html diff --git a/superbuild/lbann/CMakeLists.txt b/superbuild/lbann/CMakeLists.txt index 0c65f388580..80f202783e8 100644 --- a/superbuild/lbann/CMakeLists.txt +++ b/superbuild/lbann/CMakeLists.txt @@ -187,5 +187,14 @@ ExternalProject_Add(LBANN ${LBANN_CMAKE_ARGS} ) +# Ensure the JAG utils are built +ExternalProject_Add_Step(LBANN build-jag-utils + COMMAND ${CMAKE_COMMAND} --build --config $ --target jag-utils + COMMENT "Performing building of JAG utils for 'LBANN'" + DEPENDEES build + DEPENDERS install + LOG 1 + USES_TERMINAL 1) + set(LBANN_DIR ${LBANN_CMAKE_INSTALL_PREFIX} CACHE INTERNAL "The install prefix of LBANN.") From 697077ebcad8b82f10f0cd2f19c6e46ddfe99bf7 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 18 Oct 2019 10:59:22 -0700 Subject: [PATCH 349/634] Revert "Make jag-utils optional in the build/install (#1263)" This reverts commit 5ec550436907769d070d7bb35cf1e74298ad9afb. --- docs/build_with_cmake.rst | 22 ---- model_zoo/jag_utils/CMakeLists.txt | 163 ++++++++++++----------------- superbuild/lbann/CMakeLists.txt | 9 -- 3 files changed, 68 insertions(+), 126 deletions(-) diff --git a/docs/build_with_cmake.rst b/docs/build_with_cmake.rst index dbecc85bb2d..c6722fa956a 100644 --- a/docs/build_with_cmake.rst +++ b/docs/build_with_cmake.rst @@ -224,28 +224,6 @@ documentation of the packages that are causing the issues as they may require additional CMake/environment flags to be set before properly resolving. ------------------------------- -Building JAG utilities ------------------------------- -The JAG utility executables are not part of the `all` target. In order -to use or install them, they must be built using the `jag-utils` -target. In order to install them, this must be done before installing. - -.. code-block:: bash - - # Configure LBANN - cmake /path/to/lbann - - # Build main LBANN library and front-ends - cmake --build . - - # If JAG utilities are required, build them - cmake --build . --target jag-utils - - # Install all (built) targets - cmake --build . 
--target install - - ------------------------------ Example CMake invocation ------------------------------ diff --git a/model_zoo/jag_utils/CMakeLists.txt b/model_zoo/jag_utils/CMakeLists.txt index 8f459718193..794ee8e61b8 100644 --- a/model_zoo/jag_utils/CMakeLists.txt +++ b/model_zoo/jag_utils/CMakeLists.txt @@ -1,104 +1,77 @@ -# Add a target to control building all the utilities -add_custom_target(jag-utils) - -add_executable(build_index - EXCLUDE_FROM_ALL build_index.cpp) -target_link_libraries(build_index lbann) -add_dependencies(jag-utils build_index) - -add_executable(extract_random_samples - EXCLUDE_FROM_ALL extract_random_samples.cpp) -target_link_libraries(extract_random_samples lbann) -add_dependencies(jag-utils extract_random_samples) - -add_executable(dump_bundle - EXCLUDE_FROM_ALL dump_bundle.cpp) -target_link_libraries(dump_bundle lbann) -add_dependencies(jag-utils dump_bundle) - -add_executable(check_images - EXCLUDE_FROM_ALL check_images.cpp) -target_link_libraries(check_images lbann) -add_dependencies(jag-utils check_images) - -add_executable(detect_corruption - EXCLUDE_FROM_ALL detect_corruption.cpp) -target_link_libraries(detect_corruption lbann) -add_dependencies(jag-utils detect_corruption) - -add_executable(load_bundle2raw - EXCLUDE_FROM_ALL load_bundle2raw.cpp) -target_link_libraries(load_bundle2raw lbann) -add_dependencies(jag-utils load_bundle2raw) - -add_executable(compute_min_max_images - EXCLUDE_FROM_ALL compute_min_max_images.cpp) -target_link_libraries(compute_min_max_images lbann) -add_dependencies(jag-utils compute_min_max_images) - -add_executable(compute_per_channel_image_avg_min_max - EXCLUDE_FROM_ALL compute_per_channel_image_avg_min_max.cpp) -target_link_libraries(compute_per_channel_image_avg_min_max lbann) -add_dependencies(jag-utils compute_per_channel_image_avg_min_max) - -add_executable(load_balance - EXCLUDE_FROM_ALL load_balance.cpp) -target_link_libraries(load_balance lbann) -add_dependencies(jag-utils load_balance) - -add_executable(check_for_duplicate_samples - EXCLUDE_FROM_ALL check_for_duplicate_samples.cpp) -target_link_libraries(check_for_duplicate_samples lbann) -add_dependencies(jag-utils extract_random_samples) - -add_executable(test_conduit_hdf5 - EXCLUDE_FROM_ALL test_conduit_hdf5.cpp) -target_link_libraries(test_conduit_hdf5 lbann) -add_dependencies(jag-utils test_conduit_hdf5) - -add_executable(select_samples - EXCLUDE_FROM_ALL select_samples.cpp) -target_link_libraries(select_samples lbann) -add_dependencies(jag-utils select_samples) - -add_executable(build_sample_id_mapping - EXCLUDE_FROM_ALL build_sample_id_mapping.cpp) -target_link_libraries(build_sample_id_mapping lbann) -add_dependencies(jag-utils build_sample_id_mapping) - -add_executable(generate_corrupt_samples - EXCLUDE_FROM_ALL generate_corrupt_samples.cpp) -target_link_libraries(generate_corrupt_samples lbann) -add_dependencies(jag-utils generate_corrupt_samples) - -add_executable(compute_hydra_normalization - EXCLUDE_FROM_ALL compute_hydra_normalization.cpp) -target_link_libraries(compute_hydra_normalization lbann) -add_dependencies(jag-utils compute_hydra_normalization) - -add_executable(test_reading_speed - EXCLUDE_FROM_ALL test_reading_speed.cpp) -target_link_libraries(test_reading_speed lbann) -add_dependencies(jag-utils test_reading_speed) - -add_executable(convert - EXCLUDE_FROM_ALL convert.cpp) -target_link_libraries(convert lbann) -add_dependencies(jag-utils convert) + add_executable( build_index-bin build_index.cpp ) + target_link_libraries(build_index-bin 
lbann ) + set_target_properties(build_index-bin PROPERTIES OUTPUT_NAME build_index) + + add_executable( extract_random_samples-bin extract_random_samples.cpp ) + target_link_libraries(extract_random_samples-bin lbann ) + set_target_properties(extract_random_samples-bin PROPERTIES OUTPUT_NAME extract_random_samples) + + add_executable( dump_bundle-bin dump_bundle.cpp ) + target_link_libraries(dump_bundle-bin lbann ) + set_target_properties(dump_bundle-bin PROPERTIES OUTPUT_NAME dump_bundle) + + add_executable( check_images-bin check_images.cpp ) + target_link_libraries(check_images-bin lbann ) + set_target_properties(check_images-bin PROPERTIES OUTPUT_NAME check_images) + + add_executable( detect_corruption-bin detect_corruption.cpp ) + target_link_libraries(detect_corruption-bin lbann ) + set_target_properties(detect_corruption-bin PROPERTIES OUTPUT_NAME detect_corruption) + + add_executable( load_bundle2raw-bin load_bundle2raw.cpp ) + target_link_libraries(load_bundle2raw-bin lbann ) + set_target_properties(load_bundle2raw-bin PROPERTIES OUTPUT_NAME load_bundle2raw) + + add_executable( compute_min_max_images-bin compute_min_max_images.cpp ) + target_link_libraries(compute_min_max_images-bin lbann ) + set_target_properties(compute_min_max_images-bin PROPERTIES OUTPUT_NAME compute_min_max_images) + + add_executable( compute_per_channel_image_avg_min_max-bin compute_per_channel_image_avg_min_max.cpp ) + target_link_libraries(compute_per_channel_image_avg_min_max-bin lbann ) + set_target_properties(compute_per_channel_image_avg_min_max-bin PROPERTIES OUTPUT_NAME compute_per_channel_image_avg_min_max) + + add_executable( load_balance-bin load_balance.cpp ) + target_link_libraries(load_balance-bin lbann ) + set_target_properties(load_balance-bin PROPERTIES OUTPUT_NAME load_balance) + + add_executable( check_for_duplicate_samples-bin check_for_duplicate_samples.cpp ) + target_link_libraries(check_for_duplicate_samples-bin lbann ) + set_target_properties(check_for_duplicate_samples-bin PROPERTIES OUTPUT_NAME check_for_duplicate_samples) + + add_executable( test_conduit_hdf5-bin test_conduit_hdf5.cpp ) + target_link_libraries(test_conduit_hdf5-bin lbann ) + set_target_properties(test_conduit_hdf5-bin PROPERTIES OUTPUT_NAME test_conduit_hdf5) + + add_executable( select_samples-bin select_samples.cpp ) + target_link_libraries(select_samples-bin lbann ) + set_target_properties(select_samples-bin PROPERTIES OUTPUT_NAME select_samples) + + add_executable( build_sample_id_mapping-bin build_sample_id_mapping.cpp ) + target_link_libraries(build_sample_id_mapping-bin lbann ) + set_target_properties(build_sample_id_mapping-bin PROPERTIES OUTPUT_NAME build_sample_id_mapping) + + add_executable( generate_corrupt_samples-bin generate_corrupt_samples.cpp ) + target_link_libraries(generate_corrupt_samples-bin lbann ) + set_target_properties(generate_corrupt_samples-bin PROPERTIES OUTPUT_NAME generate_corrupt_samples) + + add_executable( compute_hydra_normalization-bin compute_hydra_normalization.cpp ) + target_link_libraries(compute_hydra_normalization-bin lbann ) + set_target_properties(compute_hydra_normalization-bin PROPERTIES OUTPUT_NAME compute_hydra_normalization) + + add_executable( test_reading_speed-bin test_reading_speed.cpp ) + target_link_libraries(test_reading_speed-bin lbann ) + set_target_properties(test_reading_speed-bin PROPERTIES OUTPUT_NAME test_reading_speed) + + add_executable( convert-bin convert.cpp ) + target_link_libraries(convert-bin lbann ) + set_target_properties(convert-bin 
PROPERTIES OUTPUT_NAME convert) # Install the binaries install( - TARGETS select_samples build_sample_id_mapping build_index + TARGETS select_samples-bin build_sample_id_mapping-bin build_index-bin EXPORT LBANNTargets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} - OPTIONAL ) - -# The use of `OPTIONAL` here will trigger CMake warnings. These can -# safely be ignored and tests confirm that. See these for more info: -# -# https://gitlab.kitware.com/cmake/cmake/issues/18258 -# https://cmake.org/pipermail/cmake/2011-August/046014.html diff --git a/superbuild/lbann/CMakeLists.txt b/superbuild/lbann/CMakeLists.txt index 80f202783e8..0c65f388580 100644 --- a/superbuild/lbann/CMakeLists.txt +++ b/superbuild/lbann/CMakeLists.txt @@ -187,14 +187,5 @@ ExternalProject_Add(LBANN ${LBANN_CMAKE_ARGS} ) -# Ensure the JAG utils are built -ExternalProject_Add_Step(LBANN build-jag-utils - COMMAND ${CMAKE_COMMAND} --build --config $ --target jag-utils - COMMENT "Performing building of JAG utils for 'LBANN'" - DEPENDEES build - DEPENDERS install - LOG 1 - USES_TERMINAL 1) - set(LBANN_DIR ${LBANN_CMAKE_INSTALL_PREFIX} CACHE INTERNAL "The install prefix of LBANN.") From 71564caccc10f7ea44bc3d44fbde92670666c697 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 18 Oct 2019 11:08:02 -0700 Subject: [PATCH 350/634] Utility functions for hashing (#1303) * Add utility functions for hashing * Use hashing utility functions in trainer Closes #1222 * Use hashing utility functions when mixing RNG seeds --- include/lbann/trainers/trainer.hpp | 20 +++----- include/lbann/utils/hash.hpp | 81 ++++++++++++++++++++++++++++++ src/utils/random.cpp | 32 ++++++------ src/utils/unit_test/CMakeLists.txt | 1 + src/utils/unit_test/hash_test.cpp | 49 ++++++++++++++++++ 5 files changed, 156 insertions(+), 27 deletions(-) create mode 100644 include/lbann/utils/hash.hpp create mode 100644 src/utils/unit_test/hash_test.cpp diff --git a/include/lbann/trainers/trainer.hpp b/include/lbann/trainers/trainer.hpp index 3b90b9cd648..a7e2c6f1dee 100644 --- a/include/lbann/trainers/trainer.hpp +++ b/include/lbann/trainers/trainer.hpp @@ -33,6 +33,7 @@ #include "lbann/execution_contexts/execution_context.hpp" #include "lbann/io/persist.hpp" #include "lbann/utils/threads/thread_pool.hpp" +#include "lbann/utils/hash.hpp" #include #include #include @@ -45,17 +46,6 @@ class lbann_callback; class training_algorithm; class termination_criteria; -/** Create a hash function for hashing a std::pair type */ -struct pair_hash -{ - template - std::size_t operator() (const std::pair &pair) const - { - using underlying_t = typename std::underlying_type::type; - return std::hash()(pair.first) ^ std::hash()(static_cast(pair.second)); - } -}; - /** Represents an LBANN trainer and its context. 
*/ class trainer { public: @@ -151,10 +141,16 @@ class trainer { /** Flag that allows input layers to fetch data in the background */ bool m_background_io_allowed; + /** Hash function for @c m_model_execution_context */ + using model_execution_context_hash_t = pair_hash, + execution_mode, + std::hash>, + enum_hash>; + /** @brief Map from model and execution mode to its execution context */ std::unordered_map, execution_mode>, std::unique_ptr, - pair_hash> m_model_execution_context; + model_execution_context_hash_t> m_model_execution_context; }; } // namespace lbann diff --git a/include/lbann/utils/hash.hpp b/include/lbann/utils/hash.hpp new file mode 100644 index 00000000000..20ff50175bf --- /dev/null +++ b/include/lbann/utils/hash.hpp @@ -0,0 +1,81 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_UTILS_HASH_HPP_INCLUDED +#define LBANN_UTILS_HASH_HPP_INCLUDED + +#include +#include +#include + +namespace lbann { + +/** @brief Combine two hash values + * + * A hash function is applied to an object and the resulting hash + * value is mixed with another hash value. See + * https://www.boost.org/doc/libs/1_55_0/doc/html/hash/reference.html#boost.hash_combine. + * + * @param seed Hash value. + * @param val Input to hash function. + * @tparam Hash Hash function for type @c T. + */ +template > +std::size_t hash_combine(std::size_t seed, const T& val) { + return seed ^ (Hash()(val) + 0x9e3779b9 + (seed << 6) + (seed >> 2)); +} + +/** @brief Hash function for enumeration type + * + * Equivalent to @c std::hash if the input is not an enumeration + * type. 
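+ *
+ * A usage sketch (an illustrative addition, not part of the original
+ * patch; assumes the standard unordered_set header is included):
+ * @code
+ * enum class Humor { PHLEGMATIC, CHOLERIC, SANGUINE, MELANCHOLIC };
+ * std::unordered_set<Humor, enum_hash<Humor>> humors;
+ * humors.insert(Humor::SANGUINE);
+ * @endcode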
+ */ +template +struct enum_hash { + using underlying_t + = typename std::conditional::value, + typename std::underlying_type::type, + T>::type; + std::size_t operator()(T val) const { + return std::hash()(static_cast(val)); + } +}; + +/** @brief Hash function for @c std::pair */ +template , + class Hash2=std::hash> +struct pair_hash { + std::size_t operator()(const std::pair& val) const { + auto seed = Hash1()(val.first); + return hash_combine(seed, val.second); + } +}; + +} // namespace lbann + +#endif // LBANN_UTILS_HASH_HPP_INCLUDED diff --git a/src/utils/random.cpp b/src/utils/random.cpp index 4c53952f0c0..6421f8c491e 100644 --- a/src/utils/random.cpp +++ b/src/utils/random.cpp @@ -27,6 +27,7 @@ #include #include "lbann/utils/random.hpp" #include "lbann/io/file_io.hpp" +#include "lbann/utils/hash.hpp" #include namespace { @@ -82,9 +83,8 @@ rng_gen& get_data_seq_generator() { rng_gen& get_io_generator() { if (!::io_generator_inited) { - std::hash h; - ::io_generator.seed((::io_generator_seed_base << 8) | - h(std::this_thread::get_id())); + ::io_generator.seed(hash_combine(::io_generator_seed_base, + std::this_thread::get_id())); ::io_generator_inited = true; } return ::io_generator; @@ -92,9 +92,8 @@ rng_gen& get_io_generator() { fast_rng_gen& get_fast_io_generator() { if (!::fast_io_generator_inited) { - std::hash h; - ::fast_io_generator.seed((::fast_io_generator_seed_base << 8) | - h(std::this_thread::get_id())); + ::fast_io_generator.seed(hash_combine(::fast_io_generator_seed_base, + std::this_thread::get_id())); ::fast_io_generator_inited = true; } return ::fast_io_generator; @@ -245,20 +244,23 @@ void init_random(int seed, lbann_comm *comm) { #ifdef _OPENMP #pragma omp parallel { - get_generator().seed((seed << 8) | omp_get_thread_num()); - get_fast_generator().seed((seed << 8) | omp_get_thread_num()); + get_generator().seed(hash_combine(seed, omp_get_thread_num())); + get_fast_generator().seed(hash_combine(seed, omp_get_thread_num())); } #else get_generator().seed(seed); get_fast_generator().seed(seed); #endif + #ifdef LBANN_SET_EL_RNG - if (comm != nullptr) { - El::Generator().seed(seed ^ comm->get_rank_in_trainer()); - } else { - El::Generator().seed(seed ^ El::mpi::Rank(El::mpi::COMM_WORLD)); - } + // Set Elemental's RNG seed + auto elemental_seed = hash_combine(seed, 104729); // 10000th prime + elemental_seed = (comm == nullptr + ? hash_combine(elemental_seed, El::mpi::Rank(El::mpi::COMM_WORLD)) + : hash_combine(elemental_seed, comm->get_rank_in_trainer())); + El::Generator().seed(elemental_seed); #endif + } else { // Seed with a random value. 
std::random_device rd; @@ -266,8 +268,8 @@ void init_random(int seed, lbann_comm *comm) { #ifdef _OPENMP #pragma omp parallel { - get_generator().seed((rand_val << 8) | omp_get_thread_num()); - get_fast_generator().seed((rand_val << 8) | omp_get_thread_num()); + get_generator().seed(hash_combine(rand_val, omp_get_thread_num())); + get_fast_generator().seed(hash_combine(rand_val, omp_get_thread_num())); } #else get_generator().seed(rand_val); diff --git a/src/utils/unit_test/CMakeLists.txt b/src/utils/unit_test/CMakeLists.txt index 343bcbbb753..78043ef5ca1 100644 --- a/src/utils/unit_test/CMakeLists.txt +++ b/src/utils/unit_test/CMakeLists.txt @@ -2,6 +2,7 @@ set_full_path(_DIR_LBANN_CATCH2_TEST_FILES any_test.cpp beta_distribution_test.cpp factory_test.cpp + hash_test.cpp image_test.cpp python_test.cpp random_test.cpp diff --git a/src/utils/unit_test/hash_test.cpp b/src/utils/unit_test/hash_test.cpp new file mode 100644 index 00000000000..7de4b81802a --- /dev/null +++ b/src/utils/unit_test/hash_test.cpp @@ -0,0 +1,49 @@ +// MUST include this +#include + +// File being tested +#include + +#include + +TEST_CASE ("Testing convenience functions for hashing", "[hash][utilities]") { + + SECTION ("hash_combine") { + std::unordered_set hashes; + for (size_t seed=0; seed<10; ++seed) { + hashes.insert(seed); + } + for (size_t seed=0; seed<=16; seed+=2) { + for (int val=-49; val<=49; val+=7) { + const auto hash = lbann::hash_combine(seed, val); + CHECK_FALSE(hashes.count(hash)); + hashes.insert(hash); + } + } + } + + SECTION ("enum_hash") { + enum class Humor { PHLEGMATIC, CHOLERIC, SANGUINE, MELANCHOLIC }; + std::vector enum_list = { Humor::MELANCHOLIC, Humor::SANGUINE, + Humor::CHOLERIC, Humor::PHLEGMATIC }; + std::unordered_set hashes; + for (size_t i=0; i()(enum_list[i]); + CHECK_FALSE(hashes.count(hash)); + hashes.insert(hash); + } + } + + SECTION ("pair_hash") { + std::unordered_set hashes; + for (char i=-12; i<=12; i+=3) { + for (unsigned long j=0; j<=11209; j+=1019) { + std::pair val(i,j); + const auto hash = lbann::pair_hash()(val); + CHECK_FALSE(hashes.count(hash)); + hashes.insert(hash); + } + } + } + +} From 24b1ddadc8e51c85ff166a0f2dca47b8410febf4 Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Fri, 18 Oct 2019 13:14:58 -0700 Subject: [PATCH 351/634] Redo of #1263 (#1309) * make jag-utils optional in the main build * fix the position of "OPTIONAL" in the install * Update docs to reflect change in JAG utils * Update superbuild to build JAG utils --- docs/build_with_cmake.rst | 22 ++++ model_zoo/jag_utils/CMakeLists.txt | 163 +++++++++++++++++------------ superbuild/lbann/CMakeLists.txt | 9 ++ 3 files changed, 126 insertions(+), 68 deletions(-) diff --git a/docs/build_with_cmake.rst b/docs/build_with_cmake.rst index c6722fa956a..45ba7464dba 100644 --- a/docs/build_with_cmake.rst +++ b/docs/build_with_cmake.rst @@ -224,6 +224,28 @@ documentation of the packages that are causing the issues as they may require additional CMake/environment flags to be set before properly resolving. +------------------------------ +Building JAG utilities +------------------------------ +The JAG utility executables are not part of the `all` target. In order +to use or install them, they must be built using the `jag-utils` +target. In order to install them, this must be done before installing. + +.. code-block:: bash + + # Configure LBANN + cmake /path/to/lbann + + # Build main LBANN library and front-ends + cmake --build . 
+ + # If JAG utilities are required, build them + cmake --build . --target jag-utils + + # Install all (built) targets + cmake --build . --target install + + ------------------------------ Example CMake invocation ------------------------------ diff --git a/model_zoo/jag_utils/CMakeLists.txt b/model_zoo/jag_utils/CMakeLists.txt index 794ee8e61b8..98b95212085 100644 --- a/model_zoo/jag_utils/CMakeLists.txt +++ b/model_zoo/jag_utils/CMakeLists.txt @@ -1,77 +1,104 @@ - add_executable( build_index-bin build_index.cpp ) - target_link_libraries(build_index-bin lbann ) - set_target_properties(build_index-bin PROPERTIES OUTPUT_NAME build_index) - - add_executable( extract_random_samples-bin extract_random_samples.cpp ) - target_link_libraries(extract_random_samples-bin lbann ) - set_target_properties(extract_random_samples-bin PROPERTIES OUTPUT_NAME extract_random_samples) - - add_executable( dump_bundle-bin dump_bundle.cpp ) - target_link_libraries(dump_bundle-bin lbann ) - set_target_properties(dump_bundle-bin PROPERTIES OUTPUT_NAME dump_bundle) - - add_executable( check_images-bin check_images.cpp ) - target_link_libraries(check_images-bin lbann ) - set_target_properties(check_images-bin PROPERTIES OUTPUT_NAME check_images) - - add_executable( detect_corruption-bin detect_corruption.cpp ) - target_link_libraries(detect_corruption-bin lbann ) - set_target_properties(detect_corruption-bin PROPERTIES OUTPUT_NAME detect_corruption) - - add_executable( load_bundle2raw-bin load_bundle2raw.cpp ) - target_link_libraries(load_bundle2raw-bin lbann ) - set_target_properties(load_bundle2raw-bin PROPERTIES OUTPUT_NAME load_bundle2raw) - - add_executable( compute_min_max_images-bin compute_min_max_images.cpp ) - target_link_libraries(compute_min_max_images-bin lbann ) - set_target_properties(compute_min_max_images-bin PROPERTIES OUTPUT_NAME compute_min_max_images) - - add_executable( compute_per_channel_image_avg_min_max-bin compute_per_channel_image_avg_min_max.cpp ) - target_link_libraries(compute_per_channel_image_avg_min_max-bin lbann ) - set_target_properties(compute_per_channel_image_avg_min_max-bin PROPERTIES OUTPUT_NAME compute_per_channel_image_avg_min_max) - - add_executable( load_balance-bin load_balance.cpp ) - target_link_libraries(load_balance-bin lbann ) - set_target_properties(load_balance-bin PROPERTIES OUTPUT_NAME load_balance) - - add_executable( check_for_duplicate_samples-bin check_for_duplicate_samples.cpp ) - target_link_libraries(check_for_duplicate_samples-bin lbann ) - set_target_properties(check_for_duplicate_samples-bin PROPERTIES OUTPUT_NAME check_for_duplicate_samples) - - add_executable( test_conduit_hdf5-bin test_conduit_hdf5.cpp ) - target_link_libraries(test_conduit_hdf5-bin lbann ) - set_target_properties(test_conduit_hdf5-bin PROPERTIES OUTPUT_NAME test_conduit_hdf5) - - add_executable( select_samples-bin select_samples.cpp ) - target_link_libraries(select_samples-bin lbann ) - set_target_properties(select_samples-bin PROPERTIES OUTPUT_NAME select_samples) - - add_executable( build_sample_id_mapping-bin build_sample_id_mapping.cpp ) - target_link_libraries(build_sample_id_mapping-bin lbann ) - set_target_properties(build_sample_id_mapping-bin PROPERTIES OUTPUT_NAME build_sample_id_mapping) - - add_executable( generate_corrupt_samples-bin generate_corrupt_samples.cpp ) - target_link_libraries(generate_corrupt_samples-bin lbann ) - set_target_properties(generate_corrupt_samples-bin PROPERTIES OUTPUT_NAME generate_corrupt_samples) - - add_executable( 
compute_hydra_normalization-bin compute_hydra_normalization.cpp )
-  target_link_libraries(compute_hydra_normalization-bin lbann )
-  set_target_properties(compute_hydra_normalization-bin PROPERTIES OUTPUT_NAME compute_hydra_normalization)
-
-  add_executable( test_reading_speed-bin test_reading_speed.cpp )
-  target_link_libraries(test_reading_speed-bin lbann )
-  set_target_properties(test_reading_speed-bin PROPERTIES OUTPUT_NAME test_reading_speed)
-
-  add_executable( convert-bin convert.cpp )
-  target_link_libraries(convert-bin lbann )
-  set_target_properties(convert-bin PROPERTIES OUTPUT_NAME convert)
+# Add a target to control building all the utilities
+add_custom_target(jag-utils)
+
+add_executable(build_index
+  EXCLUDE_FROM_ALL build_index.cpp)
+target_link_libraries(build_index lbann)
+add_dependencies(jag-utils build_index)
+
+add_executable(extract_random_samples
+  EXCLUDE_FROM_ALL extract_random_samples.cpp)
+target_link_libraries(extract_random_samples lbann)
+add_dependencies(jag-utils extract_random_samples)
+
+add_executable(dump_bundle
+  EXCLUDE_FROM_ALL dump_bundle.cpp)
+target_link_libraries(dump_bundle lbann)
+add_dependencies(jag-utils dump_bundle)
+
+add_executable(check_images
+  EXCLUDE_FROM_ALL check_images.cpp)
+target_link_libraries(check_images lbann)
+add_dependencies(jag-utils check_images)
+
+add_executable(detect_corruption
+  EXCLUDE_FROM_ALL detect_corruption.cpp)
+target_link_libraries(detect_corruption lbann)
+add_dependencies(jag-utils detect_corruption)
+
+add_executable(load_bundle2raw
+  EXCLUDE_FROM_ALL load_bundle2raw.cpp)
+target_link_libraries(load_bundle2raw lbann)
+add_dependencies(jag-utils load_bundle2raw)
+
+add_executable(compute_min_max_images
+  EXCLUDE_FROM_ALL compute_min_max_images.cpp)
+target_link_libraries(compute_min_max_images lbann)
+add_dependencies(jag-utils compute_min_max_images)
+
+add_executable(compute_per_channel_image_avg_min_max
+  EXCLUDE_FROM_ALL compute_per_channel_image_avg_min_max.cpp)
+target_link_libraries(compute_per_channel_image_avg_min_max lbann)
+add_dependencies(jag-utils compute_per_channel_image_avg_min_max)
+
+add_executable(load_balance
+  EXCLUDE_FROM_ALL load_balance.cpp)
+target_link_libraries(load_balance lbann)
+add_dependencies(jag-utils load_balance)
+
+add_executable(check_for_duplicate_samples
+  EXCLUDE_FROM_ALL check_for_duplicate_samples.cpp)
+target_link_libraries(check_for_duplicate_samples lbann)
+add_dependencies(jag-utils check_for_duplicate_samples)
+
+add_executable(test_conduit_hdf5
+  EXCLUDE_FROM_ALL test_conduit_hdf5.cpp)
+target_link_libraries(test_conduit_hdf5 lbann)
+add_dependencies(jag-utils test_conduit_hdf5)
+
+add_executable(select_samples
+  EXCLUDE_FROM_ALL select_samples.cpp)
+target_link_libraries(select_samples lbann)
+add_dependencies(jag-utils select_samples)
+
+add_executable(build_sample_id_mapping
+  EXCLUDE_FROM_ALL build_sample_id_mapping.cpp)
+target_link_libraries(build_sample_id_mapping lbann)
+add_dependencies(jag-utils build_sample_id_mapping)
+
+add_executable(generate_corrupt_samples
+  EXCLUDE_FROM_ALL generate_corrupt_samples.cpp)
+target_link_libraries(generate_corrupt_samples lbann)
+add_dependencies(jag-utils generate_corrupt_samples)
+
+add_executable(compute_hydra_normalization
+  EXCLUDE_FROM_ALL compute_hydra_normalization.cpp)
+target_link_libraries(compute_hydra_normalization lbann)
+add_dependencies(jag-utils compute_hydra_normalization)
+
+add_executable(test_reading_speed
+  EXCLUDE_FROM_ALL test_reading_speed.cpp)
+target_link_libraries(test_reading_speed
lbann)
+add_dependencies(jag-utils test_reading_speed)
+
+add_executable(convert
+  EXCLUDE_FROM_ALL convert.cpp)
+target_link_libraries(convert lbann)
+add_dependencies(jag-utils convert)

 # Install the binaries
 install(
-  TARGETS select_samples-bin build_sample_id_mapping-bin build_index-bin
+  TARGETS select_samples build_sample_id_mapping build_index
+  OPTIONAL
   EXPORT LBANNTargets
   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
   INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
   )
+
+# The use of `OPTIONAL` here will trigger CMake warnings. These can
+# safely be ignored and tests confirm that. See these for more info:
+#
+# https://gitlab.kitware.com/cmake/cmake/issues/18258
+# https://cmake.org/pipermail/cmake/2011-August/046014.html
diff --git a/superbuild/lbann/CMakeLists.txt b/superbuild/lbann/CMakeLists.txt
index 0c65f388580..80f202783e8 100644
--- a/superbuild/lbann/CMakeLists.txt
+++ b/superbuild/lbann/CMakeLists.txt
@@ -187,5 +187,14 @@ ExternalProject_Add(LBANN
   ${LBANN_CMAKE_ARGS}
   )

+# Ensure the JAG utils are built
+ExternalProject_Add_Step(LBANN build-jag-utils
+  COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --config $<CONFIG> --target jag-utils
+  COMMENT "Performing building of JAG utils for 'LBANN'"
+  DEPENDEES build
+  DEPENDERS install
+  LOG 1
+  USES_TERMINAL 1)
+
 set(LBANN_DIR ${LBANN_CMAKE_INSTALL_PREFIX} CACHE INTERNAL
   "The install prefix of LBANN.")
From 6b0870abb15bae85b35c5978c5ebc71ee492a551 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Fri, 18 Oct 2019 15:14:14 -0700
Subject: [PATCH 352/634] Replace MNIST unit tests with individual layer tests
 (#1295)

* Add Bamboo unit test for fully-connected layer

* Add Bamboo unit test for convolution layer

* Fix typo

* Update convolution unit test with strided convolution

* Update softmax layer unit test to use Python frontend

* Add Bamboo unit test for cross entropy layer

* Add Bamboo unit test for mean squared error layer

* Remove MNIST-based Bamboo unit tests

* Implement NumPy softmax in Bamboo test for softmax layer

SciPy 1.2.0 has softmax, but default SciPy on LC is 0.12.1

* Avoid NumPy RNG in Bamboo test for convolution layer

RNG values vary across NumPy versions, so precomputed metric values
were incorrect.
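A note on the checks used throughout the new tests: each metric is bounded
with a relative tolerance derived from float32 machine epsilon rather than a
fixed absolute bound. A minimal sketch of that pattern follows; check_metric
is an illustrative name, not a function from this patch:

    import numpy as np

    def check_metric(value, expected):
        # Scale the tolerance with the expected value and float32 epsilon,
        # mirroring the lower_bound/upper_bound passed to each
        # CallbackCheckMetric in the tests below.
        tol = 8 * expected * np.finfo(np.float32).eps
        assert expected - tol <= value <= expected + tol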
--- .../unit_tests/test_unit_layer_convolution.py | 331 ++++++++++++++++++ .../test_unit_layer_cross_entropy.py | 224 ++++++++++++ ...nit_layer_entrywise_batch_normalization.py | 2 +- .../test_unit_layer_entrywise_scale_bias.py | 2 +- .../test_unit_layer_fully_connected.py | 307 ++++++++++++++++ .../test_unit_layer_mean_squared_error.py | 222 ++++++++++++ bamboo/unit_tests/test_unit_layer_softmax.py | 258 +++++++++++--- .../unit_tests/test_unit_mnist_conv_graph.py | 61 ---- .../test_unit_mnist_ridge_regression.py | 155 -------- .../test_unit_mnist_softmax_classifier.py | 61 ---- .../tests/layer_tests/model_softmax.prototext | 99 ------ .../tests/model_mnist_conv_graph.prototext | 238 ------------- .../model_mnist_ridge_regression.prototext | 77 ---- .../model_mnist_softmax_classifier.prototext | 87 ----- 14 files changed, 1296 insertions(+), 828 deletions(-) create mode 100644 bamboo/unit_tests/test_unit_layer_convolution.py create mode 100644 bamboo/unit_tests/test_unit_layer_cross_entropy.py create mode 100644 bamboo/unit_tests/test_unit_layer_fully_connected.py create mode 100644 bamboo/unit_tests/test_unit_layer_mean_squared_error.py delete mode 100644 bamboo/unit_tests/test_unit_mnist_conv_graph.py delete mode 100644 bamboo/unit_tests/test_unit_mnist_ridge_regression.py delete mode 100644 bamboo/unit_tests/test_unit_mnist_softmax_classifier.py delete mode 100644 model_zoo/tests/layer_tests/model_softmax.prototext delete mode 100644 model_zoo/tests/model_mnist_conv_graph.prototext delete mode 100644 model_zoo/tests/model_mnist_ridge_regression.prototext delete mode 100644 model_zoo/tests/model_mnist_softmax_classifier.prototext diff --git a/bamboo/unit_tests/test_unit_layer_convolution.py b/bamboo/unit_tests/test_unit_layer_convolution.py new file mode 100644 index 00000000000..ba1faec3ed2 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_convolution.py @@ -0,0 +1,331 @@ +import functools +import math +import operator +import os +import os.path +import sys +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +def make_random_array(shape, seed): + """Hacked function to generate a random array. + + NumPy's RNG produces different values with different NumPy + versions. This function is helpful when array values must be + identical across all runs, e.g. when checking against precomputed + metric values. + + Args: + shape (Iterable of int): Array dimensions + seed (int): Parameter for RNG. Must be non-zero. + Returns: + numpy.ndarray: Array of `np.float32`. Values will be in + [-0.5,0.5). 
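+
+    A quick sanity check of this contract (derivable from the formula
+    below, independent of any RNG state):
+        >>> make_random_array([2, 3], 7).shape
+        (2, 3)
+        >>> np.array_equal(make_random_array([2, 3], 7),
+        ...                make_random_array([2, 3], 7))
+        True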
+ + """ + size = functools.reduce(operator.mul, shape) + eps = np.finfo(np.float32).eps + x = (seed / np.linspace(math.sqrt(eps), 0.1, size)) % 1 - 0.5 + return x.reshape(shape).astype(np.float32) + +# Data +_num_samples = 23 +_sample_dims = [6,11,7] +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = make_random_array([_num_samples] + _sample_dims, 7) + +# Sample access functions +def get_sample(index): + return _samples[index,:].reshape(-1) +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# PyTorch convolution +# ============================================== + +def pytorch_convolution(data, + kernel, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1): + """Wrapper around PyTorch convolution. + + Input and output data are NumPy arrays. + + """ + + # Convert input data to PyTorch tensors + import torch + import torch.nn.functional + if type(data) is np.ndarray: + data = torch.from_numpy(data.astype(np.float64)) + if type(kernel) is np.ndarray: + kernel = torch.from_numpy(kernel.astype(np.float64)) + if type(bias) is np.ndarray: + bias = torch.from_numpy(bias.astype(np.float64)) + + # Perform convolution with PyTorch + output = None + if len(kernel.shape) == 3: + output = torch.nn.functional.conv1d( + data, kernel, bias, stride, padding, dilation, groups + ) + if len(kernel.shape) == 4: + output = torch.nn.functional.conv2d( + data, kernel, bias, stride, padding, dilation, groups + ) + if len(kernel.shape) == 5: + output = torch.nn.functional.conv3d( + data, kernel, bias, stride, padding, dilation, groups + ) + if output is None: + raise ValueError('PyTorch only supports 1D, 2D, and 3D convolution') + + # Return output as NumPy array + return output.numpy() + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1).astype(np.float64) + return np.inner(x, x) + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x0 = lbann.WeightsLayer(weights=x_weights, + dims=str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Basic 3x3 convolution + # ------------------------------------------ + # 3x3 conv, stride=1, pad=1, dilation=1, bias + + # Convolution settings + kernel_dims = (5, _sample_dims[0], 3, 3) + strides = (1, 1) + pads = (1, 1) + dilations = (1, 1) + kernel = make_random_array(kernel_dims, 11) + bias = make_random_array([kernel_dims[0]], 123) + + # Apply convolution + kernel_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=str_list(np.nditer(kernel))), + name='kernel1' + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=str_list(np.nditer(bias))), + name='bias1' + ) + x = x_lbann + y = lbann.Convolution(x, + weights=(kernel_weights, bias_weights), + num_dims=3, + num_output_channels=kernel_dims[0], + has_vectors=True, + conv_dims=str_list(kernel_dims[2:]), + conv_strides=str_list(strides), + conv_pads=str_list(pads), + conv_dilations=str_list(dilations), + has_bias=True) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='basic 3x3 convolution')) + + # PyTorch implementation + try: + x = _samples + y = pytorch_convolution( + x, kernel, bias=bias, + stride=strides, padding=pads, dilation=dilations + ) + z = l2_norm2(y) / _num_samples + val = z + except: + # Precomputed value + val = 153.84937996554953 + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # 2x4 strided convolution + # ------------------------------------------ + + # Convolution settings + kernel_dims = (3, _sample_dims[0], 2, 4) + strides = (3, 1) + pads = (3, 0) + dilations = (1, 1) + num_groups = 1 + kernel = make_random_array(kernel_dims, 19) + + # Apply convolution + kernel_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=str_list(np.nditer(kernel))), + name='kernel2' + ) + x = x_lbann + y = lbann.Convolution(x, + weights=(kernel_weights), + num_dims=3, + num_output_channels=kernel_dims[0], + has_vectors=True, + conv_dims=str_list(kernel_dims[2:]), + conv_strides=str_list(strides), + conv_pads=str_list(pads), + conv_dilations=str_list(dilations), + num_groups=num_groups, + has_bias=False) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='2x4 convolution')) + + # PyTorch implementation + try: + x = _samples + y = pytorch_convolution( + x, kernel, bias=None, + stride=strides, padding=pads, + dilation=dilations, groups=num_groups + ) + z = l2_norm2(y) / _num_samples + val = z + except: + # Precomputed value + val = 19.24587403346207 + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + 
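+    # CallbackCheckGradients compares the gradients computed by
+    # backprop against finite-difference approximations of the
+    # objective function and fails the test if they disagree
+    # beyond tolerance.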
callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = 11 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_cross_entropy.py b/bamboo/unit_tests/test_unit_layer_cross_entropy.py new file mode 100644 index 00000000000..0fb25b25553 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_cross_entropy.py @@ -0,0 +1,224 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The error bounds for gradient checking assume that the fourth +# derivative of the objective function is ~1. However, given our loss +# function: +# L = ( -xhat * log(x) )^2 +# L'''' = O( xhat^2 * log(x) / x^4 ) +# We have x >= 0.25 to make sure the fourth derivative does not get +# too big and mess up the error bounds. +np.random.seed(201910143) +_samples = np.random.uniform(low=0.25, + high=1, + size=(13,2,7)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1) + return np.inner(x, x) + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum([x_slice, + lbann.WeightsLayer(weights=x0_weights, + dims=str(slice_size))]) + x1 = lbann.Sum([x_slice, + lbann.WeightsLayer(weights=x1_weights, + dims=str(slice_size))]) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.CrossEntropy([x0, x1], data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = -np.inner(x1, np.log(x0)) + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.CrossEntropy([x0, x1], data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = -np.inner(x1, np.log(x0)) + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = 11 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py index 33592ae3253..3f9d5cf7f74 100644 --- a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py +++ b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py @@ -115,7 +115,7 @@ def str_list(it): metrics.append(lbann.Metric(z, name='model-parallel output')) # ------------------------------------------ - # Gradient checkint + # Gradient checking # ------------------------------------------ callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py index c1816202f30..08053863192 100644 --- a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py +++ b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py @@ -155,7 +155,7 @@ def l2_norm2(x): execution_modes='test')) # ------------------------------------------ - # Gradient checkint + # Gradient checking # ------------------------------------------ callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) diff --git a/bamboo/unit_tests/test_unit_layer_fully_connected.py b/bamboo/unit_tests/test_unit_layer_fully_connected.py new file mode 100644 index 00000000000..609a51afd5d --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_fully_connected.py @@ -0,0 +1,307 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20191011) +_num_samples = 31 +_input_size = 11 +_output_size = 3 +_samples = np.random.normal(size=(_num_samples,_input_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_input_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_input_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Compute expected metric values with NumPy + # ------------------------------------------ + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1) + return np.inner(x, x) + + # Weight values + linearity = np.random.normal(size=(_output_size,_input_size)).astype(np.float32) + bias = np.random.normal(size=(_output_size,1)).astype(np.float32) + + # With bias + x = _samples.transpose() + y = np.matmul(linearity, x) + bias + z = l2_norm2(y) / _num_samples + val_with_bias = z + + # Without bias + x = _samples.transpose() + y = np.matmul(linearity, x) + z = l2_norm2(y) / _num_samples + val_without_bias = z + + # ------------------------------------------ + # Data-parallel layout, non-transpose, bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=str_list(np.nditer(linearity, order='F')) + ) + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=str_list(np.nditer(bias)) + ) + ) + x = x_lbann + y = lbann.FullyConnected(x, + weights=(linearity_weights, bias_weights), + data_layout='data_parallel', + num_neurons=_output_size, + has_bias=True, + transpose=False) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, non-transpose, bias')) + + # NumPy implementation + val = val_with_bias + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, non-transpose, bias + # ------------------------------------------ + + # LBANN implementation + linearity_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=str_list(np.nditer(linearity, order='F')) + ) + ) + bias_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer( + values=str_list(np.nditer(bias)) + ) + ) + x = x_lbann + y = lbann.FullyConnected(x, + weights=(linearity_weights, bias_weights), + data_layout='model_parallel', + num_neurons=_output_size, + has_bias=True, + transpose=False) + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, non-transpose, bias')) + + # NumPy implementation + val = val_with_bias + tol = 8 * val * 
np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Data-parallel layout, transpose, no bias
+    # ------------------------------------------
+
+    # LBANN implementation
+    linearity_weights = lbann.Weights(
+        optimizer=lbann.SGD(),
+        initializer=lbann.ValueInitializer(
+            values=str_list(np.nditer(linearity, order='C'))
+        )
+    )
+    x = x_lbann
+    y = lbann.FullyConnected(x,
+                             weights=linearity_weights,
+                             data_layout='data_parallel',
+                             num_neurons=_output_size,
+                             has_bias=False,
+                             transpose=True)
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='data-parallel layout, transpose, no bias'))
+
+    # NumPy implementation
+    val = val_without_bias
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Model-parallel layout, transpose, no bias
+    # ------------------------------------------
+
+    # LBANN implementation
+    linearity_weights = lbann.Weights(
+        optimizer=lbann.SGD(),
+        initializer=lbann.ValueInitializer(
+            values=str_list(np.nditer(linearity, order='C'))
+        )
+    )
+    x = x_lbann
+    y = lbann.FullyConnected(x,
+                             weights=linearity_weights,
+                             data_layout='model_parallel',
+                             num_neurons=_output_size,
+                             has_bias=False,
+                             transpose=True)
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='model-parallel layout, transpose, no bias'))
+
+    # NumPy implementation
+    val = val_without_bias
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Gradient checking
+    # ------------------------------------------
+
+    callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True))
+
+    # ------------------------------------------
+    # Construct model
+    # ------------------------------------------
+
+    mini_batch_size = 17
+    num_epochs = 0
+    return lbann.Model(mini_batch_size,
+                       num_epochs,
+                       layers=lbann.traverse_layer_graph(x_lbann),
+                       objective_function=obj,
+                       metrics=metrics,
+                       callbacks=callbacks)
+
+def construct_data_reader(lbann):
+    """Construct Protobuf message for Python data reader.
+
+    The Python data reader will import the current Python file to
+    access the sample access functions.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Note: The training data reader should be removed when
+    # https://github.com/LLNL/lbann/issues/1098 is resolved.
+    message = lbann.reader_pb2.DataReader()
+    message.reader.extend([
+        tools.create_python_data_reader(
+            lbann,
+            current_file,
+            'get_sample',
+            'num_samples',
+            'sample_dims',
+            'train'
+        )
+    ])
+    message.reader.extend([
+        tools.create_python_data_reader(
+            lbann,
+            current_file,
+            'get_sample',
+            'num_samples',
+            'sample_dims',
+            'test'
+        )
+    ])
+    return message
+
+# ==============================================
+# Setup PyTest
+# ==============================================
+
+# Create test functions that can interact with PyTest
+# Note: Create test name by removing ".py" from file name
+_test_name = os.path.splitext(os.path.basename(current_file))[0]
+for test in tools.create_tests(setup_experiment, _test_name):
+    globals()[test.__name__] = test
diff --git a/bamboo/unit_tests/test_unit_layer_mean_squared_error.py b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py
new file mode 100644
index 00000000000..02680e8fce4
--- /dev/null
+++ b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py
@@ -0,0 +1,222 @@
+import functools
+import operator
+import os
+import os.path
+import sys
+import numpy as np
+
+# Local files
+current_file = os.path.realpath(__file__)
+current_dir = os.path.dirname(current_file)
+sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python'))
+import tools
+
+# ==============================================
+# Objects for Python data reader
+# ==============================================
+# Note: The Python data reader imports this file as a module and calls
+# the functions below to ingest data.
+
+# Data
+# Note: Unlike the cross entropy test, the samples do not need to be
+# bounded away from zero. The objective function here is a degree-4
+# polynomial in the inputs, so its fourth derivative is bounded and
+# the error bounds for gradient checking stay well-behaved.
+np.random.seed(201910144)
+_samples = np.random.normal(size=(13,2,9)).astype(np.float32)
+
+# Sample access functions
+def get_sample(index):
+    return _samples[index].reshape(-1)
+def num_samples():
+    return _samples.shape[0]
+def sample_dims():
+    return (2*_samples.shape[-1],)
+
+# ==============================================
+# Setup LBANN experiment
+# ==============================================
+
+def setup_experiment(lbann):
+    """Construct LBANN experiment.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+    trainer = lbann.Trainer()
+    model = construct_model(lbann)
+    data_reader = construct_data_reader(lbann)
+    optimizer = lbann.NoOptimizer()
+    return trainer, model, data_reader, optimizer
+
+def construct_model(lbann):
+    """Construct LBANN model.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Convenience function to convert list to a space-separated string
+    def str_list(it):
+        return ' '.join([str(i) for i in it])
+
+    # Convenience function to compute L2 norm squared with NumPy
+    def l2_norm2(x):
+        x = x.reshape(-1)
+        return np.inner(x, x)
+
+    # Input data
+    # Note: Sum with weights layers so that gradient checking will
+    # verify that error signals are correct.
+ slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum([x_slice, + lbann.WeightsLayer(weights=x0_weights, + dims=str(slice_size))]) + x1 = lbann.Sum([x_slice, + lbann.WeightsLayer(weights=x1_weights, + dims=str(slice_size))]) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.MeanSquaredError([x0, x1], data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = l2_norm2(x0-x1) / slice_size + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.MeanSquaredError([x0, x1], data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = l2_norm2(x0-x1) / slice_size + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = 11 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
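+    # (These tests run with num_epochs=0, so the 'train' reader is
+    # registered only to satisfy the frontend; see the issue above.)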
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index 80a3d3f51a0..5d1e11b83b6 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -1,52 +1,214 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(201910142) +_num_samples = 19 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy softmax +# ============================================== + +def numpy_softmax(x): + """NumPy implementation of softmax. + + There is an implementation in SciPy 1.2.0 (scipy.special.softmax). + + """ + x = x.astype(np.float64) + y = np.exp(x - np.max(x)) + return y / np.sum(y) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1).astype(np.float64) + return np.inner(x, x) + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softmax(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = numpy_softmax(x) + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softmax(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = numpy_softmax(x) + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = 17 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_softmax(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_softmax: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_softmax_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_softmax_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='softmax', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_softmax_clang6(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_softmax(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_softmax_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_softmax(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) - - -def test_unit_layer_softmax_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_softmax(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_softmax_exe' --exe= -def test_unit_layer_softmax_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_softmax_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_softmax(cluster, exes, dirname, 'exe', weekly, data_reader_percent) +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_mnist_conv_graph.py b/bamboo/unit_tests/test_unit_mnist_conv_graph.py deleted file mode 100644 index 1ef04a0ce1a..00000000000 --- a/bamboo/unit_tests/test_unit_mnist_conv_graph.py +++ /dev/null @@ -1,61 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - - -def skeleton_mnist_conv_graph(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_mnist_conv_graph: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/mnist_conv_graph_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/mnist_conv_graph_%s_error.txt' % (dir_name, compiler_name) - if compiler_name == 
'gcc7': - tl = 240 - else: - tl = None - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, time_limit=tl, num_processes=1, - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', - data_reader_percent=data_reader_percent, - model_folder='tests', - model_name='mnist_conv_graph', - optimizer_name='adam', - output_file_name=output_file_name, - error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_mnist_conv_graph_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_mnist_conv_graph_gcc7(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'gcc7', - weekly, data_reader_percent) - - -def test_unit_mnist_conv_graph_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_mnist_conv_graph(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_conv_graph.py -k 'test_unit_mnist_conv_graph_exe' --exe= -def test_unit_mnist_conv_graph_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_mnist_conv_graph_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_mnist_conv_graph(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py b/bamboo/unit_tests/test_unit_mnist_ridge_regression.py deleted file mode 100644 index 60bee02df2b..00000000000 --- a/bamboo/unit_tests/test_unit_mnist_ridge_regression.py +++ /dev/null @@ -1,155 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools - -import os -import pytest - - -def skeleton_mnist_ridge_regression(cluster, executables, dir_name, - compiler_name, weekly, data_reader_percent): - tools.process_executable( - 'skeleton_mnist_ridge_regression', compiler_name, executables) - - if compiler_name == 'exe': - exe = executables[compiler_name] - bin_dir = os.path.dirname(exe) - install_dir = os.path.dirname(bin_dir) - build_path = '{i}/lib/python3.7/site-packages'.format(i=install_dir) - else: - if compiler_name == 'clang6': - path = 'clang.Release' - elif compiler_name == 'clang6_debug': - path = 'clang.Debug' - elif compiler_name == 'gcc7': - path = 'gnu.Release' - elif compiler_name == 'clang6_debug': - path = 'gnu.Debug' - elif compiler_name == 'intel19': - path = 'intel.Release' - elif compiler_name == 'intel19_debug': - path = 'intel.Debug' - path = '{p}.{c}.llnl.gov'.format(p=path, c=cluster) - build_path = '{d}/build/{p}/install/lib/python3.7/site-packages'.format( - d=dir_name, p=path) - print('build_path={b}'.format(b=build_path)) - sys.path.append(build_path) - - # Model - # Converted from lbann/model_zoo/tests/model_mnist_ridge_regression.prototext. - # Equivalent to prototext's "Layers" section. - import lbann - input_node = lbann.Input() - images_node = lbann.Identity(input_node) - image_labels_node = lbann.Identity(input_node) - fc_node = lbann.FullyConnected(images_node, num_neurons=10, has_bias=True) - mse = lbann.MeanSquaredError([fc_node, image_labels_node]) - # Equivalent to prototext's "Objective function" section. 
- layers = list(lbann.traverse_layer_graph(input_node)) - weights = set() - for l in layers: - weights.update(l.weights) - # scale == weight decay - l2_reg = lbann.L2WeightRegularization(weights=weights, scale=0.01) - objective_function = lbann.ObjectiveFunction([mse, l2_reg]) - # Equivalent to prototext's "Metrics" section. - metrics = [lbann.Metric(mse, name='mean squared error')] - # Equivalent to prototext's "Callbacks" section. - callbacks = [lbann.CallbackPrint(), - lbann.CallbackTimer(), - lbann.CallbackCheckGradients( - verbose=False, error_on_failure=True)] - # Equivalent to prototext's model-level parameters. - model = lbann.Model(mini_batch_size=131, - epochs=4, - layers=layers, - objective_function=objective_function, - metrics=metrics, - callbacks=callbacks) - - # Data Reader - # TODO: Do we also want to programatically construct the data reader, not just the model? - data_reader_prototext_file = os.path.join(dir_name, - 'model_zoo', - 'data_readers', - 'data_reader_mnist.prototext') - data_reader_proto = lbann.lbann_pb2.LbannPB() - with open(data_reader_prototext_file, 'r') as f: - import google.protobuf.text_format as txtf - txtf.Merge(f.read(), data_reader_proto) - data_reader_proto = data_reader_proto.data_reader - - # Optimizer - # Learning rate from model_zoo/optimizers/opt_adam.prototext - optimizer = lbann.optimizer.Adam(learn_rate=0.001, beta1=0.9, beta2=0.99, eps=1e-8) - - # kwargs - kwargs = { - 'account': 'guests', - 'nodes': 1, - 'partition': 'pbatch', - 'procs_per_node': 1 - } - - if data_reader_percent is None: - if weekly: - data_reader_percent = 1.00 - else: - # Nightly - data_reader_percent = 0.10 - lbann_args = '--data_reader_percent={drp}'.format(drp=data_reader_percent) - if cluster == 'lassen': - lbann_args += ' --data_filedir_train=/p/gpfs1/brainusr/datasets/MNIST --data_filedir_test=/p/gpfs1/brainusr/datasets/MNIST' - if cluster == 'ray': - lbann_args += ' --data_filedir_train=/p/gscratchr/brainusr/datasets/MNIST --data_filedir_test=/p/gscratchr/brainusr/datasets/MNIST' - kwargs['lbann_args'] = lbann_args - - # Run - experiment_dir = '{d}/bamboo/unit_tests/experiments/mnist_ridge_regression_{c}'.format( - d=dir_name, c=compiler_name) - # Setup trainer - trainer = lbann.Trainer() - import lbann.contrib.lc.launcher - return_code = lbann.contrib.lc.launcher.run( - trainer=trainer, - experiment_dir=experiment_dir, - model=model, - data_reader=data_reader_proto, - optimizer=optimizer, - overwrite_script=True, - job_name='lbann_ridge_regression', - **kwargs) - - error_file_name = '{e}/err.log'.format( - e=experiment_dir, c=compiler_name) - tools.assert_success(return_code, error_file_name) - - -def test_unit_mnist_ridge_regression_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_mnist_ridge_regression_gcc7(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'gcc7', - weekly, data_reader_percent) - - -def test_unit_mnist_ridge_regression_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_mnist_ridge_regression_exe' --exe= -def test_unit_mnist_ridge_regression_exe(cluster, dirname, exe, - weekly, data_reader_percent): - if exe is None: - e = 'test_unit_mnist_ridge_regression_exe: 
Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_mnist_ridge_regression(cluster, exes, dirname, 'exe', - weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py b/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py deleted file mode 100644 index a7bf98175bc..00000000000 --- a/bamboo/unit_tests/test_unit_mnist_softmax_classifier.py +++ /dev/null @@ -1,61 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - - -def skeleton_mnist_softmax_classifier(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if not weekly: - e = 'test_unit_mnist_softmax_classifier: Not doing weekly testing' - print('SKIP - ' + e) - pytest.skip(e) - - if compiler_name not in executables: - e = 'skeleton_mnist_softmax_classifier: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/mnist_softmax_classifier_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/mnist_softmax_classifier_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - num_processes=1, dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', - data_reader_percent=data_reader_percent, - model_folder='tests', model_name='mnist_softmax_classifier', - optimizer_name='adam', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_mnist_softmax_classifier_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_mnist_softmax_classifier_gcc7(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'gcc7', - weekly, data_reader_percent) - - -def test_unit_mnist_softmax_classifier_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_softmax_classifier.py -k 'test_unit_mnist_softmax_classifier_exe' --exe= -def test_unit_mnist_softmax_classifier_exe(cluster, dirname, exe, - weekly, data_reader_percent): - if exe is None: - e = 'test_unit_mnist_softmax_classifier_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_mnist_softmax_classifier(cluster, exes, dirname, 'exe', - weekly, data_reader_percent) diff --git a/model_zoo/tests/layer_tests/model_softmax.prototext b/model_zoo/tests/layer_tests/model_softmax.prototext deleted file mode 100644 index 5e6891cb2ef..00000000000 --- a/model_zoo/tests/layer_tests/model_softmax.prototext +++ /dev/null @@ -1,99 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - 
################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 1.987 - lower_bound: 1.986 - upper_bound: 1.988 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-4 -2 0 1 2" - } - } - } - - # Variations of softmax layer - layer { - parents: "x" - name: "softmax_model_parallel" - softmax {} - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "softmax_data_parallel" - softmax {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "softmax_model_parallel softmax_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/model_mnist_conv_graph.prototext b/model_zoo/tests/model_mnist_conv_graph.prototext deleted file mode 100644 index 9ef468864c3..00000000000 --- a/model_zoo/tests/model_mnist_conv_graph.prototext +++ /dev/null @@ -1,238 +0,0 @@ -trainer { - hydrogen_block_size: 257 -} -model { - data_layout: "data_parallel" - mini_batch_size: 31 - num_epochs: 4 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "cross_entropy" } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - # data - layer { - name: "data" - children: "images labels" - data_layout: "data_parallel" - input {} - } - layer { - name: "images" - parents: "data" - data_layout: "data_parallel" - identity {} - } - layer { - name: "labels" - parents: "data" - data_layout: "model_parallel" - identity {} - } - - # conv1 - layer { - parents: "images" - name: "conv1" - convolution { - num_dims: 2 - num_output_channels: 29 - conv_dims_i: 7 - conv_pads_i: 0 - conv_strides_i: 2 - has_bias: true - } - data_layout: "data_parallel" - } - layer { - parents: "conv1" - name: "conv1_pool" - pooling { - num_dims: 2 - pool_dims_i: 3 - pool_pads_i: 1 - pool_strides_i: 2 - pool_mode: "average" - } - data_layout: "data_parallel" - } - - # branch1 - layer { - parents: "conv1_pool" - name: "branch1_conv1" - convolution { - num_dims: 2 - num_output_channels: 10 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: true - } - data_layout: "data_parallel" - } - - # branch2 - layer { - parents: "conv1_pool" - name: "branch2_conv1" - convolution { - num_dims: 2 - num_output_channels: 13 - conv_dims_i: 1 - conv_pads_i: 0 - conv_strides_i: 1 - has_bias: false - } - data_layout: "data_parallel" - } - layer { - parents: "branch2_conv1" - name: "branch2_bn1" - data_layout: "data_parallel" - batch_normalization { - decay: 0.9 
- scale_init: 1.0 - bias_init: 0.0 - epsilon: 1e-5 - } - } - layer { - parents: "branch2_bn1" - name: "branch2_conv2" - data_layout: "data_parallel" - convolution { - num_dims: 2 - num_output_channels: 10 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: true - } - } - - # branch3 - layer { - parents: "conv1_pool" - name: "branch3_slice" - children: "branch3_conv1 branch3_conv2" - data_layout: "data_parallel" - slice { - axis: 1 - slice_points: "0 4 6" - } - } - weights { - name: "branch3_conv_kernel" - initializer { - glorot_uniform_initializer {} - } - } - weights { - name: "branch3_conv_bias" - initializer { - constant_initializer {} - } - } - layer { - parents: "branch3_slice" - name: "branch3_conv1" - data_layout: "data_parallel" - weights: "branch3_conv_kernel branch3_conv_bias" - convolution { - num_dims: 2 - num_output_channels: 10 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: true - } - } - layer { - parents: "branch3_slice" - name: "branch3_conv2" - data_layout: "data_parallel" - weights: "branch3_conv_kernel branch3_conv_bias" - convolution { - num_dims: 2 - num_output_channels: 10 - conv_dims_i: 3 - conv_pads_i: 1 - conv_strides_i: 1 - has_bias: true - } - } - layer { - parents: "branch3_conv1 branch3_conv2" - name: "branch3_concat" - data_layout: "data_parallel" - concatenation { - axis: 1 - } - } - - # sum - layer { - parents: "branch1_conv1 branch2_conv2 branch3_concat" - name: "sum" - data_layout: "data_parallel" - sum {} - } - - # prob - layer { - parents: "sum" - name: "prob_pool" - pooling { - num_dims: 2 - pool_dims_i: 6 - pool_pads_i: 0 - pool_strides_i: 1 - pool_mode: "average" - } - data_layout: "data_parallel" - } - layer { - parents: "prob_pool" - name: "prob_flat" - reshape { dims: "-1" } - data_layout: "data_parallel" - } - layer { - parents: "prob_flat" - name: "prob" - softmax {} - data_layout: "data_parallel" - } - - # cross_entropy - layer { - name: "cross_entropy" - parents: "prob labels" - data_layout: "model_parallel" - cross_entropy {} - } - -} diff --git a/model_zoo/tests/model_mnist_ridge_regression.prototext b/model_zoo/tests/model_mnist_ridge_regression.prototext deleted file mode 100644 index 8670847404b..00000000000 --- a/model_zoo/tests/model_mnist_ridge_regression.prototext +++ /dev/null @@ -1,77 +0,0 @@ -trainer { - hydrogen_block_size: 257 -} -model { - data_layout: "data_parallel" - mini_batch_size: 131 - num_epochs: 4 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "mse" } - l2_weight_regularization { - scale_factor: 0.01 - } - } - - ################################################### - # Metrics - ################################################### - - metric { layer_metric { layer: "mse" } } - - ################################################### - # Callbacks - ################################################### - callback { print {} } - callback { timer {} } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - children: "image label" - data_layout: "data_parallel" - input {} - } - layer { - parents: "data" - name: "image" - data_layout: "model_parallel" - split {} - } - layer { - parents: "data" - name: "label" - data_layout: "model_parallel" - split {} - } - layer { - 
parents: "image" - name: "fc" - data_layout: "model_parallel" - fully_connected { - num_neurons: 10 - has_bias: true - } - } - layer { - parents: "fc label" - name: "mse" - data_layout: "model_parallel" - mean_squared_error {} - } - -} diff --git a/model_zoo/tests/model_mnist_softmax_classifier.prototext b/model_zoo/tests/model_mnist_softmax_classifier.prototext deleted file mode 100644 index 941705fbbca..00000000000 --- a/model_zoo/tests/model_mnist_softmax_classifier.prototext +++ /dev/null @@ -1,87 +0,0 @@ -trainer { - hydrogen_block_size: 199 -} -model { - data_layout: "data_parallel" - mini_batch_size: 103 - num_epochs: 4 - - ################################################### - # Objective function - ################################################### - - objective_function { - layer_term { layer: "cross_entropy" } - } - - ################################################### - # Metrics - ################################################### - - metric { layer_metric { layer: "accuracy" } } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - children: "image label" - data_layout: "data_parallel" - input {} - } - layer { - parents: "data" - name: "image" - data_layout: "model_parallel" - split {} - } - layer { - parents: "data" - name: "label" - data_layout: "model_parallel" - split {} - } - layer { - parents: "image" - name: "fc" - data_layout: "model_parallel" - fully_connected { - num_neurons: 10 - has_bias: false - } - } - layer { - parents: "fc" - name: "prob" - data_layout: "model_parallel" - softmax {} - } - layer { - parents: "prob label" - name: "cross_entropy" - data_layout: "model_parallel" - cross_entropy {} - } - layer { - parents: "prob label" - name: "accuracy" - data_layout: "model_parallel" - categorical_accuracy {} - } - -} From 173811559d100ca0511afa37b41c619bf2cf731e Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 18 Oct 2019 15:58:28 -0700 Subject: [PATCH 353/634] Support "optional" message fields in Protobuf API (#1292) To make a message field optional, use a wrapper message instead of a primitive type. The Python frontend has been updated to handle messages that have wrapper messages as fields. --- python/lbann/util/class_generator.py | 62 ++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/python/lbann/util/class_generator.py b/python/lbann/util/class_generator.py index 62a5f5b9321..4e77d20f842 100644 --- a/python/lbann/util/class_generator.py +++ b/python/lbann/util/class_generator.py @@ -1,15 +1,20 @@ """Utility functions to generate classes from Protobuf messages.""" import google.protobuf.descriptor +import google.protobuf.wrappers_pb2 from lbann import lbann_pb2, callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, weights_pb2 +from lbann.util import make_iterable -# Map from Protobuf label enums to strings -_proto_label_to_str = { +# Each field in a Protobuf message is labeled as 'optional', +# 'required', or 'repeated' +# Note: 'optional' is not used in Protobuf 3. 
+_protobuf_field_label_names = { google.protobuf.descriptor.FieldDescriptor.LABEL_OPTIONAL: 'optional', google.protobuf.descriptor.FieldDescriptor.LABEL_REQUIRED: 'required', google.protobuf.descriptor.FieldDescriptor.LABEL_REPEATED: 'repeated' } -# Map from Protobuf type enums to strings -_proto_type_to_str = { + +# Each field in a Protobuf message has a type, e.g. float, int64 +_protobuf_field_type_names = { google.protobuf.descriptor.FieldDescriptor.TYPE_BOOL: 'bool', google.protobuf.descriptor.FieldDescriptor.TYPE_BYTES: 'bytes', google.protobuf.descriptor.FieldDescriptor.TYPE_DOUBLE: 'double', @@ -30,6 +35,25 @@ google.protobuf.descriptor.FieldDescriptor.TYPE_UINT64: 'uint64' } +# Wrapper Protobuf messages for primitive types +# Note: Protobuf 3 does not support optional message fields with +# primitive types. If a primitive field is not set, its value is +# "zero" (false for bool, empty string for string, etc). We need to +# use these wrapper messages to distinguish between values that are +# "zero" and values that are not set. +_protobuf_type_wrappers = ( + google.protobuf.wrappers_pb2.DoubleValue.DESCRIPTOR, + google.protobuf.wrappers_pb2.FloatValue.DESCRIPTOR, + google.protobuf.wrappers_pb2.Int64Value.DESCRIPTOR, + google.protobuf.wrappers_pb2.Int64Value.DESCRIPTOR, + google.protobuf.wrappers_pb2.UInt64Value.DESCRIPTOR, + google.protobuf.wrappers_pb2.Int32Value.DESCRIPTOR, + google.protobuf.wrappers_pb2.UInt32Value.DESCRIPTOR, + google.protobuf.wrappers_pb2.BoolValue.DESCRIPTOR, + google.protobuf.wrappers_pb2.StringValue.DESCRIPTOR, + google.protobuf.wrappers_pb2.BytesValue.DESCRIPTOR +) + def _generate_class(message_descriptor, base_field_name, base_class, @@ -58,7 +82,8 @@ class `__init__` method. # Names of Protobuf message and its fields message_name = message_descriptor.name - field_names = message_descriptor.fields_by_name.keys() + field_descriptors = message_descriptor.fields_by_name + field_names = field_descriptors.keys() # Make sure fields in generated and base classes are distinct for arg in base_kwargs: @@ -111,13 +136,24 @@ def export_proto(self): message = proto # Set message - for field in field_names: - val = getattr(self, field) + for field_name in field_names: + val = getattr(self, field_name) if val is not None: - if type(val) is list: - getattr(message, field).extend(val) - else: - setattr(message, field, val) + try: + field = getattr(message, field_name) + field_descriptor = field_descriptors[field_name] + if field_descriptor.message_type in _protobuf_type_wrappers: + field.SetInParent() + field.value = val + elif field_descriptor.label == google.protobuf.descriptor.FieldDescriptor.LABEL_REPEATED: + field.extend(make_iterable(val)) + else: + setattr(message, field_name, val) + except: + raise TypeError('{} is invalid type for {}.{}' + .format(type(val).__name__, + self.__class__.__name__, + field_name)) # Return Protobuf message return proto @@ -132,8 +168,8 @@ def get_field_names(self): for field in message_descriptor.fields: doc += ' {0} ({1} {2})\n'.format( field.name, - _proto_label_to_str.get(field.label, 'unknown'), - _proto_type_to_str.get(field.type, 'unknown')) + _protobuf_field_label_names.get(field.label, 'unknown'), + _protobuf_field_type_names.get(field.type, 'unknown')) else: doc = 'Fields: none\n' From 927f9d935c6a4450830a38111ab5c2e60785b7e5 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Fri, 18 Oct 2019 16:03:46 -0700 Subject: [PATCH 354/634] Adding ATOM model and documentation to applications (#1312) * Adding ATOM model and documentation 
to applications

* Adding ATOM model and documentation to applications
---
 applications/ATOM/README.md              |  43 ++++++
 applications/ATOM/train_atom_char_rnn.py | 173 +++++++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 applications/ATOM/train_atom_char_rnn.py

diff --git a/applications/ATOM/README.md b/applications/ATOM/README.md
index a752d534160..31e78420ef1 100644
--- a/applications/ATOM/README.md
+++ b/applications/ATOM/README.md
@@ -1,3 +1,46 @@
 ## Accelerating Therapeutics for Opportunities in Medicine (ATOM)
 
 Models for training neural networks to support the [ATOM](https://atomscience.org) project
+
+The train_atom_char_rnn.py script implements a GRU-based recurrent model for generating new SMILES strings.
+The original neural network model and training hyperparameters are described in the [MOSES benchmark](https://github.com/samadejacobs/moses/tree/master/moses/char_rnn). Please see the LBANN documentation on how to install, build, and run LBANN.
+
+### How to train
+```bash
+python3 train_atom_char_rnn.py
+```
+
+Expected training output in LBANN (250K ZINC training dataset, on a single LLNL Pascal GPU) is shown:
+```
+--------------------------------------------------------------------------------
+[0] Epoch : stats formated [tr/v/te] iter/epoch = [3907/0/0]
+            global MB = [  64/   0/   0] global last MB = [  16  /   0  /   0  ]
+             local MB = [  64/   0/   0]  local last MB = [  16+0/   0+0/   0+0]
+--------------------------------------------------------------------------------
+model0 (instance 0) training epoch 0 objective function : 0.438031
+model0 (instance 0) training epoch 0 run time : 1009.55s
+model0 (instance 0) training epoch 0 mini-batch time statistics : 0.257328s mean, 1.89938s max, 0.15177s min, 0.0331048s stdev
+--------------------------------------------------------------------------------
+[1] Epoch : stats formated [tr/v/te] iter/epoch = [3907/0/0]
+            global MB = [  64/   0/   0] global last MB = [  16  /   0  /   0  ]
+             local MB = [  64/   0/   0]  local last MB = [  16+0/   0+0/   0+0]
+--------------------------------------------------------------------------------
+model0 (instance 0) training epoch 1 objective function : 0.37321
+model0 (instance 0) training epoch 1 run time : 1006.6s
+model0 (instance 0) training epoch 1 mini-batch time statistics : 0.256573s mean, 0.912742s max, 0.158709s min, 0.0193512s stdev
+```
+
+### Inference and Sampling
+
+1. Clone this version of the [MOSES benchmark repository](https://github.com/samadejacobs/moses) and follow its installation instructions
+2. Run inference using pretrained LBANN model parameters:
+
+```bash
+
+   python3 MOSES_DIR/scripts/run.py --model char_rnn --n_samples NUM_SAMPLES \
+          --lbann_weights_dir LBANN_WEIGHTS_DIR \
+          --lbann_epoch_counts EPOCHS
+
+```
+
+The command above will load pre-trained LBANN weights and biases from LBANN_WEIGHTS_DIR at the specified EPOCH counts, generate up to NUM_SAMPLES new molecules, and calculate metrics on the new molecules, some of them relative to the test (validation) dataset.
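The training script added next pads or truncates every tokenized SMILES sample to a fixed length with a dedicated padding token, so LBANN always sees constant-size input tensors. A minimal standalone sketch of that convention (the fixed length 57 and the sample tokens here are illustrative assumptions; the script itself derives the length from the first sample):

```python
import numpy as np

PAD_IDX = 28  # padding token index used by the training script
DIMS = 57     # assumed fixed sample length; the script uses len(samples[0])

def pad_or_truncate(sample):
    """Pad short token sequences with PAD_IDX; truncate long ones to DIMS."""
    sample = np.asarray(sample)
    if len(sample) < DIMS:
        return np.concatenate((sample, np.full(DIMS - len(sample), PAD_IDX)))
    return np.resize(sample, DIMS)

print(pad_or_truncate([12, 3, 7]))  # -> [12  3  7 28 28 ... 28], length 57
```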
diff --git a/applications/ATOM/train_atom_char_rnn.py b/applications/ATOM/train_atom_char_rnn.py new file mode 100644 index 00000000000..f5751dab21a --- /dev/null +++ b/applications/ATOM/train_atom_char_rnn.py @@ -0,0 +1,173 @@ +import numpy as np +from math import sqrt + +# Data paths +data_dir = '/p/lustre2/brainusr/datasets/zinc/moses_zinc_train250K.npy' +samples = np.load(data_dir, allow_pickle=True) + +dims = len(samples[0]) + + +pad_indx = 28 +# Sample access functions +def get_sample(index): + sample = samples[index] + if len(sample) < dims: + sample = np.concatenate((sample, np.full(dims-len(sample), pad_indx))) + else: + sample = np.resize(sample, dims) + return sample + +def num_samples(): + return samples.shape[0] + +def sample_dims(): + return [dims] + +def str_list(l): + return ' '.join([str(i) for i in l]) +# ============================================== +# Setup and launch experiment +# ============================================== + +def construct_model(): + """Construct LBANN model. + + Initial model for ATOM molecular SMILES generation + Network architecture and training hyperparameters from + https://github.com/samadejacobs/moses/tree/master/moses/char_rnn + + """ + import lbann + import lbann.modules + + sequence_length = sample_dims()[0] + data_layout = 'data_parallel' + + # Layer graph + input = lbann.Input(name='inp_tensor') + x_slice = lbann.Slice( + input, + axis=0, + slice_points=str_list(range(sequence_length+1)), + device='CPU', + name='inp_slice' + ) + + #embedding layer + emb = [] + embedding_size=30 + dictionary_size=30 + + emb_weights = lbann.Weights( + initializer=lbann.NormalInitializer(mean=0, standard_deviation=1), + name='emb_matrix' + ) + + lstm1 = lbann.modules.GRU(size=768, data_layout=data_layout) + fc = lbann.modules.FullyConnectedModule(size=dictionary_size, data_layout=data_layout) + + + last_output = lbann.Constant(value=0.0, + num_neurons='768', + data_layout=data_layout, + name='lstm_init_output') + + lstm1_prev_state = [last_output] + + + gt = lbann.Constant(value=0, num_neurons='57') + loss= [] + idl = [] + for i in range(sequence_length): + idl.append(lbann.Identity(x_slice, name='slice_idl_'+str(i), device='CPU')) + + for i in range(sequence_length-1): + emb_l = lbann.Embedding( + idl[i], + dictionary_size=dictionary_size, + embedding_size=embedding_size, + name='emb_'+str(i), + device='CPU', + weights=emb_weights + ) + + x,lstm1_prev_state = lstm1(emb_l,lstm1_prev_state) + fc_l = fc(x) + y_soft = lbann.Softmax(fc_l, name='soft_'+str(i)) + gt = lbann.OneHot(idl[i+1], size=dictionary_size) + ce = lbann.CrossEntropy([y_soft,gt],name='loss_'+str(i)) + #mask padding in input + pad_mask = lbann.NotEqual([idl[i],lbann.Constant(value=pad_indx,num_neurons='1')],device='CPU') + ce_mask = lbann.Multiply([pad_mask,ce],name='loss_mask_'+str(i)) + loss.append(lbann.LayerTerm(ce_mask, scale=1/(sequence_length-1))) + + + layers = list(lbann.traverse_layer_graph(input)) + # Setup objective function + weights = set() + for l in layers: + weights.update(l.weights) + obj = lbann.ObjectiveFunction(loss) + + + callbacks = [lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackStepLearningRate(step=10, amt=0.5), + lbann.CallbackDumpWeights(basename="weights")] + + # Construct model + mini_batch_size = 64 + num_epochs = 50 + return lbann.Model(mini_batch_size, + num_epochs, + weights=weights, + layers=layers, + objective_function=obj, + callbacks=callbacks) + +def construct_data_reader(): + """Construct Protobuf message for Python data reader. 
+
+    The Python data reader will import this Python file to access the
+    sample access functions.
+
+    """
+    import os.path
+    import lbann
+    module_file = os.path.abspath(__file__)
+    module_name = os.path.splitext(os.path.basename(module_file))[0]
+    module_dir = os.path.dirname(module_file)
+
+    # Base data reader message
+    message = lbann.reader_pb2.DataReader()
+
+    # Training set data reader
+    data_reader = message.reader.add()
+    data_reader.name = 'python'
+    data_reader.role = 'train'
+    data_reader.shuffle = True
+    data_reader.percent_of_data_to_use = 1.0
+    data_reader.python.module = module_name
+    data_reader.python.module_dir = module_dir
+    data_reader.python.sample_function = 'get_sample'
+    data_reader.python.num_samples_function = 'num_samples'
+    data_reader.python.sample_dims_function = 'sample_dims'
+
+    return message
+
+if __name__ == '__main__':
+    import lbann
+    import lbann.contrib.lc.launcher
+    trainer = lbann.Trainer()
+    model = construct_model()
+    opt = lbann.Adam(learn_rate=0.001,beta1=0.9,beta2=0.99,eps=1e-8)
+    data_reader = construct_data_reader()
+    status = lbann.contrib.lc.launcher.run(
+        trainer, model, data_reader, opt,
+        account='hpcdl',
+        scheduler='slurm',
+        time_limit=1440,
+        nodes=1,
+        job_name='atom_char_rnn_250K')
+    print(status)

From 09bb6e90c0dd978b6c0574f4d9a5c31f29968c51 Mon Sep 17 00:00:00 2001
From: Tom Benson <30674819+benson31@users.noreply.github.com>
Date: Fri, 18 Oct 2019 16:25:59 -0700
Subject: [PATCH 355/634] add a hacky workaround to get OSX build to go through. (#1310)

---
 CMakeLists.txt | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 41e2cde1704..cfa4120c6ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -535,6 +535,29 @@ endif ()
 
 target_link_libraries(lbann PUBLIC ${DL_LIBRARY})
 
+# Fix the -g issue with Clang on OSX
+if (APPLE)
+  # Remove -g from the options
+  string(REPLACE "-g" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-g" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
+
+  # Get all the sources and add "-g" to all of them.
+  get_target_property(_LBANN_SRCS lbann SOURCES)
+  set_source_files_properties(${_LBANN_SRCS}
+    PROPERTIES COMPILE_OPTIONS "-g")
+
+  # Cleanup bad files
+  list(APPEND BAD_FILES
+    "${CMAKE_SOURCE_DIR}/src/layers/loss/cross_entropy.cpp")
+  foreach (bad_file IN LISTS BAD_FILES)
+    get_source_file_property(
+      _SRC_COMPILE_OPTS "${bad_file}" COMPILE_OPTIONS)
+    string(REPLACE "-g" "" _SRC_COMPILE_OPTS "${_SRC_COMPILE_OPTS}")
+    set_source_files_properties(
+      "${bad_file}" PROPERTIES COMPILE_OPTIONS "${_SRC_COMPILE_OPTS}")
+  endforeach ()
+endif ()
+
 # Clean things up
 include(LBANNDebugUtilities)
 lbann_remove_default_include_paths_from_all_subtargets(lbann)

From 2327c34fde8c753a9301eb45084fe1d755f5dc64 Mon Sep 17 00:00:00 2001
From: davidHysom
Date: Mon, 21 Oct 2019 08:20:15 -0700
Subject: [PATCH 356/634] Memory optimization for data_reader_numpy_npz_conduit
 (#1293)

* Initial commit.
* compiles on pascal with gnu; passes testing --- include/lbann/data_readers/CMakeLists.txt | 1 + .../data_reader_numpy_npz_conduit.hpp | 13 ++- .../data_readers/numpy_conduit_converter.hpp | 71 ---------------- src/data_readers/CMakeLists.txt | 1 - .../data_reader_numpy_npz_conduit.cpp | 81 +++++++++++++++++-- src/data_readers/numpy_conduit_converter.cpp | 72 ----------------- 6 files changed, 87 insertions(+), 152 deletions(-) delete mode 100644 include/lbann/data_readers/numpy_conduit_converter.hpp delete mode 100644 src/data_readers/numpy_conduit_converter.cpp diff --git a/include/lbann/data_readers/CMakeLists.txt b/include/lbann/data_readers/CMakeLists.txt index 3743728b55a..31bb8ee711d 100644 --- a/include/lbann/data_readers/CMakeLists.txt +++ b/include/lbann/data_readers/CMakeLists.txt @@ -12,6 +12,7 @@ set_full_path(THIS_DIR_HEADERS data_reader_nci.hpp data_reader_numpy.hpp data_reader_numpy_npz.hpp + data_reader_numpy_npz_conduit.hpp data_reader_pilot2_molecular.hpp data_reader_python.hpp data_reader_synthetic.hpp diff --git a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp index 09aeca29498..dbaec7d043c 100644 --- a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp +++ b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp @@ -30,6 +30,7 @@ #define LBANN_DATA_READER_NUMPY_NPZ_CONDUIT_HPP #include "lbann/data_readers/data_reader.hpp" +#include "conduit/conduit.hpp" #include namespace lbann { @@ -37,7 +38,8 @@ namespace lbann { * Data reader for data stored in numpy (.npz) files that are encapsulated . * in conduit::Nodes */ - class numpy_npz_conduit_reader : public generic_data_reader { +class numpy_npz_conduit_reader : public generic_data_reader { + public: numpy_npz_conduit_reader(const bool shuffle); // These need to be explicit because of some issue with the cnpy copy @@ -107,7 +109,14 @@ namespace lbann { std::vector m_filenames; bool load_numpy_npz_from_file(const std::unordered_set &data_ids, std::unordered_set& label_classes); - }; + + void load_conduit_node(const std::string filename, int data_id, conduit::Node &output, bool reset = true); + + std::unordered_map> m_npz_cache; + + void load_npz(const std::string filename, int data_id, conduit::Node &node); + +}; } // namespace lbann diff --git a/include/lbann/data_readers/numpy_conduit_converter.hpp b/include/lbann/data_readers/numpy_conduit_converter.hpp deleted file mode 100644 index 32317487043..00000000000 --- a/include/lbann/data_readers/numpy_conduit_converter.hpp +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. 
See the License for the specific language governing -// permissions and limitations under the license. -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef NUMPY_CONDUIT_CONVERTER_HPP -#define NUMPY_CONDUIT_CONVERTER_HPP - -#include "lbann_config.hpp" -#include "conduit/conduit.hpp" - -namespace lbann { - -/** - * The numpy_conduit_converter class contains static method(s) for - * reading numpy files and copying the contents to a conduit file. - * - * In general the schema for npz files, after conversion to conduit, is: - * - * { - * data_id (int) : - * // one or more of the following sections - * { - * section_name : - * { - * "word_size": , - * "fortran_order: <0|1>, - * "num_vals": , - * "shape": <[ vector ]>, - * "data": - * } - * } - * } - * - * cosmoflow has the following sections: - * "data": - * "frm": - * "responses": - */ - -class numpy_conduit_converter { - public: - - static void load_conduit_node(const std::string filename, int data_id, conduit::Node &output, bool reset_conduit_node = true); - -}; - -} // namespace lbann - -#endif // NUMPY_CONDUIT_CONVERTER_HPP diff --git a/src/data_readers/CMakeLists.txt b/src/data_readers/CMakeLists.txt index 53f5bca779c..792ee42b003 100644 --- a/src/data_readers/CMakeLists.txt +++ b/src/data_readers/CMakeLists.txt @@ -20,7 +20,6 @@ set_full_path(THIS_DIR_SOURCES data_reader_multihead_siamese.cpp data_reader_python.cpp offline_patches_npz.cpp - numpy_conduit_converter.cpp data_reader_numpy_npz_conduit.cpp ) diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp b/src/data_readers/data_reader_numpy_npz_conduit.cpp index 39699da06df..f5c6b5d049f 100644 --- a/src/data_readers/data_reader_numpy_npz_conduit.cpp +++ b/src/data_readers/data_reader_numpy_npz_conduit.cpp @@ -27,7 +27,6 @@ #include "lbann/data_readers/data_reader_numpy_npz_conduit.hpp" #include "lbann/data_store/data_store_conduit.hpp" -#include "lbann/data_readers/numpy_conduit_converter.hpp" #include #include "lbann/utils/file_utils.hpp" // pad() #include "lbann/utils/jag_utils.hpp" // read_filelist(..) 
TODO should be move to file_utils @@ -191,7 +190,7 @@ void numpy_npz_conduit_reader::preload_data_store() { } conduit::Node node; - numpy_conduit_converter::load_conduit_node(m_filenames[data_id], data_id, node); + load_npz(m_filenames[data_id], data_id, node); const char *char_ptr = node[LBANN_DATA_ID_STR(data_id) + "/frm/data"].value(); const int* label_ptr = reinterpret_cast(char_ptr); label_classes.insert(*label_ptr); @@ -261,7 +260,7 @@ void numpy_npz_conduit_reader::preload_data_store() { bool numpy_npz_conduit_reader::load_numpy_npz_from_file(const std::unordered_set &data_ids, std::unordered_set &label_classes) { for (auto data_id : data_ids) { conduit::Node node; - numpy_conduit_converter::load_conduit_node(m_filenames[data_id], data_id, node); + load_conduit_node(m_filenames[data_id], data_id, node); const char *char_ptr = node[LBANN_DATA_ID_STR(data_id) + "/frm/data"].value(); const int* label_ptr = reinterpret_cast(char_ptr); label_classes.insert(*label_ptr); @@ -277,7 +276,7 @@ bool numpy_npz_conduit_reader::fetch_datum(Mat& X, int data_id, int mb_idx) { const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id); node.set_external(ds_node); } else { - numpy_conduit_converter::load_conduit_node(m_filenames[data_id], data_id, node); + load_npz(m_filenames[data_id], data_id, node); //note: if testing, and test set is touched more than once, the following // will through an exception TODO: relook later const auto& c = static_cast(m_model->get_execution_context()); @@ -352,7 +351,7 @@ bool numpy_npz_conduit_reader::fetch_response(Mat& Y, int data_id, int mb_idx) { const conduit::Node& ds_node = m_data_store->get_conduit_node(data_id); node.set_external(ds_node); } else { - numpy_conduit_converter::load_conduit_node(m_filenames[data_id], data_id, node); + load_npz(m_filenames[data_id], data_id, node); if (priming_data_store()) { m_data_store->set_conduit_node(data_id, node); } else { @@ -398,7 +397,7 @@ void numpy_npz_conduit_reader::fill_in_metadata() { int data_id = 0; //meaningless conduit::Node node; - numpy_conduit_converter::load_conduit_node(m_filenames[my_file], data_id, node); + load_npz(m_filenames[my_file], data_id, node); //fill in m_data_dims auto shape = node[LBANN_DATA_ID_STR(data_id) + "/data/shape"].as_uint64_array(); @@ -447,4 +446,74 @@ void numpy_npz_conduit_reader::fill_in_metadata() { } } +void numpy_npz_conduit_reader::load_conduit_node(const std::string filename, int data_id, conduit::Node &output, bool reset) { + + try { + if (reset) { + output.reset(); + } + + std::vector shape; + std::map a = cnpy::npz_load(filename); + + for (auto &&t : a) { + cnpy::NpyArray &b = t.second; + if (b.shape[0] != 1) { + LBANN_ERROR("lbann currently only supports one sample per npz file; this file appears to contain " + std::to_string(b.shape[0]) + " samples; (", filename); + } + output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/word_size"] = b.word_size; + output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/fortran_order"] = b.fortran_order; + output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/num_vals"] = b.num_vals; + output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/shape"] = b.shape; + + if (b.data_holder->size() / b.word_size != b.num_vals) { + LBANN_ERROR("b.data_holder->size() / b.word_size (" + std::to_string(b.data_holder->size()) + " / " + std::to_string(b.word_size) + ") != b.num_vals (" + std::to_string(b.num_vals)); + } + + // conduit makes a copy of the data, hence owns the data, hence it + // will be properly deleted when then 
conduit::Node is deleted + char *data = b.data_holder->data(); + output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/data"].set_char_ptr(data, b.word_size*b.num_vals); + } + } catch (...) { + //note: npz_load throws std::runtime_error, but I don't want to assume + // that won't change in the future + LBANN_ERROR("failed to open " + filename + " during cnpy::npz_load"); + } +} + +void numpy_npz_conduit_reader::load_npz(const std::string filename, int data_id, conduit::Node &output) { + + try { + output.reset(); + + std::vector shape; + m_npz_cache[data_id] = cnpy::npz_load(filename); + std::map &a = m_npz_cache[data_id]; + + for (auto &&t : a) { + cnpy::NpyArray &b = t.second; + if (b.shape[0] != 1) { + LBANN_ERROR("lbann currently only supports one sample per npz file; this file appears to contain " + std::to_string(b.shape[0]) + " samples; (", filename); + } + output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/word_size"] = b.word_size; + output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/fortran_order"] = b.fortran_order; + output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/num_vals"] = b.num_vals; + output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/shape"] = b.shape; + + if (b.data_holder->size() / b.word_size != b.num_vals) { + LBANN_ERROR("b.data_holder->size() / b.word_size (" + std::to_string(b.data_holder->size()) + " / " + std::to_string(b.word_size) + ") != b.num_vals (" + std::to_string(b.num_vals)); + } + + conduit::uint8 *data = reinterpret_cast(b.data_holder->data()); + output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/data"].set_external_uint8_ptr(data, b.word_size*b.num_vals); + } + } catch (...) { + //note: npz_load throws std::runtime_error, but I don't want to assume + // that won't change in the future + LBANN_ERROR("failed to open " + filename + " during cnpy::npz_load"); + } +} + + } // namespace lbann diff --git a/src/data_readers/numpy_conduit_converter.cpp b/src/data_readers/numpy_conduit_converter.cpp deleted file mode 100644 index bc3e1dedff6..00000000000 --- a/src/data_readers/numpy_conduit_converter.cpp +++ /dev/null @@ -1,72 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. -// Produced at the Lawrence Livermore National Laboratory. -// Written by the LBANN Research Team (B. Van Essen, et al.) listed in -// the CONTRIBUTORS file. -// -// LLNL-CODE-697807. -// All rights reserved. -// -// This file is part of LBANN: Livermore Big Artificial Neural Network -// Toolkit. For details, see http://software.llnl.gov/LBANN or -// https://github.com/LLNL/LBANN. -// -// Licensed under the Apache License, Version 2.0 (the "Licensee"); you -// may not use this file except in compliance with the License. You may -// obtain a copy of the License at: -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the license. 
-// -//////////////////////////////////////////////////////////////////////////////// - -#include "lbann/data_readers/numpy_conduit_converter.hpp" -#include "lbann/utils/exception.hpp" -#include "lbann/data_store/data_store_conduit.hpp" -#include - -namespace lbann { - -//static -void numpy_conduit_converter::load_conduit_node(const std::string filename, int data_id, conduit::Node &output, bool reset) { - - try { - if (reset) { - output.reset(); - } - - std::vector shape; - std::map a = cnpy::npz_load(filename); - - for (auto &&t : a) { - cnpy::NpyArray &b = t.second; - if (b.shape[0] != 1) { - LBANN_ERROR("lbann currently only supports one sample per npz file; this file appears to contain " + std::to_string(b.shape[0]) + " samples; (", filename); - } - output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/word_size"] = b.word_size; - output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/fortran_order"] = b.fortran_order; - output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/num_vals"] = b.num_vals; - output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/shape"] = b.shape; - - if (b.data_holder->size() / b.word_size != b.num_vals) { - LBANN_ERROR("b.data_holder->size() / b.word_size (" + std::to_string(b.data_holder->size()) + " / " + std::to_string(b.word_size) + ") != b.num_vals (" + std::to_string(b.num_vals)); - } - - // conduit makes a copy of the data, hence owns the data, hence it - // will be properly deleted when then conduit::Node is deleted - char *data = b.data_holder->data(); - output[LBANN_DATA_ID_STR(data_id) + "/" + t.first + "/data"].set_char_ptr(data, b.word_size*b.num_vals); - } - } catch (...) { - //note: npz_load throws std::runtime_error, but I don't want to assume - // that won't change in the future - LBANN_ERROR("failed to open " + filename + " during cnpy::npz_load"); - } -} - -} // end of namespace lbann From 3ff2c0f14ab60204ddc6e8bee7eeb83b8210d40a Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Mon, 21 Oct 2019 14:35:21 -0700 Subject: [PATCH 357/634] Fix model copy issue (#1311) * fix an issue where null pointers could be dereferenced --- include/lbann/layers/io/input/generic_input_layer.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index 7dac4dad4cd..b594109ee0f 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -126,7 +126,7 @@ class generic_input_layer : public io_layer { io_buffer = io_buffer->copy(); } for (auto& dr : m_data_readers) { - dr.second = dr.second->copy(); + dr.second = dr.second ? dr.second->copy() : nullptr; } } @@ -136,7 +136,7 @@ class generic_input_layer : public io_layer { io_buffer = io_buffer->copy(); } for (auto& dr : m_data_readers) { - dr.second = dr.second->copy(); + dr.second = dr.second ? 
dr.second->copy() : nullptr; } return *this; } From ded858f7b4089b4ca64eebbf9281072c77c7c0cc Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 21 Oct 2019 16:58:26 -0700 Subject: [PATCH 358/634] Embedding layer with padding index (#1300) * Change embedding layer API to match PyTorch * Add pad index to embedding layer protobuf No actual functionality has been added yet * Separate decl and impl in embedding layer header * Add support for padding index in embedding layer * Handle padding index in protobuf parser for embedding layer * Update Bamboo unit test for embedding layer with padding index --- .../unit_tests/test_unit_layer_embedding.py | 163 +++++++++++++----- include/lbann/layers/learning/embedding.hpp | 137 +++++++++++++-- src/layers/learning/embedding.cpp | 87 +++------- src/proto/factories/layer_factory.cpp | 6 +- src/proto/layers.proto | 25 ++- src/weights/weights.cpp | 3 + 6 files changed, 299 insertions(+), 122 deletions(-) diff --git a/bamboo/unit_tests/test_unit_layer_embedding.py b/bamboo/unit_tests/test_unit_layer_embedding.py index e999eb2a3ea..86f4c203314 100644 --- a/bamboo/unit_tests/test_unit_layer_embedding.py +++ b/bamboo/unit_tests/test_unit_layer_embedding.py @@ -18,15 +18,16 @@ # the functions below to ingest data. # Data -dictionary_size = 7 -embedding_size = 5 -np.random.seed(4321) -embedding_array = np.random.normal(size=(dictionary_size,embedding_size)) +_num_samples = 41 +_num_embeddings = 7 # Sample access functions def get_sample(index): - np.random.seed(1234+index) - return [np.random.randint(dictionary_size)] + np.random.seed(2019101500+index) + i = np.random.randint(_num_embeddings) + if index in (1,2,4,7,17,31): + i = 0 + return [i] def num_samples(): return 41 def sample_dims(): @@ -57,49 +58,129 @@ def construct_model(lbann): """ - # Construct weights for embeddings - embedding_values = ' '.join([str(i) for i in np.nditer(embedding_array)]) - init = lbann.ValueInitializer(values=embedding_values) - w = lbann.Weights(optimizer=lbann.SGD(), initializer=init) - - # Layer graph - input = lbann.Input() - embedding = lbann.Embedding(input, - weights=w, - dictionary_size=dictionary_size, - embedding_size=embedding_size, - device='cpu') - l2_norm2 = lbann.L2Norm2(embedding) - layers = list(lbann.traverse_layer_graph(input)) - metric = lbann.Metric(l2_norm2, name='L2 norm squared') - obj = lbann.ObjectiveFunction(l2_norm2) - - # Compute expected value - metric_vals = [] + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1) + return np.inner(x, x) + + # Input data + x = lbann.Identity(lbann.Input()) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # No padding index + # ------------------------------------------ + + # Embeddings + np.random.seed(20191015) + embedding_dim = 5 + embeddings = np.random.normal(size=(_num_embeddings,embedding_dim)) + + # LBANN implementation + embedding_weights = lbann.Weights( + optimizer=lbann.SGD(), + initializer=lbann.ValueInitializer(values=str_list(np.nditer(embeddings))) + ) + x = x_lbann + y = lbann.Embedding(x, + weights=embedding_weights, + num_embeddings=_num_embeddings, + embedding_dim=embedding_dim, + device='cpu') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='no padding index')) + + # NumPy implementation + vals = [] for i 
in range(num_samples()): - input = get_sample(i) - embedding = embedding_array[int(input[0]), :] - l2_norm2 = np.inner(embedding, embedding) - metric_vals.append(l2_norm2) - expected_metric_value = np.mean(metric_vals) - tol = 8 * expected_metric_value * np.finfo(np.float32).eps - - # Initialize check metric callback - callbacks = [lbann.CallbackCheckMetric(metric='L2 norm squared', - lower_bound=expected_metric_value-tol, - upper_bound=expected_metric_value+tol, - error_on_failure=True, - execution_modes='test'), - lbann.CallbackCheckGradients(error_on_failure=True)] + x = get_sample(i)[0] + y = embeddings[x] + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Padding index 0 + # ------------------------------------------ + + # Embeddings + np.random.seed(201910152) + embedding_dim = 3 + padding_idx = 0 + embeddings = np.random.normal(size=(_num_embeddings,embedding_dim)) + + # LBANN implementation + # Note: Embedding layer gradients are not exact if a padding index + # is set. Avoid gradient checking by not using an optimizer. + embedding_weights = lbann.Weights( + optimizer=None, + initializer=lbann.ValueInitializer(values=str_list(np.nditer(embeddings))) + ) + x = x_lbann + y = lbann.Embedding(x, + weights=embedding_weights, + num_embeddings=_num_embeddings, + embedding_dim=embedding_dim, + padding_idx=padding_idx, + device='cpu') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='padding index = 0')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i)[0] + if x == padding_idx: + y = np.zeros(shape=embedding_dim, dtype=np.float32) + else: + y = embeddings[x] + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ # Construct model mini_batch_size = 17 num_epochs = 0 return lbann.Model(mini_batch_size, num_epochs, - layers=layers, + layers=lbann.traverse_layer_graph(x_lbann), objective_function=obj, - metrics=[metric], + metrics=metrics, callbacks=callbacks) def construct_data_reader(lbann): diff --git a/include/lbann/layers/learning/embedding.hpp b/include/lbann/layers/learning/embedding.hpp index 7e9af4d0062..9a53c4706e5 100644 --- a/include/lbann/layers/learning/embedding.hpp +++ b/include/lbann/layers/learning/embedding.hpp @@ -28,9 +28,23 @@ #define LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED #include "lbann/layers/layer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/memory.hpp" namespace lbann { +/** @brief Lookup table to vectors of fixed size. + * + * Takes a scalar input, interprets it as an index, and outputs the + * corresponding vector. The number of embedding vectors and the size + * of vectors are fixed. If the index is out-of-range, then the + * output is a vector of zeros. 
+ * + * The embedding vectors are stored in an + * @f$ \text{embedding_dim} \times \text{num_embeddings} @f$ + * weights matrix. Note that this is the transpose of the weights in + * the PyTorch embedding layer. + */ template class embedding_layer : public Layer { static_assert(Layout == data_layout::DATA_PARALLEL, @@ -39,13 +53,23 @@ class embedding_layer : public Layer { "embedding layer only supports CPU"); public: + /** + * @param comm LBANN communicator. + * @param num_embeddings Size of dictionary of embeddings. + * @param embedding_dim Size of embedding vectors. + * @param padding_idx If set, then the corresponding embedding + * vector is initialized with zeros. The + * objective function gradient w.r.t. this + * embedding vector is always zero. + */ embedding_layer(lbann_comm* comm, - El::Int dictionary_size, - El::Int embedding_size) + size_t num_embeddings, + size_t embedding_dim, + El::Int padding_idx=-1) : Layer(comm), - m_dictionary_size{dictionary_size}, - m_embedding_size{embedding_size} { - } + m_num_embeddings{num_embeddings}, + m_embedding_dim{embedding_dim}, + m_padding_idx{padding_idx} {} embedding_layer(const embedding_layer& other) = default; embedding_layer& operator=(const embedding_layer& other) = default; @@ -59,12 +83,7 @@ class embedding_layer : public Layer { data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } - description get_description() const override { - auto desc = Layer::get_description(); - desc.add("Dictionary size", m_dictionary_size); - desc.add("Embedding size", m_embedding_size); - return desc; - } + description get_description() const override; protected: @@ -77,12 +96,104 @@ class embedding_layer : public Layer { private: - El::Int m_dictionary_size; - El::Int m_embedding_size; + /** Size of dictionary of embeddings. */ + size_t m_num_embeddings; + /** Size of embedding vectors. */ + size_t m_embedding_dim; + /** If the padding index is set, then the corresponding embedding + * vector is initialized with zeros. The objective function + * gradient w.r.t. this embedding vector is always zero. + */ + El::Int m_padding_idx; + + /** Gradient w.r.t. embedding weights. */ StarMat m_dictionary_gradient; }; +// ========================================================= +// Implementation +// ========================================================= + +template +description embedding_layer::get_description() const { + auto desc = Layer::get_description(); + desc.add("Num embeddings", m_num_embeddings); + desc.add("Embedding dim", m_embedding_dim); + desc.add("Padding index", m_padding_idx); + return desc; +} + +template +void embedding_layer::setup_dims() { + Layer::setup_dims(); + + // Make sure input dimensions are valid + if (this->get_input_size() != 1) { + const auto& dims = this->get_input_dims(); + std::ostringstream dims_ss; + for (size_t i = 0; i < dims.size(); ++i) { + dims_ss << (i > 0 ? "x" : "") << dims[i]; + } + LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ", + "recieved an input tensor with invalid dimensions " + "(expected 1, got ",dims_ss.str(),")"); + } + + // Output is size of embedding vector + this->set_output_dims({static_cast(m_embedding_dim)}); + +} + +template +void embedding_layer::setup_data() { + Layer::setup_data(); + + // Construct default weights if needed + // Note: Randomly drawn from normal distribution with mean 0 and + // standard deviation 1. 
+ if (this->m_weights.empty()) { + auto w = make_unique(get_comm()); + auto init = make_unique(0,1); + auto opt = std::unique_ptr(m_model->create_optimizer()); + w->set_name(this->get_name() + "_weights"); + w->set_initializer(std::move(init)); + w->set_optimizer(std::move(opt)); + this->m_weights.push_back(w.get()); + this->m_model->add_weights(std::move(w)); + } + if (this->m_weights.size() != 1) { + LBANN_ERROR("attempted to setup ", + this->get_type()," layer \"",this->get_name(),"\" ", + "with an invalid number of weights ", + "(expected 1, found ",this->m_weights.size(),")"); + } + + // Initialize dictionary + auto& dict = *m_weights[0]; + auto matrix_dist = get_prev_activations().DistData(); + matrix_dist.colDist = El::STAR; + matrix_dist.rowDist = El::STAR; + dict.set_dims({static_cast(m_embedding_dim)}, + {static_cast(m_num_embeddings)}); + dict.set_matrix_distribution(matrix_dist); + dict.setup(); + + // Zero out embedding vector for padding index + if (0 <= m_padding_idx + && m_padding_idx < static_cast(m_embedding_dim)) { + auto& dict_values = dict.get_values(); + std::unique_ptr pad_embedding(dict_values.Construct(dict_values.Grid(), + dict_values.Root())); + El::View(*pad_embedding, dict_values, El::ALL, El::IR(m_padding_idx)); + El::Zero(*pad_embedding); + } + + // Initialize gradient w.r.t. dictionary + m_dictionary_gradient.Resize(m_embedding_dim, m_num_embeddings); + +} + #ifndef LBANN_EMBEDDING_LAYER_INSTANTIATE extern template class embedding_layer< data_layout::DATA_PARALLEL, El::Device::CPU>; diff --git a/src/layers/learning/embedding.cpp b/src/layers/learning/embedding.cpp index e79a7aa5bc3..835fa29cf9d 100644 --- a/src/layers/learning/embedding.cpp +++ b/src/layers/learning/embedding.cpp @@ -37,74 +37,26 @@ void embedding_layer::setup_matrices m_dictionary_gradient = StarMat(grid); } -template <> -void embedding_layer::setup_dims() { - Layer::setup_dims(); - - // Make sure input dimensions are valid - if (this->get_input_size() != 1) { - const auto& input_dims = this->get_input_dims(); - std::ostringstream err; - err << get_type() << " layer \"" << get_name() << "\" " - << "recieved an input tensor with invalid dimensions " - << "(expected 1, got "; - for (size_t i = 0; i < input_dims.size(); ++i) { - err << (i > 0 ? "x" : "") << input_dims[i]; - } - err << ")"; - LBANN_ERROR(err.str()); - } - - // Output is size of embedding vector - this->set_output_dims({static_cast(m_embedding_size)}); - -} - -template <> -void embedding_layer::setup_data() { - Layer::setup_data(); - - // Make sure layer has weights for dictionary - if (this->m_weights.size() != 1) { - std::ostringstream err; - err << "attempted to setup " - << this->get_type() << " layer \"" << this->get_name() << "\" " - << "with an invalid number of weights " - << "(expected 1, " - << "found " << this->m_weights.size() << ")"; - LBANN_ERROR(err.str()); - } - - // Initialize dictionary - auto& dict = *m_weights[0]; - auto matrix_dist = get_prev_activations().DistData(); - matrix_dist.colDist = El::STAR; - matrix_dist.rowDist = El::STAR; - dict.set_dims({static_cast(m_embedding_size)}, - {static_cast(m_dictionary_size)}); - dict.set_matrix_distribution(matrix_dist); - - // Initialize gradient w.r.t. 
dictionary - m_dictionary_gradient.Resize(m_embedding_size, m_dictionary_size); - -} - template <> void embedding_layer::fp_compute() { // Local data - const auto& local_dict = m_weights[0]->get_values().LockedMatrix(); - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_dict = dynamic_cast(m_weights[0]->get_values().LockedMatrix()); + const auto& local_input = dynamic_cast(get_local_prev_activations()); + auto& local_output = dynamic_cast(get_local_activations()); const auto& local_width = local_input.Width(); // Populate output matrix with appropriate columns of dictionary CPUMat dict_v, output_v; for (El::Int col = 0; col < local_width; ++ col) { - const El::Int ind = static_cast(local_input(0, col)); - El::LockedView(dict_v, local_dict, El::ALL, El::IR(ind)); El::View(output_v, local_output, El::ALL, El::IR(col)); - El::Copy(dict_v, output_v); + const El::Int ind = static_cast(std::floor(local_input(0, col))); + if (0 <= ind && ind < static_cast(m_num_embeddings)) { + El::LockedView(dict_v, local_dict, El::ALL, El::IR(ind)); + El::Copy(dict_v, output_v); + } else { + El::Zero(output_v); + } } } @@ -120,21 +72,26 @@ void embedding_layer::bp_compute() { auto& opt = *m_weights[0]->get_optimizer(); // Local data - const auto& local_input = get_local_prev_activations(); - auto& local_dict_grad = m_dictionary_gradient.Matrix(); - const auto& local_output_grad = get_local_prev_error_signals(); + const auto& local_input = dynamic_cast(get_local_prev_activations()); + auto& local_dict_grad = dynamic_cast(m_dictionary_gradient.Matrix()); + const auto& local_output_grad = dynamic_cast(get_local_prev_error_signals()); const auto& local_width = local_input.Width(); const auto& c = static_cast(this->m_model->get_execution_context()); const auto& mini_batch_size = c.get_effective_mini_batch_size(); // Update appropriate columns of gradient w.r.t. dictionary + // Note: Don't update gradient for padding index El::Zero(local_dict_grad); CPUMat dict_grad_v, output_grad_v; for (El::Int col = 0; col < local_width; ++ col) { - const El::Int ind = static_cast(local_input(0, col)); - El::View(dict_grad_v, local_dict_grad, El::ALL, El::IR(ind)); - El::LockedView(output_grad_v, local_output_grad, El::ALL, El::IR(col)); - El::Axpy(DataType{1}, output_grad_v, dict_grad_v); + const El::Int ind = static_cast(std::floor(local_input(0, col))); + if (0 <= ind + && ind < static_cast(m_num_embeddings) + && ind != m_padding_idx) { + El::View(dict_grad_v, local_dict_grad, El::ALL, El::IR(ind)); + El::LockedView(output_grad_v, local_output_grad, El::ALL, El::IR(col)); + El::Axpy(DataType{1}, output_grad_v, dict_grad_v); + } } opt.add_to_gradient(m_dictionary_gradient, DataType{1} / mini_batch_size, diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 8910d8a1114..7c06f8b96d1 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -243,10 +243,14 @@ std::unique_ptr construct_layer( // Learning layers if (proto_layer.has_embedding()) { const auto& params = proto_layer.embedding(); + const size_t num_embeddings = params.num_embeddings(); + const size_t embedding_dim = params.embedding_dim(); + const El::Int padding_idx = (params.has_padding_idx() ? 
+                                 params.padding_idx().value() : -1);
     if (Layout == data_layout::DATA_PARALLEL && Device == El::Device::CPU) {
       return lbann::make_unique<embedding_layer<data_layout::DATA_PARALLEL, El::Device::CPU>>(
-        comm, params.dictionary_size(), params.embedding_size());
+        comm, num_embeddings, embedding_dim, padding_idx);
     } else {
       LBANN_ERROR("embedding layer is only supported with "
                   "data-parallel data layout and on CPU");
diff --git a/src/proto/layers.proto b/src/proto/layers.proto
index 550d5bb7a9e..094e0d3828a 100644
--- a/src/proto/layers.proto
+++ b/src/proto/layers.proto
@@ -28,6 +28,8 @@ syntax = "proto3";
 
 package lbann_data;
 
+import "google/protobuf/wrappers.proto";
+
 message Layer {
   string name = 50;
   string parents = 151;
@@ -517,9 +519,28 @@ message Layer {
     double l2_regularization_factor = 12; //default: 0
   }
 
+  /** @brief Lookup table to embedding vectors.
+   *
+   *  Takes a scalar input, interprets it as an index, and outputs the
+   *  corresponding vector. The number of embedding vectors and the
+   *  size of vectors are fixed. If the index is out-of-range, then
+   *  the output is a vector of zeros.
+   *
+   *  The embedding vectors are stored in an
+   *  @f$ \text{embedding_dim} \times \text{num_embeddings} @f$
+   *  weights matrix. Note that this is the transpose of the weights
+   *  in the PyTorch embedding layer.
+   */
   message Embedding {
-    int64 dictionary_size = 1;
-    int64 embedding_size = 2;
+    /// Size of dictionary of embeddings
+    int64 num_embeddings = 1;
+    /// Size of embedding vectors
+    int64 embedding_dim = 2;
+    /** If the padding index is set, then the corresponding embedding
+     *  vector is initialized with zeros. The objective function
+     *  gradient w.r.t. this embedding vector is always zero.
+     */
+    google.protobuf.Int64Value padding_idx = 3;
   }
 
   message ChannelwiseScaleBias {}
diff --git a/src/weights/weights.cpp b/src/weights/weights.cpp
index ff459c05891..8531da28ff4 100644
--- a/src/weights/weights.cpp
+++ b/src/weights/weights.cpp
@@ -274,6 +274,9 @@ void weights::setup_default_matrix_distribution() {
 
 void weights::setup() {
 
+  // Return immediately if weights have already been setup
+  if (m_values != nullptr) { return; }
+
   // Check that tensor dimensions are valid
   const auto& is_nonpositive = [] (int d) { return d <= 0; };
   if (std::any_of(m_matrix_height_dims.begin(),

From 2239232b1f0fe910f8667ad9436a92851f712aea Mon Sep 17 00:00:00 2001
From: Tim Moon 
Date: Mon, 21 Oct 2019 17:22:23 -0700
Subject: [PATCH 359/634] Tinkering with MPI launch invocations (#1307)

* Get Python frontend to run on Lassen without system-specific
optimizations

* Tinker with optimized thread affinity on Pascal and Lassen

* Environment optimizations in LC-specific launcher are optional
---
 python/lbann/contrib/lc/launcher.py | 73 +++++++++++++++++++----------
 python/lbann/launcher/lsf.py        | 10 +++-
 2 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py
index fc51f9ce246..34a3ca7c6a5 100644
--- a/python/lbann/contrib/lc/launcher.py
+++ b/python/lbann/contrib/lc/launcher.py
@@ -103,45 +103,66 @@ def make_batch_script(script_file=None,
     environment = environment.copy()
 
     # Setup GPU bindings
-    # Note: Hydrogen processes take ownership of the GPU indices that
-    # matches their node communicator ranks. mpibind assigns each rank
-    # a unique GPU with index 0, so it should be disabled. Processes
-    # may touch the wrong GPUs in the process of figuring out GPU
-    # ownership, so an exclusive GPU compute mode causes problems.
+ # Note: Each Hydrogen process is assigned to the GPU index that + # matches its node communicator rank. This is not compatible with + # mpibind, which assigns a GPU with index 0 to each process. We + # can't use an exclusive GPU compute mode since processes may + # touch the wrong GPU while figuring out ownership. if scheduler == 'slurm' and has_gpu(system): launcher_args.extend(['--mpibind=off', '--nvidia_compute_mode=default']) - # Deal with Pascal's hardware topology - # Note: Both GPUs on a Pascal node are on the same socket, so we - # only use cores on that socket. - if system == 'pascal' and procs_per_node == 2: + # Optimized thread affinity for Pascal + # Note: Both GPUs are on socket 0, so we only use cores on that + # socket. + if system == 'pascal': + cores_per_socket = cores_per_node(system) // 2 + cores_per_proc = cores_per_socket // procs_per_node + if 'AL_PROGRESS_RANKS_PER_NUMA_NODE' not in environment: + environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = procs_per_node + if 'OMP_NUM_THREADS' not in environment: + environment['OMP_NUM_THREADS'] = cores_per_proc - 1 if scheduler == 'slurm': - launcher_args.append('--cpu_bind=mask_cpu:0x000001ff,0x0003fe00') - environment['OMP_NUM_THREADS'] = 8 - environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = 2 + masks = [2**cores_per_proc - 1] + while len(masks) < procs_per_node: + masks.append(masks[-1] << cores_per_proc) + mask_str = ','.join([hex(mask) for mask in masks]) + launcher_args.append('--cpu_bind=mask_cpu:{}'.format(mask_str)) # Hacked bugfix for MPI_Init in MVAPICH2-2.3 # Note: MPI_Init hangs when started with more than 35 # processes. This bug is not present in MVAPICH2-2.2 but is # present in MVAPICH2-2.3rc2. - environment['MV2_USE_RDMA_CM'] = 0 + if 'MV2_USE_RDMA_CM' not in environment: + environment['MV2_USE_RDMA_CM'] = 0 - # Magic default arguments to jsrun/etc. - # Note: Pack processes using ten cores for each, with 40 cores total, and - # all four GPUs visible to each process. + # Optimizations for Sierra-like systems if system in ('sierra', 'lassen'): + + # Set thread affinity + # Note: Aluminum's default thread affinity is incorrect since + # hwloc treats GPUs as NUMA domains. + # Note: There are actually 22 cores/socket, but it seems that + # powers of 2 are better for performance. + cores_per_socket = 16 + procs_per_socket = (procs_per_node + 1) // 2 + cores_per_proc = cores_per_socket // procs_per_socket + if 'AL_PROGRESS_RANKS_PER_NUMA_NODE' not in environment: + environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = procs_per_socket + if 'OMP_NUM_THREADS' not in environment: + environment['OMP_NUM_THREADS'] = cores_per_proc if scheduler == 'lsf': - launcher_args.extend([ - '--launch_distribution packed', - '--bind "packed:10"', - '--rs_per_host 1', - '--cpu_per_rs 40', - '--gpu_per_rs 4' - ]) - environment['OMP_NUM_THREADS'] = 4 - # Deal with topology mis-identification on Sierra/Lassen. - environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = 2 + launcher_args.append('--bind packed:{}'.format(cores_per_proc)) + + # Hack to enable process forking + # Note: InfiniBand is known to experience hangs if an MPI + # process is forked (see + # https://www.open-mpi.org/faq/?category=openfabrics#ofa-fork). + # Setting IBV_FORK_SAFE seems to fix this issue, but it may + # hurt performance (see + # https://linux.die.net/man/3/ibv_fork_init). 
+ if 'IBV_FORK_SAFE' not in environment: + environment['IBV_FORK_SAFE'] = 1 return lbann.launcher.make_batch_script(script_file=script_file, work_dir=work_dir, diff --git a/python/lbann/launcher/lsf.py b/python/lbann/launcher/lsf.py index a4846f966af..6448a3fca85 100644 --- a/python/lbann/launcher/lsf.py +++ b/python/lbann/launcher/lsf.py @@ -120,8 +120,14 @@ def add_parallel_command(self, procs_per_node = self.procs_per_node args = [launcher] args.extend(make_iterable(launcher_args)) - args.append('-n {}'.format(nodes)) - args.append('--tasks_per_rs {}'.format(procs_per_node)) + args.extend([ + '--nrs {}'.format(nodes), + '--rs_per_host 1', + '--tasks_per_rs {}'.format(procs_per_node), + '--launch_distribution packed', + '--cpu_per_rs ALL_CPUS', + '--gpu_per_rs ALL_GPUS', + ]) args.extend(make_iterable(command)) self.add_command(args) From aebf1884d18a1d668edd03647db1d82b8086e703 Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Tue, 22 Oct 2019 08:59:36 -0700 Subject: [PATCH 360/634] remove ckpt directories before running lenet checkpoint tests (#1316) --- bamboo/unit_tests/test_unit_checkpoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bamboo/unit_tests/test_unit_checkpoint.py b/bamboo/unit_tests/test_unit_checkpoint.py index adf8f2fab67..f0b6398f8da 100644 --- a/bamboo/unit_tests/test_unit_checkpoint.py +++ b/bamboo/unit_tests/test_unit_checkpoint.py @@ -18,7 +18,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, # No checkpointing, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_shared_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_shared_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) - os.system('mkdir ckpt_lenet_shared') + os.system('rm -rf ckpt_lenet_shared && mkdir ckpt_lenet_shared') no_ckpt_dir = 'ckpt_lenet_shared/no_ckpt_{c}'.format(c=compiler_name) command = tools.get_command( cluster=cluster, executable=exe, num_nodes=1, num_processes=2, @@ -83,7 +83,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, # No checkpointing, printing weights to files. output_file_name = '%s/bamboo/unit_tests/output/checkpoint_lenet_distributed_no_checkpoint_%s_output.txt' % (dir_name, compiler_name) error_file_name = '%s/bamboo/unit_tests/error/checkpoint_lenet_distributed_no_checkpoint_%s_error.txt' % (dir_name, compiler_name) - os.system('mkdir ckpt_lenet_distributed') + os.system('rm -rf ckpt_lenet_distributed && mkdir ckpt_lenet_distributed') no_ckpt_dir = 'ckpt_lenet_distributed/no_ckpt_{c}'.format(c=compiler_name) command = tools.get_command( cluster=cluster, executable=exe, num_nodes=1, num_processes=2, From 4d2a74ec8489575b96e328dda2a7c11345a6e0fc Mon Sep 17 00:00:00 2001 From: Jae-Seung Yeom Date: Tue, 22 Oct 2019 11:45:04 -0700 Subject: [PATCH 361/634] Have only the trainer master save shared RNG states and create the directory for RNG states to avoid race condition. --- include/lbann/utils/random.hpp | 2 +- src/utils/random.cpp | 44 +++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/include/lbann/utils/random.hpp b/include/lbann/utils/random.hpp index 92074e30847..4c4aa2a8be6 100644 --- a/include/lbann/utils/random.hpp +++ b/include/lbann/utils/random.hpp @@ -247,7 +247,7 @@ void bernoulli_fill_procdet(AbsDistMat& mat, El::Int m, El::Int n, double p = 0. 
void uniform_fill_procdet(AbsDistMat& mat, El::Int m, El::Int n, DataType center = 0.0f, DataType radius = 1.0f); -bool save_rng_to_checkpoint(persist& p, const lbann_comm* comm); +bool save_rng_to_checkpoint(persist& p, lbann_comm* comm); bool load_rng_from_checkpoint(persist& p, const lbann_comm* comm); template diff --git a/src/utils/random.cpp b/src/utils/random.cpp index 6421f8c491e..3aba97d5774 100644 --- a/src/utils/random.cpp +++ b/src/utils/random.cpp @@ -99,31 +99,37 @@ fast_rng_gen& get_fast_io_generator() { return ::fast_io_generator; } -bool save_rng_to_checkpoint(persist& p, const lbann_comm* comm) { +bool save_rng_to_checkpoint(persist& p, lbann_comm* comm) { std::string dirname = std::string(p.m_checkpoint_dir) + "/rng_state"; - makedir(dirname.c_str()); + std::string rank_in_trainer; std::string rng_name; - /// @todo - Note that the RNG with thread local data is not correct - rng_name = dirname + "/rng_seq_generator"; - std::ofstream rng_seq(rng_name); - if(!rng_seq) { LBANN_ERROR("Failed to open ", rng_name); } - rng_seq << ::data_seq_generator; - rng_seq.close(); + if (comm == nullptr) { + rank_in_trainer = std::to_string(El::mpi::Rank(El::mpi::COMM_WORLD)); + makedir(dirname.c_str()); + } else { + rank_in_trainer = std::to_string(comm->get_rank_in_trainer()); + if (comm->am_trainer_master()) { + makedir(dirname.c_str()); + } + comm->trainer_barrier(); + } + + if (comm == nullptr || comm->am_trainer_master()) { + /// @todo - Note that the RNG with thread local data is not correct + rng_name = dirname + "/rng_seq_generator"; + std::ofstream rng_seq(rng_name); + if(!rng_seq) { LBANN_ERROR("Failed to open ", rng_name); } + rng_seq << ::data_seq_generator; + rng_seq.close(); #ifdef LBANN_SET_EL_RNG - rng_name = dirname + "/EL_generator"; - std::ofstream rng_EL(rng_name); - if(!rng_EL) { LBANN_ERROR("Failed to open ", rng_name); } - rng_EL << El::Generator(); - rng_EL.close(); + rng_name = dirname + "/EL_generator"; + std::ofstream rng_EL(rng_name); + if(!rng_EL) { LBANN_ERROR("Failed to open ", rng_name); } + rng_EL << El::Generator(); + rng_EL.close(); #endif - - std::string rank_in_world; - if (comm == nullptr) { - rank_in_world = std::to_string(El::mpi::Rank(El::mpi::COMM_WORLD)); - } else { - rank_in_world = std::to_string(comm->get_rank_in_world()); } /// @todo - Note that the RNG with thread local data is not correct From 797047b6fb9c02ed16352a7625cb5df34af5e876 Mon Sep 17 00:00:00 2001 From: Jae-Seung Yeom Date: Tue, 22 Oct 2019 11:46:35 -0700 Subject: [PATCH 362/634] label RNG states by the rank_in_trainer instead of the rank_in_world. 
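
For illustration (a sketch of the resulting checkpoint layout; the
second suffix is the OpenMP thread ID and varies by build), a trainer
with two ranks checkpointing with two threads per rank now writes

    <ckpt_dir>/rng_state/rng_generator_0_0
    <ckpt_dir>/rng_state/rng_generator_0_1
    <ckpt_dir>/rng_state/rng_generator_1_0
    <ckpt_dir>/rng_state/rng_generator_1_1

where the first suffix is the trainer-local rank rather than the
global (world) rank.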
--- src/utils/random.cpp | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/utils/random.cpp b/src/utils/random.cpp index 3aba97d5774..22276176e5d 100644 --- a/src/utils/random.cpp +++ b/src/utils/random.cpp @@ -133,14 +133,14 @@ bool save_rng_to_checkpoint(persist& p, lbann_comm* comm) { } /// @todo - Note that the RNG with thread local data is not correct - rng_name = dirname + "/rng_io_generator_" + rank_in_world; + rng_name = dirname + "/rng_io_generator_" + rank_in_trainer; std::ofstream rng_io(rng_name); if(!rng_io) { LBANN_ERROR("Failed to open ", rng_name); } rng_io << ::io_generator; rng_io.close(); /// @todo - Note that the RNG with thread local data is not correct - rng_name = dirname + "/rng_fast_io_generator_" + rank_in_world; + rng_name = dirname + "/rng_fast_io_generator_" + rank_in_trainer; std::ofstream rng_fast_io(rng_name); if(!rng_fast_io) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast_io << ::fast_io_generator; @@ -149,26 +149,28 @@ bool save_rng_to_checkpoint(persist& p, lbann_comm* comm) { #ifdef _OPENMP #pragma omp parallel private(rng_name) { - rng_name = dirname + "/rng_generator_" + rank_in_world + "_" + std::to_string(omp_get_thread_num()); + rng_name = dirname + "/rng_generator_" + rank_in_trainer + "_" + + std::to_string(omp_get_thread_num()); std::ofstream rng(rng_name); if(!rng) { LBANN_ERROR("Failed to open ", rng_name); } rng << ::generator; rng.close(); - rng_name = dirname + "/rng_fast_generator_" + rank_in_world + "_" + std::to_string(omp_get_thread_num()); + rng_name = dirname + "/rng_fast_generator_" + rank_in_trainer + "_" + + std::to_string(omp_get_thread_num()); std::ofstream rng_fast(rng_name); if(!rng_fast) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast << ::fast_generator; rng_fast.close(); } #else - rng_name = dirname + "/rng_generator_" + rank_in_world; + rng_name = dirname + "/rng_generator_" + rank_in_trainer; std::ofstream rng(rng_name); if(!rng) { LBANN_ERROR("Failed to open ", rng_name); } rng << ::generator; rng.close(); - rng_name = dirname + "/rng_fast_generator_" + rank_in_world; + rng_name = dirname + "/rng_fast_generator_" + rank_in_trainer; std::ofstream rng_fast(rng_name); if(!rng_fast) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast << ::fast_generator; @@ -196,21 +198,21 @@ bool load_rng_from_checkpoint(persist& p, const lbann_comm* comm) { rng_EL >> El::Generator(); #endif - std::string rank_in_world; + std::string rank_in_trainer; if (comm == nullptr) { - rank_in_world = std::to_string(El::mpi::Rank(El::mpi::COMM_WORLD)); + rank_in_trainer = std::to_string(El::mpi::Rank(El::mpi::COMM_WORLD)); } else { - rank_in_world = std::to_string(comm->get_rank_in_world()); + rank_in_trainer = std::to_string(comm->get_rank_in_trainer()); } /// @todo - Note that the RNG with thread local data is not correct - rng_name = dirname + "/rng_io_generator_" + rank_in_world; + rng_name = dirname + "/rng_io_generator_" + rank_in_trainer; std::ifstream rng_io(rng_name); if(!rng_io) { LBANN_ERROR("Failed to open ", rng_name); } rng_io >> ::io_generator; /// @todo - Note that the RNG with thread local data is not correct - rng_name = dirname + "/rng_fast_io_generator_" + rank_in_world; + rng_name = dirname + "/rng_fast_io_generator_" + rank_in_trainer; std::ifstream rng_fast_io(rng_name); if(!rng_fast_io) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast_io >> ::fast_io_generator; @@ -218,23 +220,25 @@ bool load_rng_from_checkpoint(persist& p, const lbann_comm* comm) { 
#ifdef _OPENMP #pragma omp parallel private(rng_name) { - rng_name = dirname + "/rng_generator_" + rank_in_world + "_" + std::to_string(omp_get_thread_num()); + rng_name = dirname + "/rng_generator_" + rank_in_trainer + "_" + + std::to_string(omp_get_thread_num()); std::ifstream rng(rng_name); if(!rng) { LBANN_ERROR("Failed to open ", rng_name); } rng >> ::generator; - rng_name = dirname + "/rng_fast_generator_" + rank_in_world + "_" + std::to_string(omp_get_thread_num()); + rng_name = dirname + "/rng_fast_generator_" + rank_in_trainer + "_" + + std::to_string(omp_get_thread_num()); std::ifstream rng_fast(rng_name); if(!rng_fast) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast >> ::fast_generator; } #else - rng_name = dirname + "/rng_generator_" + rank_in_world; + rng_name = dirname + "/rng_generator_" + rank_in_trainer; std::ifstream rng(rng_name); if(!rng) { LBANN_ERROR("Failed to open ", rng_name); } rng >> ::generator; - rng_name = dirname + "/rng_fast_generator_" + rank_in_world; + rng_name = dirname + "/rng_fast_generator_" + rank_in_trainer; std::ifstream rng_fast(rng_name); if(!rng_fast) { LBANN_ERROR("Failed to open ", rng_name); } rng_fast >> ::fast_generator; From f7d2fc528c391da4a9cf6c93210e72cb818b7ca0 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 22 Oct 2019 14:40:58 -0700 Subject: [PATCH 363/634] Bugfix for softmax and log-softmax layers (#1318) * Fix bug in GPU log-softmax layer A CUDA kernel was sometimes launched with a grid size of 0 * Fix bug in GPU softmax layer A CUDA kernel was sometimes launched with a grid size of 0 * Make sure Bamboo layer unit tests have a last mini-batch of size 1 Helps catch bugs when a process has no local data. * Fix indexing bugs in log-softmax layer CUDA kernels * Document softmax and log-softmax layers in layers.proto --- .../unit_tests/test_unit_datareader_python.py | 4 +- bamboo/unit_tests/test_unit_layer_argmax.py | 4 +- bamboo/unit_tests/test_unit_layer_argmin.py | 4 +- .../test_unit_layer_channelwise_scale_bias.py | 4 +- .../unit_tests/test_unit_layer_convolution.py | 2 +- .../test_unit_layer_cross_entropy.py | 4 +- .../unit_tests/test_unit_layer_embedding.py | 2 +- ...nit_layer_entrywise_batch_normalization.py | 2 +- .../test_unit_layer_entrywise_scale_bias.py | 2 +- .../test_unit_layer_fully_connected.py | 2 +- .../unit_tests/test_unit_layer_log_softmax.py | 258 ++++++++-- .../test_unit_layer_mean_squared_error.py | 2 +- bamboo/unit_tests/test_unit_layer_one_hot.py | 2 +- bamboo/unit_tests/test_unit_layer_slice.py | 2 +- bamboo/unit_tests/test_unit_layer_softmax.py | 5 +- include/lbann/layers/activations/softmax.hpp | 11 +- include/lbann/utils/cuda.hpp | 34 ++ include/lbann/utils/impl/cuda.hpp | 58 +++ src/layers/activations/log_softmax.cpp | 20 +- src/layers/activations/log_softmax.cu | 437 +++++++++-------- src/layers/activations/softmax.cpp | 17 +- src/layers/activations/softmax.cu | 459 +++++++++--------- src/proto/layers.proto | 11 + 23 files changed, 824 insertions(+), 522 deletions(-) diff --git a/bamboo/unit_tests/test_unit_datareader_python.py b/bamboo/unit_tests/test_unit_datareader_python.py index e6fd5f22f24..1d1d3827c4f 100644 --- a/bamboo/unit_tests/test_unit_datareader_python.py +++ b/bamboo/unit_tests/test_unit_datareader_python.py @@ -17,7 +17,7 @@ # Data np.random.seed(20190708) -_num_samples = 23 +_num_samples = 29 _sample_size = 7 _samples = np.random.normal(size=(_num_samples,_sample_size)) _samples = _samples.astype(np.float32) @@ -78,7 +78,7 @@ def construct_model(lbann): execution_modes='test')) # 
Construct model - mini_batch_size = 5 + mini_batch_size = num_samples() // 4 num_epochs = 0 return lbann.Model(mini_batch_size, num_epochs, diff --git a/bamboo/unit_tests/test_unit_layer_argmax.py b/bamboo/unit_tests/test_unit_layer_argmax.py index f2b2f83bbb7..ff6fc7bee1e 100644 --- a/bamboo/unit_tests/test_unit_layer_argmax.py +++ b/bamboo/unit_tests/test_unit_layer_argmax.py @@ -19,7 +19,7 @@ # Data np.random.seed(20190911) -_num_samples = 31 +_num_samples = 35 _sample_dims = (11,) _sample_size = functools.reduce(operator.mul, _sample_dims) _samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) @@ -98,7 +98,7 @@ def l2_norm2(x): execution_modes='test')) # Construct model - mini_batch_size = 17 + mini_batch_size = num_samples() // 2 num_epochs = 0 return lbann.Model(mini_batch_size, num_epochs, diff --git a/bamboo/unit_tests/test_unit_layer_argmin.py b/bamboo/unit_tests/test_unit_layer_argmin.py index 5887818e65b..0a6c90b12df 100644 --- a/bamboo/unit_tests/test_unit_layer_argmin.py +++ b/bamboo/unit_tests/test_unit_layer_argmin.py @@ -19,7 +19,7 @@ # Data np.random.seed(201909112) -_num_samples = 31 +_num_samples = 37 _sample_dims = (11,) _sample_size = functools.reduce(operator.mul, _sample_dims) _samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) @@ -101,7 +101,7 @@ def l2_norm2(x): # Construct model # ------------------------------------------ - mini_batch_size = 17 + mini_batch_size = num_samples() // 2 num_epochs = 0 return lbann.Model(mini_batch_size, num_epochs, diff --git a/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py index 302dc262723..f952c138c14 100644 --- a/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py +++ b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py @@ -19,7 +19,7 @@ # Data np.random.seed(20190719) -_num_samples = 29 +_num_samples = 23 _sample_dims = (7,5,3) _sample_size = functools.reduce(operator.mul, _sample_dims) _samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) @@ -117,7 +117,7 @@ def l2_norm2(x): callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) # Construct model - mini_batch_size = 17 + mini_batch_size = num_samples() // 2 num_epochs = 0 return lbann.Model(mini_batch_size, num_epochs, diff --git a/bamboo/unit_tests/test_unit_layer_convolution.py b/bamboo/unit_tests/test_unit_layer_convolution.py index ba1faec3ed2..1962a83e4d3 100644 --- a/bamboo/unit_tests/test_unit_layer_convolution.py +++ b/bamboo/unit_tests/test_unit_layer_convolution.py @@ -275,7 +275,7 @@ def l2_norm2(x): # Construct model # ------------------------------------------ - mini_batch_size = 11 + mini_batch_size = num_samples() // 2 num_epochs = 0 return lbann.Model(mini_batch_size, num_epochs, diff --git a/bamboo/unit_tests/test_unit_layer_cross_entropy.py b/bamboo/unit_tests/test_unit_layer_cross_entropy.py index 0fb25b25553..c692b2b4f34 100644 --- a/bamboo/unit_tests/test_unit_layer_cross_entropy.py +++ b/bamboo/unit_tests/test_unit_layer_cross_entropy.py @@ -28,7 +28,7 @@ np.random.seed(201910143) _samples = np.random.uniform(low=0.25, high=1, - size=(13,2,7)).astype(np.float32) + size=(23,2,7)).astype(np.float32) # Sample access functions def get_sample(index): @@ -168,7 +168,7 @@ def l2_norm2(x): # Construct model # ------------------------------------------ - mini_batch_size = 11 + mini_batch_size = num_samples() // 2 num_epochs = 0 return lbann.Model(mini_batch_size, num_epochs, diff 
--git a/bamboo/unit_tests/test_unit_layer_embedding.py b/bamboo/unit_tests/test_unit_layer_embedding.py index 86f4c203314..2e30677577d 100644 --- a/bamboo/unit_tests/test_unit_layer_embedding.py +++ b/bamboo/unit_tests/test_unit_layer_embedding.py @@ -174,7 +174,7 @@ def l2_norm2(x): # ------------------------------------------ # Construct model - mini_batch_size = 17 + mini_batch_size = num_samples() // 2 num_epochs = 0 return lbann.Model(mini_batch_size, num_epochs, diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py index 3f9d5cf7f74..fda72f7e5e1 100644 --- a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py +++ b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py @@ -124,7 +124,7 @@ def str_list(it): # Construct model # ------------------------------------------ - mini_batch_size = 64 + mini_batch_size = num_samples() // 2 num_epochs = 1 return lbann.Model(mini_batch_size, num_epochs, diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py index 08053863192..23178565fe2 100644 --- a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py +++ b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py @@ -164,7 +164,7 @@ def l2_norm2(x): # Construct model # ------------------------------------------ - mini_batch_size = 17 + mini_batch_size = num_samples() // 2 num_epochs = 0 return lbann.Model(mini_batch_size, num_epochs, diff --git a/bamboo/unit_tests/test_unit_layer_fully_connected.py b/bamboo/unit_tests/test_unit_layer_fully_connected.py index 609a51afd5d..f7bc06a52cd 100644 --- a/bamboo/unit_tests/test_unit_layer_fully_connected.py +++ b/bamboo/unit_tests/test_unit_layer_fully_connected.py @@ -251,7 +251,7 @@ def l2_norm2(x): # Construct model # ------------------------------------------ - mini_batch_size = 17 + mini_batch_size = num_samples() // 2 num_epochs = 0 return lbann.Model(mini_batch_size, num_epochs, diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 1dc2d7bab74..7eec61dfdcf 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -1,56 +1,210 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
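+#
+# A sketch of the reader-side protocol (illustrative; the import name
+# and call sequence are assumptions, not the reader's actual
+# internals):
+#
+#   import test_unit_layer_log_softmax as mod
+#   dims = mod.sample_dims()   # (11,)
+#   n = mod.num_samples()      # 15
+#   x = mod.get_sample(0)      # NumPy array with 11 entries
+#
+# so the three functions below must stay importable at module scope.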
+ +# Data +np.random.seed(201910213) +_num_samples = 15 +_sample_size = 11 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy softmax +# ============================================== + +def numpy_log_softmax(x): + """NumPy implementation of log-softmax.""" + x = x.astype(np.float64) + x = x - np.max(x) + return x - np.log(np.sum(np.exp(x))) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Convenience function to convert list to a space-separated string + def str_list(it): + return ' '.join([str(i) for i in it]) + + # Convenience function to compute L2 norm squared with NumPy + def l2_norm2(x): + x = x.reshape(-1).astype(np.float64) + return np.inner(x, x) + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LogSoftmax(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = numpy_log_softmax(x) + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LogSoftmax(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + y = numpy_log_softmax(x) + z = l2_norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # 
------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_log_softmax(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_log_softmax: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_log_softmax_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_log_softmax_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='log_softmax', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_log_softmax_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_log_softmax(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_log_softmax_gcc7(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_log_softmax(cluster, exes, dirname, 'gcc7', - weekly, data_reader_percent) - - -def test_unit_layer_log_softmax_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_log_softmax(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_log_softmax_exe' --exe= -def test_unit_layer_log_softmax_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_log_softmax_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_log_softmax(cluster, exes, dirname, 'exe', - weekly, data_reader_percent) +# Create test functions that can interact with PyTest +# Note: Create test name by removing ".py" from file name +_test_name = os.path.splitext(os.path.basename(current_file))[0] +for test in tools.create_tests(setup_experiment, _test_name): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_mean_squared_error.py 
b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py
index 02680e8fce4..6fe16a7a8f3 100644
--- a/bamboo/unit_tests/test_unit_layer_mean_squared_error.py
+++ b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py
@@ -166,7 +166,7 @@ def l2_norm2(x):
     # Construct model
     # ------------------------------------------
 
-    mini_batch_size = 11
+    mini_batch_size = num_samples() // 2
     num_epochs = 0
     return lbann.Model(mini_batch_size,
                        num_epochs,
diff --git a/bamboo/unit_tests/test_unit_layer_one_hot.py b/bamboo/unit_tests/test_unit_layer_one_hot.py
index efc0927044f..6b4c50ccce7 100644
--- a/bamboo/unit_tests/test_unit_layer_one_hot.py
+++ b/bamboo/unit_tests/test_unit_layer_one_hot.py
@@ -86,7 +86,7 @@ def construct_model(lbann):
                      execution_modes='test'))
 
     # Construct model
-    mini_batch_size = 19
+    mini_batch_size = num_samples() // 2
     num_epochs = 0
     return lbann.Model(mini_batch_size,
                        num_epochs,
diff --git a/bamboo/unit_tests/test_unit_layer_slice.py b/bamboo/unit_tests/test_unit_layer_slice.py
index 8acb6f2cd70..52da1b097e2 100644
--- a/bamboo/unit_tests/test_unit_layer_slice.py
+++ b/bamboo/unit_tests/test_unit_layer_slice.py
@@ -196,7 +196,7 @@ def l2_norm2(x):
     # Construct model
     # --------------------------
 
-    mini_batch_size = 17
+    mini_batch_size = num_samples() // 2
     num_epochs = 0
     return lbann.Model(mini_batch_size,
                        num_epochs,
diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py
index 5d1e11b83b6..44636f91d87 100644
--- a/bamboo/unit_tests/test_unit_layer_softmax.py
+++ b/bamboo/unit_tests/test_unit_layer_softmax.py
@@ -38,7 +38,8 @@ def sample_dims():
 def numpy_softmax(x):
     """NumPy implementation of softmax.
 
-    There is an implementation in SciPy 1.2.0 (scipy.special.softmax).
+    There is also an implementation in SciPy 1.2.0
+    (scipy.special.softmax).
 
     """
     x = x.astype(np.float64)
@@ -158,7 +159,7 @@ def l2_norm2(x):
     # Construct model
     # ------------------------------------------
 
-    mini_batch_size = 17
+    mini_batch_size = num_samples() // 2
     num_epochs = 0
     return lbann.Model(mini_batch_size,
                        num_epochs,
diff --git a/include/lbann/layers/activations/softmax.hpp b/include/lbann/layers/activations/softmax.hpp
index 6a43b3e0e91..e86969181a1 100644
--- a/include/lbann/layers/activations/softmax.hpp
+++ b/include/lbann/layers/activations/softmax.hpp
@@ -31,16 +31,17 @@
 #include "lbann/utils/cudnn.hpp"
 
 // Threshold outputs to a minimum value.
+
 // If enabled, the minimum output value is sqrt(min), where min is the
 // minimum, normalized, positive value (~1e-19 for float and ~1e-154
-// for double). The gradients w.r.t. input will be inaccurate, on the
-// order of the minimum output value.
-#define LBANN_ENABLE_SOFTMAX_CUTOFF
+// for double). During backprop, gradients are computed as if
+// thresholding did not occur, so there will be a discrepancy for
+// values that are thresholded.
+#define LBANN_ENABLE_SOFTMAX_THRESHOLD
 
 namespace lbann {
 
-/** @brief
- *
+/**
  *  @f[ \text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}} @f]
  */
 template <data_layout Layout, El::Device Device>
diff --git a/include/lbann/utils/cuda.hpp b/include/lbann/utils/cuda.hpp
index 87201c0fe8d..6fba40a03ca 100644
--- a/include/lbann/utils/cuda.hpp
+++ b/include/lbann/utils/cuda.hpp
@@ -110,6 +110,40 @@ namespace cuda {
 template <typename T> __device__ __forceinline__
 T atomic_add(T* address, T val);
 
+/** @brief Sum over threads in CUDA block
+ *
+ * Every thread in a CUDA block must enter this function. The sum is
+ * returned on thread 0.
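+ *
+ * A usage sketch (illustrative; it mirrors the reduction kernels in
+ * log_softmax.cu, where each thread accumulates a partial value and
+ * thread 0 publishes the block's sum):
+ *
+ *   DataType thread_sum = 0;
+ *   for (size_t row = gidx; row < height; row += nthreadsx) {
+ *     thread_sum += values[row + col*ldim];
+ *   }
+ *   const DataType block_sum = cuda::block_reduce<bsize,1,1>(thread_sum);
+ *   if (tid == 0) { cuda::atomic_add(&sums[col], block_sum); }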
+ *
+ * @tparam bdimx x-dimension of CUDA block
+ * @tparam bdimy y-dimension of CUDA block
+ * @tparam bdimz z-dimension of CUDA block
+ * @tparam T Data type
+ * @param val Contribution from thread
+ * @returns On thread 0, the sum. Not meaningful on other threads.
+ */
+template <size_t bdimx, size_t bdimy, size_t bdimz, typename T>
+__device__ __forceinline__
+T block_reduce(T val);
+
+/** @brief Reduction over threads in CUDA block
+ *
+ * Every thread in a CUDA block must enter this function. The reduced
+ * value is returned on thread 0.
+ *
+ * @tparam bdimx x-dimension of CUDA block
+ * @tparam bdimy y-dimension of CUDA block
+ * @tparam bdimz z-dimension of CUDA block
+ * @tparam T Data type
+ * @tparam Op Functor for reduction operation
+ * @param val Contribution from each thread
+ * @returns On thread 0, the reduced value. Not meaningful on other
+ * threads.
+ */
+template <size_t bdimx, size_t bdimy, size_t bdimz, typename T, typename Op>
+__device__ __forceinline__
+T block_reduce(T val);
+
 // Unary math functions
 template <typename T> __device__ __forceinline__ T abs(const T& x);
 template <typename T> __device__ __forceinline__ T round(const T& x);
diff --git a/include/lbann/utils/impl/cuda.hpp b/include/lbann/utils/impl/cuda.hpp
index 56b1ed27d9b..0434b6e22b1 100644
--- a/include/lbann/utils/impl/cuda.hpp
+++ b/include/lbann/utils/impl/cuda.hpp
@@ -25,7 +25,12 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 #include
+
+// Headers for NVCC
 #ifdef __CUDACC__
+#ifdef HYDROGEN_HAVE_CUB
+#include "cub/block/block_reduce.cuh"
+#endif // HYDROGEN_HAVE_CUB
 #include
 #include
 #endif // __CUDACC__
@@ -83,6 +88,59 @@ double atomic_add(double* address, double val) {
 #endif // __CUDA_ARCH__ < 600
 }
 
+// Block reduction
+template <size_t bdimx, size_t bdimy, size_t bdimz, typename T>
+__device__ __forceinline__
+T block_reduce(T val) {
+#ifdef HYDROGEN_HAVE_CUB
+  constexpr auto reduce_algo = cub::BLOCK_REDUCE_WARP_REDUCTIONS;
+  using BlockReduce = cub::BlockReduce<T, bdimx, reduce_algo, bdimy, bdimz>;
+  __shared__ typename BlockReduce::TempStorage workspace;
+  val = BlockReduce(workspace).Sum(val);
+#else
+  const size_t tid = threadIdx.x + threadIdx.y*bdimx + threadIdx.z*bdimx*bdimy;
+  constexpr size_t bsize = bdimx * bdimy * bdimz;
+  __shared__ T shared_vals[bsize];
+  shared_vals[tid] = val;
+  for (size_t stride = bsize/2; stride > 0; stride /= 2) {
+    __syncthreads();
+    if (tid < stride) {
+      shared_vals[tid] = shared_vals[tid] + shared_vals[tid+stride];
+    }
+  }
+  if (tid == 0) {
+    val = shared_vals[0];
+  }
+#endif // HYDROGEN_HAVE_CUB
+  return val;
+}
+template <size_t bdimx, size_t bdimy, size_t bdimz, typename T, typename Op>
+__device__ __forceinline__
+T block_reduce(T val) {
+#ifdef HYDROGEN_HAVE_CUB
+  constexpr auto reduce_algo = cub::BLOCK_REDUCE_WARP_REDUCTIONS;
+  using BlockReduce = cub::BlockReduce<T, bdimx, reduce_algo, bdimy, bdimz>;
+  __shared__ typename BlockReduce::TempStorage workspace;
+  val = BlockReduce(workspace).Reduce(val, Op());
+#else
+  Op op;
+  const size_t tid = threadIdx.x + threadIdx.y*bdimx + threadIdx.z*bdimx*bdimy;
+  constexpr size_t bsize = bdimx * bdimy * bdimz;
+  __shared__ T shared_vals[bsize];
+  shared_vals[tid] = val;
+  for (size_t stride = bsize/2; stride > 0; stride /= 2) {
+    __syncthreads();
+    if (tid < stride) {
+      shared_vals[tid] = op(shared_vals[tid], shared_vals[tid+stride]);
+    }
+  }
+  if (tid == 0) {
+    val = shared_vals[0];
+  }
+#endif // HYDROGEN_HAVE_CUB
+  return val;
+}
+
 // Unary math functions
 #define WRAP_UNARY_CUDA_MATH_FUNCTION(func) \
   template <> __device__ __forceinline__ \
diff --git a/src/layers/activations/log_softmax.cpp b/src/layers/activations/log_softmax.cpp
index ccf4992e91f..5e734c989d5 100644
--- a/src/layers/activations/log_softmax.cpp
+++ b/src/layers/activations/log_softmax.cpp
@@ -37,11 +37,11 @@ void fp(lbann_comm& comm,
         AbsDistMat& workspace) {
 
   // Local matrices
-  const auto& local_input = input.LockedMatrix();
-  auto& local_output = output.Matrix();
-  auto& local_workspace = workspace.Matrix();
-  const auto& local_height = local_input.Height();
-  const auto& local_width = local_input.Width();
+  const auto& local_input = dynamic_cast<const CPUMat&>(input.LockedMatrix());
+  auto& local_output = dynamic_cast<CPUMat&>(output.Matrix());
+  auto& local_workspace = dynamic_cast<CPUMat&>(workspace.Matrix());
+  const auto local_height = local_input.Height();
+  const auto local_width = local_input.Width();
 
   // Find column-wise maximum entries
   El::Fill(workspace, std::numeric_limits<DataType>::lowest());
@@ -59,7 +59,7 @@ void fp(lbann_comm& comm,
   LBANN_OMP_PARALLEL_FOR
   for (El::Int col = 0; col < local_width; ++col) {
     const auto shift = local_workspace(0, col);
-    DataType sum = 0;
+    DataType sum{0};
     for (El::Int row = 0; row < local_height; ++row) {
       const auto& x = local_input(row, col);
       auto& y = local_output(row, col);
@@ -89,10 +89,10 @@ void bp(lbann_comm& comm,
         AbsDistMat& workspace) {
 
   // Local matrices
-  const auto& local_output = output.LockedMatrix();
-  const auto& local_gradient_wrt_output = gradient_wrt_output.LockedMatrix();
-  auto& local_gradient_wrt_input = gradient_wrt_input.Matrix();
-  auto& local_workspace = workspace.Matrix();
+  const auto& local_output = dynamic_cast<const CPUMat&>(output.LockedMatrix());
+  const auto& local_gradient_wrt_output = dynamic_cast<const CPUMat&>(gradient_wrt_output.LockedMatrix());
+  auto& local_gradient_wrt_input = dynamic_cast<CPUMat&>(gradient_wrt_input.Matrix());
+  auto& local_workspace = dynamic_cast<CPUMat&>(workspace.Matrix());
   const auto& local_height = local_output.Height();
   const auto& local_width = local_output.Width();
diff --git a/src/layers/activations/log_softmax.cu b/src/layers/activations/log_softmax.cu
index 43584d28c46..e2f5e639096 100644
--- a/src/layers/activations/log_softmax.cu
+++ b/src/layers/activations/log_softmax.cu
@@ -26,199 +26,219 @@
 #define LBANN_LOG_SOFTMAX_LAYER_INSTANTIATE
 #include "lbann/layers/activations/log_softmax.hpp"
+#include "lbann/utils/cuda.hpp"
 
 namespace lbann {
 
 namespace {
 
-/** Find largest entry within each CUDA block.
- *  Each block is assigned several entries from the same mini-batch
- *  sample and it finds the largest entry. Results are output to an
- *  nblocksx x width matrix.
+/** @brief Max functor */
+template <typename T>
+struct max_op {
+  __device__ __forceinline__
+  DataType operator()(const T& x1, const T& x2) const {
+    return cuda::max(x1, x2);
+  }
+};
+
+/** @brief Kernel for max reduction on matrix columns
+ *
+ * Each CUDA block computes the max over a subset of matrix entries
+ * and outputs the result. This is repeated multiple times for
+ * column-wise max reduction.
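+ *
+ * A launch sketch (illustrative; the real launch configuration is in
+ * fp_compute below, and partial_maxes is a hypothetical name for the
+ * nblocksx x width output buffer):
+ *
+ *   constexpr size_t bsize = 256;
+ *   dim3 block_dims(bsize, 1, 1);
+ *   dim3 grid_dims((height + bsize - 1) / bsize, width, 1);
+ *   reduce_max_kernel<bsize><<<grid_dims, block_dims, 0, stream>>>(
+ *     height, width, values, values_ldim, partial_maxes);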
+ *
+ * Block dimensions: bsize x 1 x 1
+ *
+ * Grid dimension: (height / bsize) x width x 1
+ *
+ * @param values (height x width) matrix
+ * @param max_values (nblocksx x width) matrix
  */
-template <El::Int block_size>
-__global__ void reduce_max_kernel(El::Int height, El::Int width,
+template <size_t bsize>
+__global__ void reduce_max_kernel(size_t height,
+                                  size_t width,
                                   const DataType* __restrict__ values,
-                                  El::Int values_ldim,
+                                  size_t values_ldim,
                                   DataType* __restrict__ max_values) {
 
   // Indices
-  const El::Int tid = threadIdx.x;
-  const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x;
-  const El::Int bidx = blockIdx.x;
-  const El::Int bidy = blockIdx.y;
-  const El::Int nthreadsx = blockDim.x * gridDim.x;
-  const El::Int nblocksx = gridDim.x;
-  const El::Int nblocksy = gridDim.y;
+  const size_t tid = threadIdx.x;
+  const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
+  const size_t bidx = blockIdx.x;
+  const size_t bidy = blockIdx.y;
+  const size_t nthreadsx = blockDim.x * gridDim.x;
+  const size_t nblocksx = gridDim.x;
+  const size_t nblocksy = gridDim.y;
 
-  // Reduce each matrix column independently
-  for (El::Int col = bidy; col < width; col += nblocksy) {
+  for (size_t col = bidy; col < width; col += nblocksy) {
 
     // Find largest value for each thread
-    DataType private_max_val = -cuda::infinity<DataType>();
-    for (El::Int row = gidx; row < height; row += nthreadsx) {
-      private_max_val = cuda::max(private_max_val,
-                                  values[row + col * values_ldim]);
+    DataType thread_max_val{-cuda::infinity<DataType>()};
+    for (size_t row = gidx; row < height; row += nthreadsx) {
+      const auto& val = values[row+col*values_ldim];
+      thread_max_val = cuda::max(thread_max_val, val);
    }
 
-    // Shared memory reduction to get largest value for each block
-    __shared__ DataType shared_max_vals[block_size];
-    shared_max_vals[tid] = private_max_val;
-    for (El::Int stride = block_size / 2; stride > 0; stride /= 2) {
-      __syncthreads();
-      if (tid < stride) {
-        shared_max_vals[tid] = cuda::max(shared_max_vals[tid],
-                                         shared_max_vals[tid + stride]);
-      }
-    }
+    // Find largest value for each block
+    const DataType block_max_val
+      = cuda::block_reduce<bsize,1,1,DataType,max_op<DataType>>(thread_max_val);
     if (tid == 0) {
-      max_values[bidx + col*nblocksx] = shared_max_vals[0];
+      max_values[bidx+col*nblocksx] = block_max_val;
     }
 
   }
 
 }
 
-/** Exponentiate inputs and compute sum(exp(x)).
- *  Inputs are shifted by the column max to prevent LogSumExp from
- *  blowing up.
+/** @brief Kernel for matrix column sums
+ *
+ * Block dimensions: bsize x 1 x 1
+ *
+ * Grid dimension: (height / bsize) x width x 1
+ *
+ * @param sums On input, array of zeros. On output, sum(x) for each
+ *        column.
 */
-template <El::Int block_size>
-__global__ void fp_exp_kernel(El::Int height, El::Int width,
-                              const DataType* __restrict__ input,
-                              El::Int input_ldim,
-                              DataType* __restrict__ output,
-                              El::Int output_ldim,
-                              const DataType* __restrict__ shifts,
-                              El::Int shifts_stride,
-                              DataType* __restrict__ sums,
-                              El::Int sums_stride) {
+template <size_t bsize>
+__global__ void reduce_sum_kernel(size_t height,
+                                  size_t width,
+                                  const DataType* __restrict__ values,
+                                  size_t values_ldim,
+                                  DataType* __restrict__ sums) {
 
   // Indices
-  const El::Int tid = threadIdx.x;
-  const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x;
-  const El::Int bidy = blockIdx.y;
-  const El::Int nthreadsx = blockDim.x * gridDim.x;
-  const El::Int nblocksy = gridDim.y;
+  const size_t tid = threadIdx.x;
+  const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
+  const size_t bidy = blockIdx.y;
+  const size_t nthreadsx = blockDim.x * gridDim.x;
+  const size_t nblocksy = gridDim.y;
 
-  // Reduce each matrix column independently
-  for (El::Int col = bidy; col < width; col += nblocksy) {
-    const auto& shift = shifts[col * shifts_stride];
+  for (size_t col = bidy; col < width; col += nblocksy) {
 
-    // Exponentiate inputs and compute sum for each thread
-    DataType private_sum = 0;
-    for (El::Int row = gidx; row < height; row += nthreadsx) {
-      const auto& x = input[row + col * input_ldim];
-      auto& y = output[row + col * output_ldim];
-      y = x - shift;
-      private_sum += cuda::exp(y);
-    }
-
-    // Shared memory reduction to get sum for each block
-    __shared__ DataType shared_sums[block_size];
-    shared_sums[tid] = private_sum;
-    for (El::Int stride = block_size / 2; stride > 0; stride /= 2) {
-      __syncthreads();
-      if (tid < stride) {
-        shared_sums[tid] += shared_sums[tid + stride];
-      }
+    // Compute sum for each thread
+    DataType thread_sum{0};
+    for (size_t row = gidx; row < height; row += nthreadsx) {
+      thread_sum += values[row+col*values_ldim];
     }
 
-    // Atomic add to global sum
+    // Compute sum for each block
+    const DataType block_sum = cuda::block_reduce<bsize,1,1>(thread_sum);
     if (tid == 0) {
-      cuda::atomic_add(&sums[col * sums_stride], shared_sums[0]);
+      cuda::atomic_add(&sums[col], block_sum);
     }
 
   }
 
 }
 
-/** Subtract LogSumExp from outputs.
- *  sums should contain sum(exp(x)) for each column.
+/** @brief Compute sum(exp(x-shift)) for each matrix column
+ *
+ * Block dimensions: bsize x 1 x 1
+ *
+ * Grid dimension: (height / bsize) x width x 1
+ *
+ * @param shifts max(x) for each column
+ * @param sums On input, array of zeros. On output,
+ *        sum(exp(x-shift)) for each column.
  */
-__global__ void fp_lse_kernel(El::Int height, El::Int width,
-                              DataType* __restrict__ output,
-                              El::Int output_ldim,
-                              const DataType* __restrict__ sums,
-                              El::Int sums_stride) {
-  const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x;
-  const El::Int bidy = blockIdx.y;
-  const El::Int nthreadsx = blockDim.x * gridDim.x;
-  const El::Int nblocksy = gridDim.y;
-  for (El::Int col = bidy; col < width; col += nblocksy) {
-    const auto& log_sum_exp = cuda::log(sums[col * sums_stride]);
-    for (El::Int row = gidx; row < height; row += nthreadsx) {
-      auto& y = output[row + col * output_ldim];
-      y -= log_sum_exp;
-    }
-  }
-}
-
-/** Compute sum of entries in gradient w.r.t. output. */
-template <El::Int block_size>
-__global__ void bp_sum_kernel(El::Int height, El::Int width,
-                              const DataType* __restrict__ gradient_wrt_output,
-                              El::Int gradient_wrt_output_ldim,
-                              DataType* __restrict__ sums,
-                              El::Int sums_stride) {
+template <size_t bsize>
+__global__ void fp_sumexp_kernel(size_t height,
+                                 size_t width,
+                                 const DataType* __restrict__ input,
+                                 size_t input_ldim,
+                                 const DataType* __restrict__ shifts,
+                                 DataType* __restrict__ sums) {
 
   // Indices
-  const El::Int tid = threadIdx.x;
-  const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x;
-  const El::Int bidy = blockIdx.y;
-  const El::Int nthreadsx = blockDim.x * gridDim.x;
-  const El::Int nblocksy = gridDim.y;
+  const size_t tid = threadIdx.x;
+  const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
+  const size_t bidy = blockIdx.y;
+  const size_t nthreadsx = blockDim.x * gridDim.x;
+  const size_t nblocksy = gridDim.y;
 
-  // Compute sum for each matrix column independently
-  for (El::Int col = bidy; col < width; col += nblocksy) {
-
-    // Compute sum for each thread
-    DataType private_sum = 0;
-    for (El::Int row = gidx; row < height; row += nthreadsx) {
-      const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim];
-      private_sum += dy;
-    }
+  for (size_t col = bidy; col < width; col += nblocksy) {
+    const auto& shift = shifts[col];
 
-    // Shared memory reduction to get sum for each block
-    __shared__ DataType shared_sums[block_size];
-    shared_sums[tid] = private_sum;
-    for (El::Int stride = block_size / 2; stride > 0; stride /= 2) {
-      __syncthreads();
-      if (tid < stride) {
-        shared_sums[tid] += shared_sums[tid + stride];
-      }
+    // Exponentiate inputs and compute sum for each thread
+    DataType thread_sum{0};
+    for (size_t row = gidx; row < height; row += nthreadsx) {
+      const auto& x = input[row+col*input_ldim];
+      thread_sum += cuda::exp(x-shift);
     }
 
-    // Atomic add to global sum
+    // Compute sum for each block
+    const DataType block_sum = cuda::block_reduce<bsize,1,1>(thread_sum);
     if (tid == 0) {
-      cuda::atomic_add(&sums[col * sums_stride], shared_sums[0]);
+      cuda::atomic_add(&sums[col], block_sum);
    }
 
  }
 
 }
 
-/** Compute gradient w.r.t. input. */
-template <El::Int block_size>
-__global__ void bp_kernel(El::Int height, El::Int width,
+/** @brief Compute layer output
+ *
+ * y = x - shift - log(sum(exp(x-shift)))
+ *
+ * Block dimensions: bsize x 1 x 1
+ *
+ * Grid dimension: (height / bsize) x width x 1
+ *
+ * @param shifts max(x) for each column
+ * @param sums sum(exp(x-shift)) for each column
+ */
+__global__ void fp_output_kernel(size_t height,
+                                 size_t width,
+                                 const DataType* __restrict__ input,
+                                 size_t input_ldim,
+                                 DataType* __restrict__ output,
+                                 size_t output_ldim,
+                                 const DataType* __restrict__ shifts,
+                                 const DataType* __restrict__ sums) {
+  const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
+  const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y;
+  const size_t nthreadsx = blockDim.x * gridDim.x;
+  const size_t nthreadsy = blockDim.y * gridDim.y;
+  for (size_t col = gidy; col < width; col += nthreadsy) {
+    const auto& shift = shifts[col];
+    const DataType log_sum_exp = cuda::log(sums[col]);
+    for (size_t row = gidx; row < height; row += nthreadsx) {
+      const auto& x = input[row+col*input_ldim];
+      auto& y = output[row+col*output_ldim];
+      y = x - shift - log_sum_exp;
+    }
+  }
+}
+
+/** @brief Compute gradient w.r.t. input
+ *
+ * dx = dy - softmax(x) * sum(dy)
+ *
+ * Block dimensions: bsize x 1 x 1
+ *
+ * Grid dimension: (height / bsize) x width x 1
+ *
+ * @param sums Column sums of the gradient w.r.t. output
output + */ +__global__ void bp_kernel(size_t height, + size_t width, const DataType* __restrict__ output, - El::Int output_ldim, + size_t output_ldim, const DataType* __restrict__ gradient_wrt_output, - El::Int gradient_wrt_output_ldim, + size_t gradient_wrt_output_ldim, const DataType* __restrict__ sums, - El::Int sums_stride, DataType* __restrict__ gradient_wrt_input, - El::Int gradient_wrt_input_ldim) { - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - const El::Int nthreadsx = blockDim.x * gridDim.x; - const El::Int nblocksy = gridDim.y; - for (El::Int col = bidy; col < width; col += nblocksy) { - const auto& sum = sums[col * sums_stride]; - for (El::Int row = gidx; row < height; row += nthreadsx) { - const auto& y = output[row + col * output_ldim]; - const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim]; - auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_ldim]; + size_t gradient_wrt_input_ldim) { + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nthreadsy = blockDim.y * gridDim.y; + for (size_t col = gidy; col < width; col += nthreadsy) { + const auto& sum = sums[col]; + for (size_t row = gidx; row < height; row += nthreadsx) { + const auto& y = output[row+col*output_ldim]; + const auto& dy = gradient_wrt_output[row+col*gradient_wrt_output_ldim]; + auto& dx = gradient_wrt_input[row+col*gradient_wrt_input_ldim]; dx = dy - cuda::exp(y) * sum; } } @@ -230,8 +250,8 @@ template <> void log_softmax_layer::fp_compute() { constexpr DataType zero = 0; constexpr DataType one = 1; - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = dynamic_cast(get_local_prev_activations()); + auto& local_output = dynamic_cast(get_local_activations()); if (!local_input.IsEmpty()) { CHECK_CUDNN(cudnnSoftmaxForward(cudnn::get_handle(), CUDNN_SOFTMAX_LOG, @@ -249,9 +269,9 @@ template <> void log_softmax_layer::bp_compute() { constexpr DataType zero = 0; constexpr DataType one = 1; - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_output = dynamic_cast(get_local_activations()); + const auto& local_gradient_wrt_output = dynamic_cast(get_local_prev_error_signals()); + auto& local_gradient_wrt_input = dynamic_cast(get_local_error_signals()); if (!local_output.IsEmpty()) { CHECK_CUDNN(cudnnSoftmaxBackward(cudnn::get_handle(), CUDNN_SOFTMAX_LOG, @@ -271,67 +291,79 @@ template <> void log_softmax_layer::fp_compute() { // Local matrices - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); - auto& local_workspace = m_workspace->Matrix(); - const auto& local_height = local_input.Height(); - const auto& local_width = local_input.Width(); + const auto& local_input = dynamic_cast(get_local_prev_activations()); + auto& local_output = dynamic_cast(get_local_activations()); + auto& local_workspace = dynamic_cast(m_workspace->Matrix()); + const size_t local_height = local_input.Height(); + const size_t local_width = local_input.Width(); // GPU objects auto&& stream = El::GPUManager::Stream(); auto&& event = El::GPUManager::Event(); El::SyncInfo sync_info{stream, event}; - // Initialize CUDA threads/blocks - // Note: kernels 
use a 2D thread distribution with a 256 x 1 block - // and nblocksx x local_width grid. - constexpr El::Int block_size = 256; - dim3 block_dims, grid_dims; - block_dims.x = block_size; - grid_dims.y = local_width; - - // Find column-wise maximum entries - grid_dims.x = (local_height + block_size - 1) / block_size; - if (grid_dims.x < 1) { grid_dims.x = 1; } - cuda::thrust::vector max_vals(grid_dims.x * local_width); - reduce_max_kernel<<>>( - local_height, local_width, - local_input.LockedBuffer(), local_input.LDim(), - max_vals.data().get()); - while (grid_dims.x > 1) { - const El::Int prev_height = grid_dims.x; - grid_dims.x = (prev_height + block_size - 1) / block_size; - cuda::thrust::vector prev_vals(std::move(max_vals)); + // Find max value in each column + cuda::thrust::vector max_vals; + if (local_input.IsEmpty()) { + max_vals.resize(local_width, + -std::numeric_limits::infinity()); + } + else { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_height + block_size - 1) / block_size; + grid_dims.y = local_width; max_vals.resize(grid_dims.x * local_width); reduce_max_kernel<<>>( - prev_height, local_width, - prev_vals.data().get(), prev_height, + local_height, local_width, + local_input.LockedBuffer(), local_input.LDim(), max_vals.data().get()); + while (grid_dims.x > 1) { + const size_t prev_height = grid_dims.x; + grid_dims.x = (prev_height + block_size - 1) / block_size; + cuda::thrust::vector prev_vals(std::move(max_vals)); + max_vals.resize(grid_dims.x * local_width); + reduce_max_kernel<<>>( + prev_height, local_width, + prev_vals.data().get(), prev_height, + max_vals.data().get()); + } } El::mpi::AllReduce(max_vals.data().get(), max_vals.size(), El::mpi::MAX, m_workspace->RedundantComm(), sync_info); - // Shift inputs and compute sum(exp(x)) for each column + // Compute sum(exp(x-max_val)) for each column El::Zero(*m_workspace); - if (!local_output.IsEmpty()) { + if (!local_input.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; - fp_exp_kernel<<>>( + grid_dims.y = local_width; + fp_sumexp_kernel<<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), - local_output.Buffer(), local_output.LDim(), - max_vals.data().get(), 1, - local_workspace.Buffer(), 1); + max_vals.data().get(), + local_workspace.Buffer()); } - El::AllReduce(*m_workspace, m_workspace->RedundantComm()); + get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); - // Compute output by subtracting LogSumExp + // Compute output + // Note: y = x - max_val - log(sum(exp(x-max_val))) if (!local_output.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; - fp_lse_kernel<<>>( + grid_dims.y = local_width; + fp_output_kernel<<>>( local_height, local_width, + local_input.LockedBuffer(), local_input.LDim(), local_output.Buffer(), local_output.LDim(), - local_workspace.LockedBuffer(), 1); + max_vals.data().get(), + local_workspace.LockedBuffer()); } } @@ -340,55 +372,56 @@ template <> void log_softmax_layer::bp_compute() { // Local matrices - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); - auto& local_workspace = m_workspace->Matrix(); - const auto& 
local_height = local_output.Height(); - const auto& local_width = local_output.Width(); + const auto& local_output = dynamic_cast(get_local_activations()); + const auto& local_gradient_wrt_output = dynamic_cast(get_local_prev_error_signals()); + auto& local_gradient_wrt_input = dynamic_cast(get_local_error_signals()); + auto& local_workspace = dynamic_cast(m_workspace->Matrix()); + const size_t local_height = local_output.Height(); + const size_t local_width = local_output.Width(); // GPU objects auto&& stream = El::GPUManager::Stream(); auto&& event = El::GPUManager::Event(); El::SyncInfo sync_info{stream, event}; - // Initialize CUDA threads/blocks - // Note: kernels use a 2D thread distribution with a 256 x 1 block - // and nblocksx x local_width grid. - constexpr El::Int block_size = 256; - dim3 block_dims, grid_dims; - block_dims.x = block_size; - grid_dims.y = local_width; - // Compute sum of entries in gradient w.r.t. output El::Zero(local_workspace); - if (!local_output.IsEmpty()) { + if (!local_gradient_wrt_output.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; - bp_sum_kernel + grid_dims.y = local_width; + reduce_sum_kernel <<>>( local_height, local_width, local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), - local_workspace.Buffer(), 1); + local_workspace.Buffer()); } - El::AllReduce(*m_workspace, m_workspace->RedundantComm()); + get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); // Compute gradient w.r.t. input - if (!local_output.IsEmpty()) { + if (!local_gradient_wrt_input.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; - bp_kernel<<>>( + grid_dims.y = local_width; + bp_kernel<<>>( local_height, local_width, local_output.LockedBuffer(), local_output.LDim(), local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), - local_workspace.Buffer(), 1, + local_workspace.LockedBuffer(), local_gradient_wrt_input.Buffer(), local_gradient_wrt_input.LDim()); } } +// Template instantiation template class log_softmax_layer< data_layout::DATA_PARALLEL, El::Device::GPU>; template class log_softmax_layer< diff --git a/src/layers/activations/softmax.cpp b/src/layers/activations/softmax.cpp index 1839f642903..236409bf825 100644 --- a/src/layers/activations/softmax.cpp +++ b/src/layers/activations/softmax.cpp @@ -31,12 +31,10 @@ namespace lbann { namespace { -// Minimum output value to avoid denormalized floats -#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF -const DataType min_output = std::sqrt(std::numeric_limits::min()); -#else -const DataType min_output = 0; -#endif // LBANN_ENABLE_SOFTMAX_CUTOFF +#ifdef LBANN_ENABLE_SOFTMAX_THRESHOLD +/** Minimum output value to avoid denormalized floats */ +constexpr DataType threshold_val = std::sqrt(std::numeric_limits::min()); +#endif // LBANN_ENABLE_SOFTMAX_THRESHOLD void fp(lbann_comm& comm, const AbsDistMat& input, @@ -86,7 +84,10 @@ void fp(lbann_comm& comm, const auto& scale = 1 / local_workspace(0, col); for (El::Int row = 0; row < local_height; ++row) { auto& y = local_output(row, col); - y = std::max(scale * y, min_output); + y = scale * y; +#ifdef LBANN_ENABLE_SOFTMAX_THRESHOLD + y = std::max(y, threshold_val); +#endif // LBANN_ENABLE_SOFTMAX_THRESHOLD } } @@ -127,7 +128,7 @@ void bp(lbann_comm& comm, const auto& y = local_output(row, col); const auto& dy = 
  local_gradient_wrt_output(row, col);
       auto& dx = local_gradient_wrt_input(row, col);
-      dx = (y > min_output) ? y * (dy - y_dot_dy) : DataType(0);
+      dx = y * (dy - y_dot_dy);
     }
   }
diff --git a/src/layers/activations/softmax.cu b/src/layers/activations/softmax.cu
index 07b807f4a23..ebcf868c835 100644
--- a/src/layers/activations/softmax.cu
+++ b/src/layers/activations/softmax.cu
@@ -26,234 +26,236 @@
 #define LBANN_SOFTMAX_LAYER_INSTANTIATE
 #include "lbann/layers/activations/softmax.hpp"
+#include "lbann/utils/cuda.hpp"

 namespace lbann {
 namespace {

-/** Minimum output value to avoid denormalized floats. */
-inline __device__ DataType get_min_output() {
-#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF
-  return cuda::sqrt(cuda::min<DataType>());
-#else
-  return DataType(0);
-#endif // LBANN_ENABLE_SOFTMAX_CUTOFF
-}
-
-#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF
-/** Operator for thresholding output. */
-struct fp_threshold_op {
-  const DataType min_output = get_min_output();
-  inline __device__ DataType operator()(const DataType& y) const {
-    return cuda::max(y, min_output);
+#ifdef LBANN_ENABLE_SOFTMAX_THRESHOLD
+/** Functor to ensure values are above the threshold value */
+struct threshold_op {
+  __forceinline__ __device__ DataType operator()(const DataType& y) const {
+    return cuda::max(y, cuda::sqrt(cuda::min<DataType>()));
   }
 };
-/** Operator for thresholding gradient w.r.t. input. */
-struct bp_threshold_op {
-  const DataType min_output = get_min_output();
-  inline __device__ DataType operator()(const DataType& y,
-                                        const DataType& dx) const {
-    return (y > min_output) ? dx : DataType(0);
+#endif // LBANN_ENABLE_SOFTMAX_THRESHOLD
+
+/** @brief Max functor */
+template <typename T>
+struct max_op {
+  __device__ __forceinline__
+  DataType operator()(const T& x1, const T& x2) const {
+    return cuda::max(x1, x2);
   }
 };
-#endif // LBANN_ENABLE_SOFTMAX_CUTOFF

-/** Find largest entry within each CUDA block.
- * Each block is assigned several entries from the same mini-batch
- * sample and it finds the largest entry. Results are output to an
- * nblocksx x width matrix.
+/** @brief Kernel for max reduction on matrix columns
+ *
+ * Each CUDA block computes the max over a subset of matrix entries
+ * and outputs the result. This is repeated multiple times for
+ * column-wise max reduction.
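+ *
+ * A host-side sketch of the repeated launch pattern, mirroring the
+ * fp_compute() loop later in this file (illustration only; the
+ * partial_max / next_partial_max buffer names are placeholders, not
+ * part of this patch):
+ *
+ *   size_t nblocksx = (height + bsize - 1) / bsize;
+ *   reduce_max_kernel<bsize><<<dim3(nblocksx, width, 1), bsize>>>(
+ *     height, width, values, values_ldim, partial_max);
+ *   while (nblocksx > 1) {  // keep reducing the nblocksx x width output
+ *     const size_t prev = nblocksx;
+ *     nblocksx = (prev + bsize - 1) / bsize;
+ *     reduce_max_kernel<bsize><<<dim3(nblocksx, width, 1), bsize>>>(
+ *       prev, width, partial_max, prev, next_partial_max);
+ *   }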
+ *
+ * Block dimensions: bsize x 1 x 1
+ *
+ * Grid dimension: (height / bsize) x width x 1
+ *
+ * @param values (height x width) matrix
+ * @param max_values (nblocksx x width) matrix
+ */
-template <El::Int block_size>
-__global__ void reduce_max_kernel(El::Int height, El::Int width,
+template <size_t bsize>
+__global__ void reduce_max_kernel(size_t height,
+                                  size_t width,
                                   const DataType* __restrict__ values,
-                                  El::Int values_ldim,
+                                  size_t values_ldim,
                                   DataType* __restrict__ max_values) {

   // Indices
-  const El::Int tid = threadIdx.x;
-  const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x;
-  const El::Int bidx = blockIdx.x;
-  const El::Int bidy = blockIdx.y;
-  const El::Int nthreadsx = blockDim.x * gridDim.x;
-  const El::Int nblocksx = gridDim.x;
-  const El::Int nblocksy = gridDim.y;
+  const size_t tid = threadIdx.x;
+  const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
+  const size_t bidx = blockIdx.x;
+  const size_t bidy = blockIdx.y;
+  const size_t nthreadsx = blockDim.x * gridDim.x;
+  const size_t nblocksx = gridDim.x;
+  const size_t nblocksy = gridDim.y;

-  // Reduce each matrix column independently
-  for (El::Int col = bidy; col < width; col += nblocksy) {
+  for (size_t col = bidy; col < width; col += nblocksy) {

     // Find largest value for each thread
-    DataType private_max_val = -cuda::infinity<DataType>();
-    for (El::Int row = gidx; row < height; row += nthreadsx) {
-      private_max_val = cuda::max(private_max_val,
-                                  values[row + col * values_ldim]);
+    DataType thread_max_val{-cuda::infinity<DataType>()};
+    for (size_t row = gidx; row < height; row += nthreadsx) {
+      const auto& val = values[row+col*values_ldim];
+      thread_max_val = cuda::max(thread_max_val, val);
     }

-    // Shared memory reduction to get largest value for each block
-    __shared__ DataType shared_max_vals[block_size];
-    shared_max_vals[tid] = private_max_val;
-    for (El::Int stride = block_size / 2; stride > 0; stride /= 2) {
-      __syncthreads();
-      if (tid < stride) {
-        shared_max_vals[tid] = cuda::max(shared_max_vals[tid],
-                                         shared_max_vals[tid + stride]);
-      }
-    }
+    // Find largest value for each block
+    const DataType block_max_val
+      = cuda::block_reduce<bsize,1,1,DataType,max_op<DataType>>(thread_max_val);
     if (tid == 0) {
-      max_values[bidx + col*nblocksx] = shared_max_vals[0];
+      max_values[bidx+col*nblocksx] = block_max_val;
     }

   }

 }

-/** Exponentiate outputs and compute column sums.
- * Subtracting by the column max prevents output from blowing
- * up. Large negative values underflow to 0.
+/** @brief Compute exp(x-shift)
+ *
+ * Also compute sum(exp(x-shift)) for each matrix column.
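+ *
+ * (Why the shift: subtracting the column max makes every exponent
+ * non-positive, so cuda::exp cannot overflow, and the softmax is
+ * unchanged because exp(x_i-s) / sum_j exp(x_j-s) is independent of
+ * s. For example, for x = (1000, 1001), exp(1000) overflows in
+ * double precision, while exp(x-1001) = (exp(-1), 1) is representable
+ * and yields the same result.)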
+ *
+ * Block dimensions: bsize x 1 x 1
+ *
+ * Grid dimension: (height / bsize) x width x 1
+ */
-template <El::Int block_size>
-__global__ void fp_exp_kernel(El::Int height, El::Int width,
+template <size_t bsize>
+__global__ void fp_exp_kernel(size_t height,
+                              size_t width,
                               const DataType* __restrict__ input,
-                              El::Int input_ldim,
+                              size_t input_ldim,
                               DataType* __restrict__ output,
-                              El::Int output_ldim,
+                              size_t output_ldim,
                               const DataType* __restrict__ shifts,
-                              El::Int shifts_stride,
-                              DataType* __restrict__ sums,
-                              El::Int sums_stride) {
+                              DataType* __restrict__ sums) {

   // Indices
-  const El::Int tid = threadIdx.x;
-  const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x;
-  const El::Int bidy = blockIdx.y;
-  const El::Int nthreadsx = blockDim.x * gridDim.x;
-  const El::Int nblocksy = gridDim.y;
-
-  // Reduce each matrix column independently
-  for (El::Int col = bidy; col < width; col += nblocksy) {
-    const auto& shift = shifts[col * shifts_stride];
-
-    // Exponentiate and compute sum for each thread
-    DataType private_sum = 0;
-    for (El::Int row = gidx; row < height; row += nthreadsx) {
-      const auto& x = input[row + col * input_ldim];
-      auto& y = output[row + col * output_ldim];
-      y = cuda::exp(x - shift);
-      private_sum += y;
-    }
-
-    // Shared memory reduction to get sum for each block
-    __shared__ DataType shared_sums[block_size];
-    shared_sums[tid] = private_sum;
-    for (El::Int stride = block_size / 2; stride > 0; stride /= 2) {
-      __syncthreads();
-      if (tid < stride) {
-        shared_sums[tid] += shared_sums[tid + stride];
-      }
+  const size_t tid = threadIdx.x;
+  const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
+  const size_t bidy = blockIdx.y;
+  const size_t nthreadsx = blockDim.x * gridDim.x;
+  const size_t nblocksy = gridDim.y;
+
+  for (size_t col = bidy; col < width; col += nblocksy) {
+    const auto& shift = shifts[col];
+
+    // Exponentiate inputs and compute sum for each thread
+    DataType thread_sum{0};
+    for (size_t row = gidx; row < height; row += nthreadsx) {
+      const auto& x = input[row+col*input_ldim];
+      auto& y = output[row+col*output_ldim];
+      y = cuda::exp(x-shift);
+      thread_sum += y;
     }

-    // Atomic add to global sum
+    // Compute sum for each block
+    const DataType block_sum = cuda::block_reduce<bsize,1,1>(thread_sum);
     if (tid == 0) {
-      cuda::atomic_add(&sums[col * sums_stride], shared_sums[0]);
+      cuda::atomic_add(&sums[col], block_sum);
     }

   }

 }

-/** Divide outputs by column sums.
- * Small values can be rounded to minimum output value to avoid
- * denormalized floats.
+/** @brief Compute layer output
+ *
+ * y = exp(x-shift) / sum(exp(x-shift))
+ *
+ * If @c LBANN_ENABLE_SOFTMAX_THRESHOLD is set, small values are
+ * thresholded to a minimum value to avoid denormalized floats.
+ *
+ * Block dimensions: bsize x 1 x 1
+ *
+ * Grid dimension: (height / bsize) x width x 1
+ *
+ * @param output On input, contains exp(x-shift). On output,
+ * contains the layer output.
+ * @param sums sum(exp(x-shift)) for each column */ -__global__ void fp_scale_kernel(El::Int height, El::Int width, - DataType* __restrict__ output, - El::Int output_ldim, - const DataType* __restrict__ sums, - El::Int sums_stride) { - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - const El::Int nthreadsx = blockDim.x * gridDim.x; - const El::Int nblocksy = gridDim.y; - const auto& min_output = get_min_output(); - for (El::Int col = bidy; col < width; col += nblocksy) { - const auto& scale = 1 / sums[col * sums_stride]; - for (El::Int row = gidx; row < height; row += nthreadsx) { - auto& y = output[row + col * output_ldim]; - y = cuda::max(scale * y, min_output); +__global__ void fp_output_kernel(size_t height, + size_t width, + DataType* __restrict__ output, + size_t output_ldim, + const DataType* __restrict__ sums) { + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nthreadsy = blockDim.y * gridDim.y; + for (size_t col = gidy; col < width; col += nthreadsy) { + const auto& denom = sums[col]; + for (size_t row = gidx; row < height; row += nthreadsx) { + auto& y = output[row+col*output_ldim]; + y /= denom; +#ifdef LBANN_ENABLE_SOFTMAX_THRESHOLD + y = cuda::max(y, cuda::sqrt(cuda::min())); +#endif // LBANN_ENABLE_SOFTMAX_THRESHOLD } } } -/** Compute dot products between output and gradient w.r.t. output. */ -template -__global__ void bp_dot_product_kernel(El::Int height, El::Int width, +/** @brief Compute dot(y,dy) for each matrix column + * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimension: (height / bsize) x width x 1 + */ +template +__global__ void bp_dot_product_kernel(size_t height, + size_t width, const DataType* __restrict__ output, - El::Int output_ldim, + size_t output_ldim, const DataType* __restrict__ gradient_wrt_output, - El::Int gradient_wrt_output_ldim, - DataType* __restrict__ dot_products, - El::Int dot_products_stride) { + size_t gradient_wrt_output_ldim, + DataType* __restrict__ dot_products) { // Indices - const El::Int tid = threadIdx.x; - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - const El::Int nthreadsx = blockDim.x * gridDim.x; - const El::Int nblocksy = gridDim.y; + const size_t tid = threadIdx.x; + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t bidy = blockIdx.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nblocksy = gridDim.y; - // Compute dot product for each matrix column independently - for (El::Int col = bidy; col < width; col += nblocksy) { + for (size_t col = bidy; col < width; col += nblocksy) { // Compute dot product contribution for each thread - DataType private_dot_product = 0; - for (El::Int row = gidx; row < height; row += nthreadsx) { - const auto& y = output[row + col * output_ldim]; - const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim]; - private_dot_product += y * dy; - } - - // Shared memory reduction to get contribution for each block - __shared__ DataType shared_dot_products[block_size]; - shared_dot_products[tid] = private_dot_product; - for (El::Int stride = block_size / 2; stride > 0; stride /= 2) { - __syncthreads(); - if (tid < stride) { - shared_dot_products[tid] += shared_dot_products[tid + stride]; - } + DataType thread_dot_product{0}; + for (size_t row = gidx; row < height; row += nthreadsx) { + const auto& y = 
output[row+col*output_ldim]; + const auto& dy = gradient_wrt_output[row+col*gradient_wrt_output_ldim]; + thread_dot_product += y * dy; } - // Atomic add to global dot product + // Compute dot product contribution for each block + const DataType block_dot_product + = cuda::block_reduce(thread_dot_product); if (tid == 0) { - cuda::atomic_add(&dot_products[col * dot_products_stride], - shared_dot_products[0]); + cuda::atomic_add(&dot_products[col], block_dot_product); } } } -/** Compute gradient w.r.t. input. */ -template -__global__ void bp_kernel(El::Int height, El::Int width, +/** @brief Compute gradient w.r.t. input + * + * dx = y * (dy - dot(y,dy)) + * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimension: (height / bsize) x width x 1 + * + * @param dot_products dot(y,dy) for each matrix column + */ +template +__global__ void bp_kernel(size_t height, + size_t width, const DataType* __restrict__ output, - El::Int output_ldim, + size_t output_ldim, const DataType* __restrict__ gradient_wrt_output, - El::Int gradient_wrt_output_ldim, + size_t gradient_wrt_output_ldim, const DataType* __restrict__ dot_products, - El::Int dot_products_stride, DataType* __restrict__ gradient_wrt_input, - El::Int gradient_wrt_input_ldim) { - const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; - const El::Int bidy = blockIdx.y; - const El::Int nthreadsx = blockDim.x * gridDim.x; - const El::Int nblocksy = gridDim.y; - const auto& min_output = get_min_output(); - for (El::Int col = bidy; col < width; col += nblocksy) { - const auto& y_dot_dy = dot_products[col * dot_products_stride]; - for (El::Int row = gidx; row < height; row += nthreadsx) { - const auto& y = output[row + col * output_ldim]; - const auto& dy = gradient_wrt_output[row + col * gradient_wrt_output_ldim]; - auto& dx = gradient_wrt_input[row + col * gradient_wrt_input_ldim]; - dx = (y > min_output) ? 
y * (dy - y_dot_dy) : DataType(0); + size_t gradient_wrt_input_ldim) { + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nthreadsy = blockDim.y * gridDim.y; + for (size_t col = gidy; col < width; col += nthreadsy) { + const auto& y_dot_dy = dot_products[col]; + for (size_t row = gidx; row < height; row += nthreadsx) { + const auto& y = output[row+col*output_ldim]; + const auto& dy = gradient_wrt_output[row+col*gradient_wrt_output_ldim]; + auto& dx = gradient_wrt_input[row+col*gradient_wrt_input_ldim]; + dx = y * (dy - y_dot_dy); } } } @@ -264,8 +266,8 @@ template <> void softmax_layer::fp_compute() { constexpr DataType zero = 0; constexpr DataType one = 1; - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = dynamic_cast(get_local_prev_activations()); + auto& local_output = dynamic_cast(get_local_activations()); if (!local_input.IsEmpty()) { CHECK_CUDNN(cudnnSoftmaxForward(cudnn::get_handle(), CUDNN_SOFTMAX_ACCURATE, @@ -276,10 +278,10 @@ void softmax_layer::fp_compute() { &zero, m_tensors_cudnn_desc.get_activations(), local_output.Buffer())); -#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF - cuda::apply_entrywise_unary_operator(local_output, - local_output); -#endif // LBANN_ENABLE_SOFTMAX_CUTOFF +#ifdef LBANN_ENABLE_SOFTMAX_THRESHOLD + cuda::apply_entrywise_unary_operator(local_output, + local_output); +#endif // LBANN_ENABLE_SOFTMAX_THRESHOLD } } @@ -287,9 +289,9 @@ template <> void softmax_layer::bp_compute() { constexpr DataType zero = 0; constexpr DataType one = 1; - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_output = dynamic_cast(get_local_activations()); + const auto& local_gradient_wrt_output = dynamic_cast(get_local_prev_error_signals()); + auto& local_gradient_wrt_input = dynamic_cast(get_local_error_signals()); if (!local_output.IsEmpty()) { CHECK_CUDNN(cudnnSoftmaxBackward(cudnn::get_handle(), CUDNN_SOFTMAX_ACCURATE, @@ -302,11 +304,6 @@ void softmax_layer::bp_compute() { &zero, m_tensors_cudnn_desc.get_error_signals(), local_gradient_wrt_input.Buffer())); -#ifdef LBANN_ENABLE_SOFTMAX_CUTOFF - cuda::apply_entrywise_binary_operator(local_output, - local_gradient_wrt_input, - local_gradient_wrt_input); -#endif // LBANN_ENABLE_SOFTMAX_CUTOFF } } @@ -314,67 +311,78 @@ template <> void softmax_layer::fp_compute() { // Local matrices - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); - auto& local_workspace = m_workspace->Matrix(); - const auto& local_height = local_input.Height(); - const auto& local_width = local_input.Width(); + const auto& local_input = dynamic_cast(get_local_prev_activations()); + auto& local_output = dynamic_cast(get_local_activations()); + auto& local_workspace = dynamic_cast(m_workspace->Matrix()); + const size_t local_height = local_input.Height(); + const size_t local_width = local_input.Width(); // GPU objects auto&& stream = El::GPUManager::Stream(); auto&& event = El::GPUManager::Event(); El::SyncInfo sync_info{stream, event}; - // Initialize CUDA threads/blocks - // Note: kernels use a 2D thread distribution with a 256 x 1 block - // and nblocksx x local_width grid. 
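  // (Launch-config caveat: when the local matrix is empty,
  // local_height is 0, so (local_height + block_size - 1) / block_size
  // evaluates to 0, which is not a valid CUDA grid dimension. The
  // shared config below therefore clamps grid_dims.x to 1, while the
  // per-kernel configs are guarded by IsEmpty() checks instead.)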
- constexpr El::Int block_size = 256; - dim3 block_dims, grid_dims; - block_dims.x = block_size; - grid_dims.y = local_width; - - // Find column-wise maximum entries - grid_dims.x = (local_height + block_size - 1) / block_size; - if (grid_dims.x < 1) { grid_dims.x = 1; } - cuda::thrust::vector max_vals(grid_dims.x * local_width); - reduce_max_kernel<<>>( - local_height, local_width, - local_input.LockedBuffer(), local_input.LDim(), - max_vals.data().get()); - while (grid_dims.x > 1) { - const El::Int prev_height = grid_dims.x; - grid_dims.x = (prev_height + block_size - 1) / block_size; - cuda::thrust::vector prev_vals(std::move(max_vals)); + // Find max value in each column + cuda::thrust::vector max_vals; + if (local_output.IsEmpty()) { + max_vals.resize(local_width, + -std::numeric_limits::infinity()); + } + else { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_height + block_size - 1) / block_size; + grid_dims.y = local_width; max_vals.resize(grid_dims.x * local_width); reduce_max_kernel<<>>( - prev_height, local_width, - prev_vals.data().get(), prev_height, + local_height, local_width, + local_input.LockedBuffer(), local_input.LDim(), max_vals.data().get()); + while (grid_dims.x > 1) { + const size_t prev_height = grid_dims.x; + grid_dims.x = (prev_height + block_size - 1) / block_size; + cuda::thrust::vector prev_vals(std::move(max_vals)); + max_vals.resize(grid_dims.x * local_width); + reduce_max_kernel<<>>( + prev_height, local_width, + prev_vals.data().get(), prev_height, + max_vals.data().get()); + } } El::mpi::AllReduce(max_vals.data().get(), max_vals.size(), El::mpi::MAX, m_workspace->RedundantComm(), sync_info); - // Exponentiate outputs and compute column sums + // Compute exp(x-max_val) and sum(exp(x-max_val)) El::Zero(*m_workspace); if (!local_output.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; + grid_dims.y = local_width; fp_exp_kernel<<>>( local_height, local_width, local_input.LockedBuffer(), local_input.LDim(), local_output.Buffer(), local_output.LDim(), - max_vals.data().get(), 1, - local_workspace.Buffer(), 1); + max_vals.data().get(), + local_workspace.Buffer()); } El::AllReduce(*m_workspace, m_workspace->RedundantComm()); - // Divide activations by column sums + // Compute output + // Note: y = exp(x-max_val) / sum(exp(x-max_val)) if (!local_output.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; - fp_scale_kernel<<>>( + grid_dims.y = local_width; + fp_output_kernel<<>>( local_height, local_width, local_output.Buffer(), local_output.LDim(), - local_workspace.LockedBuffer(), 1); + local_workspace.LockedBuffer()); } } @@ -383,30 +391,26 @@ template <> void softmax_layer::bp_compute() { // Local matrices - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); - auto& local_workspace = m_workspace->Matrix(); - const auto& local_height = local_output.Height(); - const auto& local_width = local_output.Width(); + const auto& local_output = dynamic_cast(get_local_activations()); + const auto& local_gradient_wrt_output = dynamic_cast(get_local_prev_error_signals()); + auto& local_gradient_wrt_input = 
dynamic_cast(get_local_error_signals()); + auto& local_workspace = dynamic_cast(m_workspace->Matrix()); + const size_t local_height = local_output.Height(); + const size_t local_width = local_output.Width(); // GPU objects auto&& stream = El::GPUManager::Stream(); auto&& event = El::GPUManager::Event(); El::SyncInfo sync_info{stream, event}; - // Initialize CUDA threads/blocks - // Note: kernels use a 2D thread distribution with a 256 x 1 block - // and nblocksx x local_width grid. - constexpr El::Int block_size = 256; - dim3 block_dims, grid_dims; - block_dims.x = block_size; - grid_dims.y = local_width; - - // Compute dot products between output and gradient w.r.t. output + // Compute dot(y,dy) El::Zero(local_workspace); if (!local_output.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; + grid_dims.y = local_width; bp_dot_product_kernel <<>>( local_height, local_width, @@ -414,26 +418,31 @@ void softmax_layer::bp_compute() { local_output.LDim(), local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), - local_workspace.Buffer(), 1); + local_workspace.Buffer()); } El::AllReduce(*m_workspace, m_workspace->RedundantComm()); // Compute gradient w.r.t. input if (!local_output.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; grid_dims.x = (local_height + block_size - 1) / block_size; + grid_dims.y = local_width; bp_kernel<<>>( local_height, local_width, local_output.LockedBuffer(), local_output.LDim(), local_gradient_wrt_output.LockedBuffer(), local_gradient_wrt_output.LDim(), - local_workspace.Buffer(), 1, + local_workspace.Buffer(), local_gradient_wrt_input.Buffer(), local_gradient_wrt_input.LDim()); } } +// Template instantiation template class softmax_layer< data_layout::DATA_PARALLEL, El::Device::GPU>; template class softmax_layer< diff --git a/src/proto/layers.proto b/src/proto/layers.proto index 094e0d3828a..2695903df89 100644 --- a/src/proto/layers.proto +++ b/src/proto/layers.proto @@ -249,11 +249,22 @@ message Layer { double negative_slope = 1; //default: 0.01 } message LogSigmoid {} + + /** @brief Logarithm of softmax function. + * + * @f[ \log \text{softmax}(x)_i = x_i - \log \sum_j e^{x_j} @f] + */ message LogSoftmax {} + message Relu {} message Selu {} message Sigmoid {} + + /** + * @f[ \text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}} @f] + */ message Softmax {} + message Softplus {} message Softsign {} From 7d6a15dec61462e150527db77221d9daf45670bd Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Tue, 22 Oct 2019 15:37:40 -0700 Subject: [PATCH 364/634] Weight dump epoch interval (#1315) * Fix typos and broken links in READMEs * Add epoch_interval to dump weight callback --- applications/ATOM/README.md | 2 +- applications/ATOM/train_atom_char_rnn.py | 5 ++--- applications/README.md | 2 +- include/lbann/callbacks/dump_weights.hpp | 9 ++++++--- src/callbacks/dump_weights.cpp | 5 ++++- src/proto/callbacks.proto | 3 ++- 6 files changed, 16 insertions(+), 10 deletions(-) diff --git a/applications/ATOM/README.md b/applications/ATOM/README.md index 31e78420ef1..1789b609f4a 100644 --- a/applications/ATOM/README.md +++ b/applications/ATOM/README.md @@ -5,7 +5,7 @@ Models for training neural networks to suppor the [ATOM](https://atomscience.org The train_atom_char_rnn.py script implements GRU-based recurrent model for generating new SMILES strings. 
Original neural network model and training hyperparameters are described in [MOSES benchmark](https://github.com/samadejacobs/moses/tree/master/moses/char_rnn). Please see the LBANN documentation on how to install, build, and run LBANN.
 
-###How to train
+### How to train
 ```bash
 run python3 train_atom_char_rnn.py
 ```
diff --git a/applications/ATOM/train_atom_char_rnn.py b/applications/ATOM/train_atom_char_rnn.py
index f5751dab21a..14b1f820fe5 100644
--- a/applications/ATOM/train_atom_char_rnn.py
+++ b/applications/ATOM/train_atom_char_rnn.py
@@ -1,5 +1,4 @@
 import numpy as np
-from math import sqrt
 
 # Data paths
 data_dir = '/p/lustre2/brainusr/datasets/zinc/moses_zinc_train250K.npy'
@@ -114,7 +113,7 @@ def construct_model():
     callbacks = [lbann.CallbackPrint(),
                  lbann.CallbackTimer(),
                  lbann.CallbackStepLearningRate(step=10, amt=0.5),
-                 lbann.CallbackDumpWeights(basename="weights")]
+                 lbann.CallbackDumpWeights(basename="weights", epoch_interval=50)]
 
     # Construct model
     mini_batch_size = 64
@@ -167,7 +166,7 @@ def construct_data_reader():
         trainer, model, data_reader, opt,
         account='hpcdl', scheduler='slurm',
-        time_limit=1440,
+        time_limit=720,
         nodes=1,
         job_name='atom_char_rnn_250K')
     print(status)
diff --git a/applications/README.md b/applications/README.md
index 602eec0cd3f..3deb0ff21d1 100644
--- a/applications/README.md
+++ b/applications/README.md
@@ -7,6 +7,6 @@ experiments directory, as well as utility / helper code to pre- or
 post-process data.
 
 These are some of the applications that leverage LBANN:
-- [Atom](atom/README.md): Accelerating Therapeutics for Opportunities
+- [Atom](ATOM/README.md): Accelerating Therapeutics for Opportunities
   in Medicine (ATOM) - Networks for predicting molecular compounds
   that are optimized for multiple objectives
diff --git a/include/lbann/callbacks/dump_weights.hpp b/include/lbann/callbacks/dump_weights.hpp
index 603c07ca7b2..95f59e0b23c 100644
--- a/include/lbann/callbacks/dump_weights.hpp
+++ b/include/lbann/callbacks/dump_weights.hpp
@@ -38,7 +38,7 @@ namespace callback {
 /**
  * Dump weight matrices to files.
- * This will dump each hidden layer's weight/bias matrix after each epoch.
+ * This will dump each hidden layer's weight/bias matrix after a specified epoch interval.
  * The matrices are written to files using Elemental's simple ASCII format. This
  * is not meant for checkpointing, but for exporting weight matrices for
  * analysis that isn't easily done in LBANN.
@@ -48,8 +48,9 @@ class dump_weights : public callback_base {
   /**
    * @param basename The basename for writing files.
    */
-  dump_weights(std::string basename) :
-    callback_base(), m_basename(std::move(basename)) {}
+  dump_weights(std::string basename, El::Int epoch_interval=1) :
+    callback_base(), m_basename(std::move(basename)),
+    m_epoch_interval(std::max(El::Int(1),epoch_interval)) {}
   dump_weights(const dump_weights&) = default;
   dump_weights& operator=(const dump_weights&) = default;
@@ -64,6 +65,8 @@ class dump_weights : public callback_base {
  private:
   /** Basename for writing files. */
   std::string m_basename;
+  /** Interval at which to dump weights */
+  El::Int m_epoch_interval;
   /// Dump weights from learning layers.
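  /// A dump occurs only at epochs where epoch % m_epoch_interval == 0
  /// (the constructor clamps the interval to at least 1).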
void do_dump_weights(model *m, std::string s = ""); }; diff --git a/src/callbacks/dump_weights.cpp b/src/callbacks/dump_weights.cpp index f1f20dd1468..91bbe4fb76b 100644 --- a/src/callbacks/dump_weights.cpp +++ b/src/callbacks/dump_weights.cpp @@ -46,6 +46,9 @@ void dump_weights::on_epoch_end(model *m) { void dump_weights::do_dump_weights(model *m, std::string s) { const auto& c = static_cast(m->get_execution_context()); + + if(c.get_epoch() % m_epoch_interval != 0) return; + makedir(m_basename.c_str()); for (weights *w : m->get_weights()) { std::string epoch = "-epoch" + std::to_string(c.get_epoch()-1); @@ -67,7 +70,7 @@ build_dump_weights_callback_from_pbuf( const google::protobuf::Message& proto_msg, const std::shared_ptr&) { const auto& params = dynamic_cast(proto_msg); - return make_unique(params.basename()); + return make_unique(params.basename(), params.epoch_interval()); } } // namespace callback diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index f204be30b79..a0419f5f92b 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -128,6 +128,7 @@ message Callback { message CallbackDumpWeights { string basename = 1; + int64 epoch_interval = 2; } message CallbackDumpOutputs { @@ -325,4 +326,4 @@ message Callback { message CallbackPrintModelDescription { } -} \ No newline at end of file +} From 9136d009076dd617b52c93b6e04c2fb340a07885 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 23 Oct 2019 14:54:43 -0700 Subject: [PATCH 365/634] Refactor Bamboo helper function for Python frontend (#1321) Functionality is no longer specific to unit_tests dir. --- bamboo/clean.sh | 1 + bamboo/common_python/tools.py | 142 ++++++++++-------- .../integration_tests/experiments/.gitignore | 2 + .../unit_tests/test_unit_datareader_python.py | 4 +- bamboo/unit_tests/test_unit_layer_argmax.py | 4 +- .../test_unit_layer_channelwise_scale_bias.py | 4 +- .../unit_tests/test_unit_layer_embedding.py | 4 +- ...nit_layer_entrywise_batch_normalization.py | 4 +- .../test_unit_layer_entrywise_scale_bias.py | 4 +- .../unit_tests/test_unit_layer_log_softmax.py | 4 +- bamboo/unit_tests/test_unit_layer_one_hot.py | 4 +- bamboo/unit_tests/test_unit_layer_slice.py | 4 +- bamboo/unit_tests/test_unit_layer_softmax.py | 4 +- 13 files changed, 92 insertions(+), 93 deletions(-) create mode 100644 bamboo/integration_tests/experiments/.gitignore diff --git a/bamboo/clean.sh b/bamboo/clean.sh index ac408cff67e..8ac26cf18f4 100755 --- a/bamboo/clean.sh +++ b/bamboo/clean.sh @@ -18,6 +18,7 @@ rm -rf ${LBANN_DIR}/bamboo/integration_tests/__pycache__ rm -f ${LBANN_DIR}/bamboo/integration_tests/*.tfevents.* rm -f ${LBANN_DIR}/bamboo/integration_tests/error/* rm -f ${LBANN_DIR}/bamboo/integration_tests/output/* +rm -rf ${LBANN_DIR}/bamboo/integration_tests/experiments/* # Unit Tests rm -rf ${LBANN_DIR}/bamboo/unit_tests/ckpt* diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 11bf9722bf5..699d78c0a61 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -662,21 +662,25 @@ def assert_failure(return_code, expected_error, error_file_name): efn=error_file_name)) -def create_tests(setup_func, test_name): +def create_tests(setup_func, + test_file, + test_name_base=None, + nodes=1, + procs_per_node=None): """Create functions that can interact with PyTest. - This function creates tests that involve setting up and running an - LBANN experiment with the Python frontend. 
    `setup_func` should be
-    a function that takes in the LBANN Python module and outputs
-    objects for an LBANN experiment. A test succeeds if LBANN runs and
-    exits with an exit code of 0, and fails otherwise.
+    This function creates tests that involve running an LBANN
+    experiment with the Python frontend. `setup_func` should be a
+    function that takes in the LBANN Python module and outputs objects
+    for an LBANN experiment. A test succeeds if LBANN runs and exits
+    with an exit code of 0, and fails otherwise.
 
     PyTest detects tests by loading in a Python script and looking
     for functions prefixed with 'test_'. After you call this function
     within a script to generate test functions, make sure to add the
     test functions to the script's scope. For example:
 
-        _test_funcs = tools.create_tests(setup_func, test_name)
+        _test_funcs = tools.create_tests(setup_func, __file__)
         for t in _test_funcs:
             globals()[t.__name__] = t
 
@@ -685,42 +689,53 @@
             Python frontend. It takes in the LBANN Python module as
             input and returns a `(lbann.Trainer, lbann.Model,
             lbann.reader_pb2.DataReader, lbann.Optimizer)`.
-        test_name (str): Descriptive name. Should be prefixed with
-            'test_'.
+        test_file (str): Python script being run by PyTest. In most
+            cases, use `__file__`.
+        test_name_base (str, optional): Descriptive name (default: test
+            file name with '.py' removed).
+        nodes (int, optional): Number of compute nodes (default: 1).
+        procs_per_node (int, optional): Number of parallel processes
+            per compute node (default: system-specific default,
+            usually number of GPUs per node).
 
     Returns:
         Iterable of function: Tests that can interact with PyTest.
+            Each function returns a dict containing log files and
+            other output data.
 
     """
 
+    # Make sure test name is valid
+    test_file = os.path.realpath(test_file)
+    if not test_name_base:
+        # Create test name by removing '.py' from file name
+        test_name_base = os.path.splitext(os.path.basename(test_file))[0]
+    if not re.match('^test_.', test_name_base):
+        # Make sure test name is prefixed with 'test_'
+        test_name_base = 'test_' + test_name_base
+
     # Basic test function
     def test_func(cluster, executables, dir_name, compiler_name):
-        process_executable(test_name, compiler_name, executables)
-
-        # Choose LBANN build and load Python frontend
-        if compiler_name == 'exe':
-            exe = executables[compiler_name]
-            bin_dir = os.path.dirname(exe)
-            install_dir = os.path.dirname(bin_dir)
-            build_path = '{i}/lib/python3.7/site-packages'.format(i=install_dir)
-        else:
-            if compiler_name == 'clang6':
-                path = 'clang.Release'
-            elif compiler_name == 'clang6_debug':
-                path = 'clang.Debug'
-            elif compiler_name == 'gcc7':
-                path = 'gnu.Release'
-            elif compiler_name == 'clang6_debug':
-                path = 'gnu.Debug'
-            elif compiler_name == 'intel19':
-                path = 'intel.Release'
-            elif compiler_name == 'intel19_debug':
-                path = 'intel.Debug'
-            path = '{p}.{c}.llnl.gov'.format(p=path, c=cluster)
-            build_path = '{d}/build/{p}/install/lib/python3.7/site-packages'.format(
-                d=dir_name, p=path)
-        print('build_path={b}'.format(b=build_path))
-        sys.path.append(build_path)
+        process_executable(test_name_base, compiler_name, executables)
+        test_name = '{}_{}'.format(test_name_base, compiler_name)
+
+        # Load LBANN Python frontend
+        build_names = {
+            'clang6': 'clang.Release.{}.llnl.gov'.format(cluster),
+            'clang6_debug': 'clang.Debug.{}.llnl.gov'.format(cluster),
+            'gcc7': 'gnu.Release.{}.llnl.gov'.format(cluster),
+            'gcc7_debug': 'gnu.Debug.{}.llnl.gov'.format(cluster),
+            'intel19':
'intel.Release.{}.llnl.gov'.format(cluster), + 'intel19_debug': 'intel.Debug.{}.llnl.gov'.format(cluster), + } + python_frontend_path = os.path.join(dir_name, + 'build', + build_names[compiler_name], + 'install', + 'lib', + 'python3.7', + 'site-packages') + sys.path.append(python_frontend_path) import lbann import lbann.contrib.lc.launcher @@ -728,14 +743,14 @@ def test_func(cluster, executables, dir_name, compiler_name): trainer, model, data_reader, optimizer = setup_func(lbann) # Run LBANN experiment - kwargs = { - 'nodes': 1, - 'overwrite_script': True - } - experiment_dir = '{d}/bamboo/unit_tests/experiments/{t}_{c}'.format( - d=dir_name, t=test_name, c=compiler_name) - error_file_name = '{e}/err.log'.format( - e=experiment_dir, c=compiler_name) + experiment_dir = os.path.join(os.path.dirname(test_file), + 'experiments', + test_name) + stdout_log_file = os.path.join(experiment_dir, 'out.log') + stderr_log_file = os.path.join(experiment_dir, 'err.log') + kwargs = {} + if procs_per_node: + kwargs['procs_per_node'] = procs_per_node return_code = lbann.contrib.lc.launcher.run( trainer=trainer, model=model, @@ -743,32 +758,33 @@ def test_func(cluster, executables, dir_name, compiler_name): optimizer=optimizer, experiment_dir=experiment_dir, job_name='lbann_{}'.format(test_name), + nodes=nodes, + overwrite_script=True, **kwargs) - assert_success(return_code, error_file_name) + assert_success(return_code, stderr_log_file) + return { + 'return_code': return_code, + 'experiment_dir': experiment_dir, + 'stdout_log_file': stdout_log_file, + 'stderr_log_file': stderr_log_file, + } # Specific test functions for different build configurations - def test_func_exe(cluster, dirname, exe): - if exe is None: - e = 'test_{}_exe: Non-local testing'.format(test_name) - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - test_func(cluster, exes, dirname, 'exe') def test_func_clang6(cluster, exes, dirname): - test_func(cluster, exes, dirname, 'clang6') + return test_func(cluster, exes, dirname, 'clang6') def test_func_gcc7(cluster, exes, dirname): - test_func(cluster, exes, dirname, 'gcc7') + return test_func(cluster, exes, dirname, 'gcc7') def test_func_intel19(cluster, exes, dirname): - test_func(cluster, exes, dirname, 'intel19') - test_func_exe.__name__ = '{}_exe'.format(test_name) - test_func_clang6.__name__ = '{}_clang6'.format(test_name) - test_func_gcc7.__name__ = '{}_gcc7'.format(test_name) - test_func_intel19.__name__ = '{}_intel19'.format(test_name) - - return (test_func_exe, - test_func_clang6, - test_func_gcc7, - test_func_intel19) + return test_func(cluster, exes, dirname, 'intel19') + test_func_clang6.__name__ = '{}_clang6'.format(test_name_base) + test_func_gcc7.__name__ = '{}_gcc7'.format(test_name_base) + test_func_intel19.__name__ = '{}_intel19'.format(test_name_base) + + return ( + test_func_gcc7, + test_func_clang6, + test_func_intel19, + ) def create_python_data_reader(lbann, diff --git a/bamboo/integration_tests/experiments/.gitignore b/bamboo/integration_tests/experiments/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/bamboo/integration_tests/experiments/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/bamboo/unit_tests/test_unit_datareader_python.py b/bamboo/unit_tests/test_unit_datareader_python.py index 1d1d3827c4f..fcf05875b7c 100644 --- a/bamboo/unit_tests/test_unit_datareader_python.py +++ b/bamboo/unit_tests/test_unit_datareader_python.py @@ -127,7 +127,5 @@ def construct_data_reader(lbann): # 
============================================== # Create test functions that can interact with PyTest -# Note: Create test name by removing ".py" from file name -_test_name = os.path.splitext(os.path.basename(current_file))[0] -for test in tools.create_tests(setup_experiment, _test_name): +for test in tools.create_tests(setup_experiment, __file__): globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_argmax.py b/bamboo/unit_tests/test_unit_layer_argmax.py index ff6fc7bee1e..86d496faefc 100644 --- a/bamboo/unit_tests/test_unit_layer_argmax.py +++ b/bamboo/unit_tests/test_unit_layer_argmax.py @@ -148,7 +148,5 @@ def construct_data_reader(lbann): # ============================================== # Create test functions that can interact with PyTest -# Note: Create test name by removing ".py" from file name -_test_name = os.path.splitext(os.path.basename(current_file))[0] -for test in tools.create_tests(setup_experiment, _test_name): +for test in tools.create_tests(setup_experiment, __file__): globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py index f952c138c14..a9b72dda5e0 100644 --- a/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py +++ b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py @@ -167,7 +167,5 @@ def construct_data_reader(lbann): # ============================================== # Create test functions that can interact with PyTest -# Note: Create test name by removing ".py" from file name -_test_name = os.path.splitext(os.path.basename(current_file))[0] -for test in tools.create_tests(setup_experiment, _test_name): +for test in tools.create_tests(setup_experiment, __file__): globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_embedding.py b/bamboo/unit_tests/test_unit_layer_embedding.py index 2e30677577d..4677ea76b67 100644 --- a/bamboo/unit_tests/test_unit_layer_embedding.py +++ b/bamboo/unit_tests/test_unit_layer_embedding.py @@ -224,7 +224,5 @@ def construct_data_reader(lbann): # ============================================== # Create test functions that can interact with PyTest -# Note: Create test name by removing ".py" from file name -_test_name = os.path.splitext(os.path.basename(current_file))[0] -for test in tools.create_tests(setup_experiment, _test_name): +for test in tools.create_tests(setup_experiment, __file__): globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py index fda72f7e5e1..2e9997a81ae 100644 --- a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py +++ b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py @@ -174,7 +174,5 @@ def construct_data_reader(lbann): # ============================================== # Create test functions that can interact with PyTest -# Note: Create test name by removing ".py" from file name -_test_name = os.path.splitext(os.path.basename(current_file))[0] -for test in tools.create_tests(setup_experiment, _test_name): +for test in tools.create_tests(setup_experiment, __file__): globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py index 23178565fe2..e3fd0382af8 100644 --- a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py +++ 
b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py @@ -214,7 +214,5 @@ def construct_data_reader(lbann): # ============================================== # Create test functions that can interact with PyTest -# Note: Create test name by removing ".py" from file name -_test_name = os.path.splitext(os.path.basename(current_file))[0] -for test in tools.create_tests(setup_experiment, _test_name): +for test in tools.create_tests(setup_experiment, __file__): globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 7eec61dfdcf..8250b199895 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -204,7 +204,5 @@ def construct_data_reader(lbann): # ============================================== # Create test functions that can interact with PyTest -# Note: Create test name by removing ".py" from file name -_test_name = os.path.splitext(os.path.basename(current_file))[0] -for test in tools.create_tests(setup_experiment, _test_name): +for test in tools.create_tests(setup_experiment, __file__): globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_one_hot.py b/bamboo/unit_tests/test_unit_layer_one_hot.py index 6b4c50ccce7..a435dfc8e22 100644 --- a/bamboo/unit_tests/test_unit_layer_one_hot.py +++ b/bamboo/unit_tests/test_unit_layer_one_hot.py @@ -136,7 +136,5 @@ def construct_data_reader(lbann): # ============================================== # Create test functions that can interact with PyTest -# Note: Create test name by removing ".py" from file name -_test_name = os.path.splitext(os.path.basename(current_file))[0] -for test in tools.create_tests(setup_experiment, _test_name): +for test in tools.create_tests(setup_experiment, __file__): globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_slice.py b/bamboo/unit_tests/test_unit_layer_slice.py index 52da1b097e2..54e2fc95a4c 100644 --- a/bamboo/unit_tests/test_unit_layer_slice.py +++ b/bamboo/unit_tests/test_unit_layer_slice.py @@ -246,7 +246,5 @@ def construct_data_reader(lbann): # ============================================== # Create test functions that can interact with PyTest -# Note: Create test name by removing ".py" from file name -_test_name = os.path.splitext(os.path.basename(current_file))[0] -for test in tools.create_tests(setup_experiment, _test_name): +for test in tools.create_tests(setup_experiment, __file__): globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index 44636f91d87..0d494f42e62 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -209,7 +209,5 @@ def construct_data_reader(lbann): # ============================================== # Create test functions that can interact with PyTest -# Note: Create test name by removing ".py" from file name -_test_name = os.path.splitext(os.path.basename(current_file))[0] -for test in tools.create_tests(setup_experiment, _test_name): +for test in tools.create_tests(setup_experiment, __file__): globals()[test.__name__] = test From ceb9a13cc0c5fe662914811abe5eb5cb347c82a0 Mon Sep 17 00:00:00 2001 From: Jae-Seung Yeom Date: Wed, 23 Oct 2019 15:57:37 -0700 Subject: [PATCH 366/634] Fix saving of the shared RNG states for distributed checkpointing --- include/lbann/utils/random.hpp | 3 ++- src/models/model.cpp | 4 ++-- src/utils/random.cpp | 
14 +++++++++++--- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/lbann/utils/random.hpp b/include/lbann/utils/random.hpp index 4c4aa2a8be6..0e7795f3b29 100644 --- a/include/lbann/utils/random.hpp +++ b/include/lbann/utils/random.hpp @@ -247,7 +247,8 @@ void bernoulli_fill_procdet(AbsDistMat& mat, El::Int m, El::Int n, double p = 0. void uniform_fill_procdet(AbsDistMat& mat, El::Int m, El::Int n, DataType center = 0.0f, DataType radius = 1.0f); -bool save_rng_to_checkpoint(persist& p, lbann_comm* comm); +bool save_rng_to_checkpoint_shared(persist& p, lbann_comm* comm); +bool save_rng_to_checkpoint_distributed(persist& p, lbann_comm* comm); bool load_rng_from_checkpoint(persist& p, const lbann_comm* comm); template diff --git a/src/models/model.cpp b/src/models/model.cpp index 1735f35db54..cecc496fc3c 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -1279,7 +1279,7 @@ bool model::save_to_checkpoint_shared(persist& p) { LBANN_ERROR("Unable to save layer[",i,"]=", get_layer(i).get_name()); } } - save_rng_to_checkpoint(p, m_comm); + save_rng_to_checkpoint_shared(p, m_comm); for (const auto& m : m_metrics) { m->save_to_checkpoint_shared(p); } @@ -1340,7 +1340,7 @@ bool model::save_to_checkpoint_distributed(persist& p){ LBANN_ERROR("Unable to save layer[",i,"]=", get_layer(i).get_name()); } } - save_rng_to_checkpoint(p, m_comm); + save_rng_to_checkpoint_distributed(p, m_comm); for (const auto& m : m_metrics) { m->save_to_checkpoint_distributed(p); } diff --git a/src/utils/random.cpp b/src/utils/random.cpp index 22276176e5d..12b41474318 100644 --- a/src/utils/random.cpp +++ b/src/utils/random.cpp @@ -99,7 +99,7 @@ fast_rng_gen& get_fast_io_generator() { return ::fast_io_generator; } -bool save_rng_to_checkpoint(persist& p, lbann_comm* comm) { +bool save_rng_to_checkpoint(persist& p, lbann_comm* comm, bool is_distributed) { std::string dirname = std::string(p.m_checkpoint_dir) + "/rng_state"; std::string rank_in_trainer; std::string rng_name; @@ -109,13 +109,13 @@ bool save_rng_to_checkpoint(persist& p, lbann_comm* comm) { makedir(dirname.c_str()); } else { rank_in_trainer = std::to_string(comm->get_rank_in_trainer()); - if (comm->am_trainer_master()) { + if (comm->am_trainer_master() || is_distributed) { makedir(dirname.c_str()); } comm->trainer_barrier(); } - if (comm == nullptr || comm->am_trainer_master()) { + if (comm == nullptr || comm->am_trainer_master() || is_distributed) { /// @todo - Note that the RNG with thread local data is not correct rng_name = dirname + "/rng_seq_generator"; std::ofstream rng_seq(rng_name); @@ -180,6 +180,14 @@ bool save_rng_to_checkpoint(persist& p, lbann_comm* comm) { return true; } +bool save_rng_to_checkpoint_shared(persist& p, lbann_comm* comm) { + return save_rng_to_checkpoint(p, comm, false); +} + +bool save_rng_to_checkpoint_distributed(persist& p, lbann_comm* comm) { + return save_rng_to_checkpoint(p, comm, true); +} + bool load_rng_from_checkpoint(persist& p, const lbann_comm* comm) { std::string dirname = std::string(p.m_checkpoint_dir) + "/rng_state"; From 1ca1c4e66667b9bf5afe0a33048184cd879fade1 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 25 Oct 2019 10:24:31 -0700 Subject: [PATCH 367/634] Bamboo integration tests using Python frontend (attempt 2) (#1322) * Add Bamboo integration tests for LeNet, AlexNet, and ResNet-50 Remove the existing integration tests * Enable weekly Bamboo testing on Ray * Debug Bamboo shell scripts * Remove unused directories * Fix the path for the parallel file system on Ray * 
Update metrics and runtimes for Bamboo integration tests --- bamboo/allocate_and_run.sh | 15 +- bamboo/clean.sh | 2 - bamboo/full_alexnet_clang6/README.md | 1 - bamboo/full_alexnet_gcc7/README.md | 1 - bamboo/full_alexnet_intel19/README.md | 1 - bamboo/integration_tests/common_code.py | 269 ----------------- bamboo/integration_tests/conftest.py | 28 -- bamboo/integration_tests/error/.gitignore | 2 - ...toencoder_imagenet_objective_functions.csv | 21 -- ..._autoencoder_mnist_objective_functions.csv | 6 - .../catalyst/clang6/expected_performance.csv | 5 - ...toencoder_imagenet_objective_functions.csv | 21 -- ..._autoencoder_mnist_objective_functions.csv | 6 - .../catalyst/gcc7/expected_performance.csv | 5 - ...toencoder_imagenet_objective_functions.csv | 21 -- ..._autoencoder_mnist_objective_functions.csv | 6 - .../corona/gcc7/expected_performance.csv | 5 - ...toencoder_imagenet_objective_functions.csv | 21 -- ..._autoencoder_mnist_objective_functions.csv | 6 - .../lassen/gcc7/expected_performance.csv | 5 - ...toencoder_imagenet_objective_functions.csv | 21 -- ..._autoencoder_mnist_objective_functions.csv | 6 - .../pascal/gcc7/expected_performance.csv | 5 - bamboo/integration_tests/full_alexnet.sh | 59 ---- bamboo/integration_tests/output/.gitignore | 2 - .../test_integration_alexnet.py | 223 ++++++++++++++ .../test_integration_autoencoders.py | 102 ------- .../test_integration_debug.py | 141 --------- .../test_integration_lenet.py | 204 +++++++++++++ .../test_integration_performance.py | 278 ------------------ .../test_integration_resnet50.py | 223 ++++++++++++++ bamboo/run.sh | 6 +- python/lbann/contrib/lc/paths.py | 2 + 33 files changed, 659 insertions(+), 1060 deletions(-) delete mode 100644 bamboo/full_alexnet_clang6/README.md delete mode 100644 bamboo/full_alexnet_gcc7/README.md delete mode 100644 bamboo/full_alexnet_intel19/README.md delete mode 100644 bamboo/integration_tests/common_code.py delete mode 100644 bamboo/integration_tests/error/.gitignore delete mode 100644 bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_imagenet_objective_functions.csv delete mode 100644 bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_mnist_objective_functions.csv delete mode 100644 bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv delete mode 100644 bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv delete mode 100644 bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv delete mode 100644 bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv delete mode 100644 bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv delete mode 100644 bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv delete mode 100644 bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv delete mode 100644 bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv delete mode 100644 bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv delete mode 100644 bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv delete mode 100644 
bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv delete mode 100644 bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv delete mode 100644 bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv delete mode 100755 bamboo/integration_tests/full_alexnet.sh delete mode 100644 bamboo/integration_tests/output/.gitignore create mode 100644 bamboo/integration_tests/test_integration_alexnet.py delete mode 100644 bamboo/integration_tests/test_integration_autoencoders.py delete mode 100644 bamboo/integration_tests/test_integration_debug.py create mode 100644 bamboo/integration_tests/test_integration_lenet.py delete mode 100644 bamboo/integration_tests/test_integration_performance.py create mode 100644 bamboo/integration_tests/test_integration_resnet50.py diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index 26f74f45c19..f46baa69687 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -35,28 +35,21 @@ fi if [ "${CLUSTER}" = 'lassen' ]; then ALLOCATION_TIME_LIMIT=600 if [ ${WEEKLY} -ne 0 ]; then - timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 16 -W ${ALLOCATION_TIME_LIMIT} ./run.sh --weekly + timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 4 -W ${ALLOCATION_TIME_LIMIT} ./run.sh --weekly else timeout -k 5 24h bsub -G guests -Is -q pbatch -nnodes 2 -W ${ALLOCATION_TIME_LIMIT} ./run.sh fi elif [ "${CLUSTER}" = 'ray' ]; then + ALLOCATION_TIME_LIMIT=240 if [ ${WEEKLY} -ne 0 ]; then - echo "No ray testing in weekly." + timeout -k 5 24h bsub -Is -q pbatch -nnodes 4 -W ${ALLOCATION_TIME_LIMIT} ./run.sh --weekly else - ALLOCATION_TIME_LIMIT=240 timeout -k 5 24h bsub -Is -q pbatch -nnodes 2 -W ${ALLOCATION_TIME_LIMIT} ./run.sh fi elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then ALLOCATION_TIME_LIMIT=960 if [ ${WEEKLY} -ne 0 ]; then - timeout -k 5 24h salloc -N16 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT} ./run.sh --weekly - if [ "${CLUSTER}" = 'catalyst' ]; then - cd integration_tests - python -m pytest -s test_integration_performance.py -k test_integration_performance_full_alexnet_clang6 --weekly --run --junitxml=../full_alexnet_clang6/results.xml - python -m pytest -s test_integration_performance.py -k test_integration_performance_full_alexnet_gcc7 --weekly --run --junitxml=../full_alexnet_gcc7/results.xml - # python -m pytest -s test_integration_performance.py -k test_integration_performance_full_alexnet_intel19 --weekly --run --junitxml=../full_alexnet_intel19/results.xml - cd .. 
- fi + timeout -k 5 24h salloc -N4 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT} ./run.sh --weekly else ALLOCATION_TIME_LIMIT=90 # Start with 1.5 hrs; may adjust for CPU clusters if [[ $(mjstat -c | awk 'match($1, "pbatch") && NF < 7 { print $5 }') -ne "0" ]]; diff --git a/bamboo/clean.sh b/bamboo/clean.sh index 8ac26cf18f4..03b7826cf2a 100755 --- a/bamboo/clean.sh +++ b/bamboo/clean.sh @@ -16,8 +16,6 @@ rm -f ${LBANN_DIR}/bamboo/integration_tests/*.prototext* rm -f ${LBANN_DIR}/bamboo/integration_tests/*.pyc rm -rf ${LBANN_DIR}/bamboo/integration_tests/__pycache__ rm -f ${LBANN_DIR}/bamboo/integration_tests/*.tfevents.* -rm -f ${LBANN_DIR}/bamboo/integration_tests/error/* -rm -f ${LBANN_DIR}/bamboo/integration_tests/output/* rm -rf ${LBANN_DIR}/bamboo/integration_tests/experiments/* # Unit Tests diff --git a/bamboo/full_alexnet_clang6/README.md b/bamboo/full_alexnet_clang6/README.md deleted file mode 100644 index 6672d2ab7b0..00000000000 --- a/bamboo/full_alexnet_clang6/README.md +++ /dev/null @@ -1 +0,0 @@ -Directory for results.xml for full_alexnet_clang6. \ No newline at end of file diff --git a/bamboo/full_alexnet_gcc7/README.md b/bamboo/full_alexnet_gcc7/README.md deleted file mode 100644 index a518e84799e..00000000000 --- a/bamboo/full_alexnet_gcc7/README.md +++ /dev/null @@ -1 +0,0 @@ -Directory for results.xml for full_alexnet_gcc7. \ No newline at end of file diff --git a/bamboo/full_alexnet_intel19/README.md b/bamboo/full_alexnet_intel19/README.md deleted file mode 100644 index 0fe9ebc203b..00000000000 --- a/bamboo/full_alexnet_intel19/README.md +++ /dev/null @@ -1 +0,0 @@ -Directory for results.xml for full_alexnet_intel19. \ No newline at end of file diff --git a/bamboo/integration_tests/common_code.py b/bamboo/integration_tests/common_code.py deleted file mode 100644 index 98107f545a7..00000000000 --- a/bamboo/integration_tests/common_code.py +++ /dev/null @@ -1,269 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import collections, csv, os, pprint, re, time - - -# Set up the command ########################################################## -def get_command(cluster, dir_name, model_folder, model_name, executable, - output_file_name, error_file_name, compiler_name, weekly=False, - data_reader_percent=None): - if model_name in ['alexnet', 'conv_autoencoder_imagenet']: - if weekly: - time_limit = 360 - else: - time_limit = 60 - if cluster == 'lassen': - command = tools.get_command( - cluster=cluster, executable=executable, - # Allocation/Run Parameters - num_nodes=16, num_processes=32, partition='pbatch', - time_limit=time_limit, - # LBANN Parameters - dir_name=dir_name, - data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/', - data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt', - data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/', - data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt', - data_reader_name='imagenet_lassen', - data_reader_percent=data_reader_percent, - model_folder=model_folder, model_name=model_name, num_epochs=20, - optimizer_name='adagrad', - # Error/Output Redirect - error_file_name=error_file_name, - output_file_name=output_file_name, - # Misc. 
Parameters - weekly=weekly) - else: - command = tools.get_command( - cluster=cluster, executable=executable, - # Allocation/Run Parameters - num_nodes=16, num_processes=32, partition='pbatch', - time_limit=time_limit, - # LBANN Parameters - dir_name=dir_name, - data_filedir_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/train/', - data_filename_train_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/train.txt', - data_filedir_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/original/val/', - data_filename_test_default='/p/lscratchh/brainusr/datasets/ILSVRC2012/labels/val.txt', - data_reader_name='imagenet', - data_reader_percent=data_reader_percent, - model_folder=model_folder, model_name=model_name, num_epochs=20, - optimizer_name='adagrad', - # Error/Output Redirect - error_file_name=error_file_name, - output_file_name=output_file_name, - # Misc. Parameters - weekly=weekly) - elif model_name in ['conv_autoencoder_mnist', 'lenet_mnist']: - if (model_name == 'lenet_mnist') and \ - (compiler_name in ['clang6', 'intel19']): - partition = 'pbatch' - time_limit = 600 - else: - partition = 'pdebug' - time_limit = 60 - if (cluster == 'ray') and (model_name == 'conv_autoencoder_mnist'): - num_processes = 20 - else: - num_processes = 2 - command = tools.get_command( - cluster=cluster, executable=executable, - # Allocation/Run Parameters - num_nodes=1, num_processes=num_processes, partition=partition, - time_limit=time_limit, - # LBANN Parameters - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', data_reader_percent=data_reader_percent, - model_folder=model_folder, model_name=model_name, num_epochs=5, - optimizer_name='adagrad', - # Error/Output Redirect - error_file_name=error_file_name, - output_file_name=output_file_name, - # Misc. Parameters - weekly=weekly) - else: - raise Exception('Invalid model: %s' % model_name) - return command - -# Run LBANN ################################################################### - - -def run_lbann(command, model_name, output_file_name, error_file_name, - should_log=False): - print('About to run: %s' % command) - print('%s began waiting in the queue at ' % model_name + - time.strftime('%H:%M:%S', time.localtime())) - return_code = os.system(command) - print('%s finished at ' % model_name + - time.strftime('%H:%M:%S', time.localtime())) - lbann_exceptions = [] - timed_out = False - if should_log or (return_code != 0): - output_file = open(output_file_name, 'r') - for line in output_file: - is_match = re.search( - 'This lbann_exception is about to be thrown:(.*)', line) - if is_match: - lbann_exceptions.append(is_match.group(1)) - is_match = re.search('CANCELLED AT (.*) DUE TO TIME LIMIT', line) - if is_match: - timed_out = True - error_file = open(error_file_name, 'r') - for line in error_file: - is_match = re.search('LBANN error on (.*)', line) - if is_match: - lbann_exceptions.append(is_match.group(1)) - if return_code != 0: - error_string = ('Model %s crashed with return_code=%d, timed_out=%s,' - ' and lbann exceptions=%s. 
Command was: %s') % ( - model_name, return_code, str(timed_out), - str(collections.Counter(lbann_exceptions)), command) - print(error_string) - tools.assert_success(return_code, error_file_name) - -# Extract data from output #################################################### - - -def populate_data_dict_epoch(regex, line, data_field, data_fields, data_dict, - model_id): - is_match = re.search(regex, line) - if is_match and (data_field in data_fields): - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - epoch_id = is_match.group(1) - value = float(is_match.group(2)) - data_dict[data_field][model_id][epoch_id] = value - - -def populate_data_dict_overall(regex, line, data_field, data_fields, data_dict, - model_id): - is_match = re.search(regex, line) - if is_match and (data_field in data_fields): - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - value = float(is_match.group(1)) - data_dict[data_field][model_id]['overall'] = value - - -# data_dict[data_field][model_id][epoch_id] = float -# data_fields is the list or set of data we're interested in. -def extract_data(output_file_name, data_fields, should_log): - output_file = open(output_file_name, 'r') - data_dict = {} - for data_field in data_fields: - data_dict[data_field] = {} - - for line in output_file: - if should_log: - print('extract_data: %s: %s' % (output_file_name, line)) - - # Check if line is reporting model results - is_model = re.search('^Model ([0-9]+)', line) - if not is_model: - is_model = re.search('^model([0-9]+)', line) - if is_model: - print('extract_data: is_model={is_model}'.format(is_model=is_model)) - model_id = is_model.group(1) - - regex = 'training epoch ([0-9]+) objective function : ([0-9.]+)' - data_field = 'training_objective_function' - populate_data_dict_epoch(regex, line, data_field, data_fields, - data_dict, model_id) - - regex = 'training epoch ([0-9]+) run time : ([0-9.]+)' - data_field = 'training_run_time' - populate_data_dict_epoch(regex, line, data_field, data_fields, - data_dict, model_id) - - regex = 'training epoch ([0-9]+) mini-batch time statistics : ([0-9.]+)s mean, ([0-9.]+)s max, ([0-9.]+)s min, ([0-9.]+)s stdev' - is_match = re.search(regex, line) - if is_match: - print('extract_data: is_mini-batch time statistics={is_match}'.format( - is_match=is_match)) - epoch_id = is_match.group(1) - mean_value = float(is_match.group(2)) - max_value = float(is_match.group(3)) - min_value = float(is_match.group(4)) - stdev_value = float(is_match.group(5)) - data_field = 'training_mean' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: mean_value={mv}'.format(mv=mean_value)) - data_dict[data_field][model_id][epoch_id] = mean_value - data_field = 'training_max' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: max_value={mv}'.format(mv=max_value)) - data_dict[data_field][model_id][epoch_id] = max_value - data_field = 'training_min' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: min_value={mv}'.format(mv=min_value)) - data_dict[data_field][model_id][epoch_id] = min_value - data_field = 'training_stdev' - if data_field in data_fields: - if model_id not in data_dict[data_field].keys(): - data_dict[data_field][model_id] = {} - print('extract_data: 
stdev={sv}'.format(sv=stdev_value)) - data_dict[data_field][model_id][epoch_id] = stdev_value - - # This will re-populate the value for 'test_accuracy' - # on each epoch, thus keeping the final value. - # Just keep the data_field as 'test_accuracy' so we don't have - # to update code and csv files to include 'validation_accuracy'. - regex = 'validation categorical accuracy : ([0-9.]+)' - data_field = 'test_accuracy' - populate_data_dict_overall(regex, line, data_field, data_fields, - data_dict, model_id) - - # Overwrite accuracy from validation if we have test accuracy. - regex = 'test categorical accuracy : ([0-9.]+)' - data_field = 'test_accuracy' - populate_data_dict_overall(regex, line, data_field, data_fields, - data_dict, model_id) - - output_file.close() - if should_log: - print('extract_data: Extracted Data below:') - pprint.pprint(data_dict) - return data_dict - -# Skeleton #################################################################### - - -def skeleton(cluster, dir_name, executable, model_folder, model_name, - data_fields, should_log, compiler_name=None, weekly=False, - data_reader_percent=None): - if compiler_name is None: - output_file_name = '%s/bamboo/integration_tests/output/%s_output.txt' % (dir_name, model_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_error.txt' % (dir_name, model_name) - else: - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' % (dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' % (dir_name, model_name, compiler_name) - command = get_command( - cluster, dir_name, model_folder, model_name, executable, - output_file_name, error_file_name, compiler_name, weekly=weekly, - data_reader_percent=data_reader_percent) - run_lbann(command, model_name, output_file_name, - error_file_name, should_log) - return extract_data(output_file_name, data_fields, should_log) - -# Misc. functions ############################################################ - - -# csv_dict[row_header][column_header] = float -def csv_to_dict(csv_path): - with open(csv_path, 'r') as csv_file: - reader = csv.reader(csv_file, skipinitialspace=True) - column_headers = next(reader) - values = {} - for row in reader: - row_header = row[0] - values[row_header] = dict( - zip(column_headers[1:], map(float, row[1:]))) - return values diff --git a/bamboo/integration_tests/conftest.py b/bamboo/integration_tests/conftest.py index a318e7537ed..9487cdf242e 100644 --- a/bamboo/integration_tests/conftest.py +++ b/bamboo/integration_tests/conftest.py @@ -13,20 +13,12 @@ def pytest_addoption(parser): parser.addoption('--cluster', action='store', default=cluster, help='--cluster= to specify the cluster being run on, for the purpose of determing which commands to use. Default the current cluster') - parser.addoption('--debug_build', action='store_true', default=False, - help='--debug_build specifies that debug tests should be run, even without doing a --weekly build. Default False') parser.addoption('--dirname', action='store', default=default_dirname, help='--dirname= to specify the top-level directory. Default directory of build_lbann_lc executable') parser.addoption('--exes', action='store', default=default_exes, help='--exes={compiler_name: path}') - parser.addoption('--run', action='store_true', default=False, - help='--run specifies that a test normally ignored should be run. 
Default False') parser.addoption('--weekly', action='store_true', default=False, help='--weekly specifies that the test should ONLY be run weekly, not nightly. Default False') - # For local testing only - parser.addoption('--data-reader-percent', action='store', default=None, - help='--data-reader-percent=. Default None. Note that 1.0 is 100%.') - parser.addoption('--exe', action='store', help='--exe=') @pytest.fixture @@ -34,11 +26,6 @@ def cluster(request): return request.config.getoption('--cluster') -@pytest.fixture -def debug_build(request): - return request.config.getoption('--debug_build') - - @pytest.fixture def dirname(request): return request.config.getoption('--dirname') @@ -49,21 +36,6 @@ def exes(request): return request.config.getoption('--exes') -@pytest.fixture -def run(request): - return request.config.getoption('--run') - - @pytest.fixture def weekly(request): return request.config.getoption('--weekly') - - -@pytest.fixture -def data_reader_percent(request): - return request.config.getoption('--data-reader-percent') - - -@pytest.fixture -def exe(request): - return request.config.getoption('--exe') diff --git a/bamboo/integration_tests/error/.gitignore b/bamboo/integration_tests/error/.gitignore deleted file mode 100644 index d6b7ef32c84..00000000000 --- a/bamboo/integration_tests/error/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 003794fd557..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.675652, 0.608574 -1, 0.590008, 0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 -9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 80c12b2b0ed..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/clang6/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207480 -1, 0.194710 -2, 0.193224 -3, 0.192867 -4, 0.192758 diff --git a/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv deleted file mode 100644 index 6a9581ff8f8..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/clang6/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 117.00, 2.80, 9.00, 1.20, 2.00, 0.00 -alexnet_weekly, 490.00, 1.00, 9.00, 0.60, 0.50, 2.50 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 
0.00, 100.00 -lenet_mnist, 100.00, 0.12, 0.40, 0.10, 0.09, 98.40 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 003794fd557..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.675652, 0.608574 -1, 0.590008, 0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 -9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 8bcf25bb71d..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207514 -1, 0.194710 -2, 0.193221 -3, 0.192864 -4, 0.192755 diff --git a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv deleted file mode 100644 index c05e05c43e8..00000000000 --- a/bamboo/integration_tests/expected_values/catalyst/gcc7/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 65.00, 1.50, 8.30, 0.37, 1.70, 0.1 -alexnet_weekly, 360.00, 0.90, 4.00, 0.40, 0.70, 2.00 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 137.00, 0.18, 0.40, 0.15, 0.04, 98.92 diff --git a/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index d1fec964160..00000000000 --- a/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.983936, 0.608574 -1, 0.908194, 0.590008 -2, 0.900910, 0.587484 -3, 0.899583, 0.586305 -4, 0.897652, 0.585585 -5, 0.889670, 0.585036 -6, 0.890061, 0.584688 -7, 0.888348, 0.584348 -8, 0.888921, 0.584041 -9, 0.883034, 0.583865 -10, 0.888236, 0.583665 -11, 0.881798, 0.583521 -12, 0.884866, 0.583303 -13, 0.883757, 0.58328 -14, 0.881703, 0.5832 -15, 0.883718, 0.583134 -16, 0.875670, 0.583052 -17, 0.877554, 0.583039 -18, 0.882443, 0.582954 -19, 0.881577, 0.582936 diff --git a/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 
8bcf25bb71d..00000000000 --- a/bamboo/integration_tests/expected_values/corona/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207514 -1, 0.194710 -2, 0.193221 -3, 0.192864 -4, 0.192755 diff --git a/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv deleted file mode 100644 index f48c79b35b6..00000000000 --- a/bamboo/integration_tests/expected_values/corona/gcc7/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 55.00, 1.03, 1.90, 0.80, 0.21, 0.00 -alexnet_weekly, 491.00, 1.00, 9.00, 1.11, 0.60, 2.00 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 385.00, 0.50, 2.00, 0.51, 0.80, 98.40 diff --git a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 003794fd557..00000000000 --- a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv +++ /dev/null @@ -1,21 +0,0 @@ -Epoch_number, training_objective_function_nightly, training_objective_function_weekly -0, 0.675652, 0.608574 -1, 0.590008, 0.590008 -2, 0.587484, 0.587484 -3, 0.586305, 0.586305 -4, 0.585585, 0.585585 -5, 0.585036, 0.585036 -6, 0.584688, 0.584688 -7, 0.584348, 0.584348 -8, 0.584041, 0.584041 -9, 0.583865, 0.583865 -10, 0.583665, 0.583665 -11, 0.583521, 0.583521 -12, 0.583303, 0.583303 -13, 0.58328, 0.58328 -14, 0.5832, 0.5832 -15, 0.583134, 0.583134 -16, 0.583052, 0.583052 -17, 0.583039, 0.583039 -18, 0.582954, 0.582954 -19, 0.582936, 0.582936 diff --git a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv deleted file mode 100644 index 8bcf25bb71d..00000000000 --- a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv +++ /dev/null @@ -1,6 +0,0 @@ -Epoch_number, training_objective_function -0, 0.207514 -1, 0.194710 -2, 0.193221 -3, 0.192864 -4, 0.192755 diff --git a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv deleted file mode 100644 index aa67a5073a0..00000000000 --- a/bamboo/integration_tests/expected_values/lassen/gcc7/expected_performance.csv +++ /dev/null @@ -1,5 +0,0 @@ -Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy -alexnet_nightly, 23.00, 0.70, 10.30, 0.10, 1.20, 0.00 -alexnet_weekly, 56.00, 0.15, 10.00, 0.70, 0.70, 1.50 -cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00 -lenet_mnist, 10.10, 0.06, 5.30, 0.01, 0.60, 98.30 diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv deleted file mode 100644 index 003794fd557..00000000000 --- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_imagenet_objective_functions.csv +++ 
/dev/null
@@ -1,21 +0,0 @@
-Epoch_number, training_objective_function_nightly, training_objective_function_weekly
-0, 0.675652, 0.608574
-1, 0.590008, 0.590008
-2, 0.587484, 0.587484
-3, 0.586305, 0.586305
-4, 0.585585, 0.585585
-5, 0.585036, 0.585036
-6, 0.584688, 0.584688
-7, 0.584348, 0.584348
-8, 0.584041, 0.584041
-9, 0.583865, 0.583865
-10, 0.583665, 0.583665
-11, 0.583521, 0.583521
-12, 0.583303, 0.583303
-13, 0.58328, 0.58328
-14, 0.5832, 0.5832
-15, 0.583134, 0.583134
-16, 0.583052, 0.583052
-17, 0.583039, 0.583039
-18, 0.582954, 0.582954
-19, 0.582936, 0.582936
diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv
deleted file mode 100644
index 8bcf25bb71d..00000000000
--- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_conv_autoencoder_mnist_objective_functions.csv
+++ /dev/null
@@ -1,6 +0,0 @@
-Epoch_number, training_objective_function
-0, 0.207514
-1, 0.194710
-2, 0.193221
-3, 0.192864
-4, 0.192755
diff --git a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv b/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv
deleted file mode 100644
index 98c22e515df..00000000000
--- a/bamboo/integration_tests/expected_values/pascal/gcc7/expected_performance.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-Model_name, training_run_time, training_mean, training_max, training_min, training_stdev, test_accuracy
-alexnet_nightly, 51.00, 1.20, 4.00, 0.50, 0.40, 100.00
-alexnet_weekly, 300.00, 1.00, 7.00, 0.10, 1.30, 2.0
-cache_alexnet, 0.00, 0.00, 0.00, 0.00, 0.00, 100.00
-lenet_mnist, 12.00, 0.04, 6.00, 0.01, 0.40, 98.40
diff --git a/bamboo/integration_tests/full_alexnet.sh b/bamboo/integration_tests/full_alexnet.sh
deleted file mode 100755
index 393a0fb9e43..00000000000
--- a/bamboo/integration_tests/full_alexnet.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash
-
-module load mpifileutils
-
-COMPILER=0
-while :; do
- case ${1} in
- --compiler)
- # Choose compiler
- if [ -n "${2}" ]; then
- COMPILER=${2}
- shift
- else
- echo "\"${1}\" option requires a non-empty option argument" >&2
- exit 1
- fi
- ;;
- -?*)
- # Unknown option
- echo "Unknown option (${1})" >&2
- exit 1
- ;;
- *)
- # Break loop if there are no more options
- break
- esac
- shift
-done
-
-if [ ${COMPILER} -eq 0 ]; then
- exit 1
-fi
-
-LBANN_DIR=$(git rev-parse --show-toplevel)
-CLUSTER=$(hostname | sed 's/\([a-zA-Z][a-zA-Z]*\)[0-9]*/\1/g')
-FILE_PREFIX=${LBANN_DIR}/bamboo/unit_tests/output/full_alexnet_${CLUSTER}_${COMPILER}
-
-# Clear SSDs
-srun --wait=0 --clear-ssd hostname > ${FILE_PREFIX}_1_output.txt
-
-# Cache dataset
-echo "Caching dataset..."
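[Aside: the stage-if-missing idiom this deleted script uses in the lines that follow — broadcast a tarball to node-local SSD only when the target path is absent, so a rerun skips work that survived from a previous allocation — condenses to a short sketch. This is an illustration only: it assumes srun and dbcast (from mpifileutils, which the script loads) are on PATH, and stage_if_missing is a hypothetical name.

    import os
    import subprocess

    def stage_if_missing(src, dst, nodes=128):
        # Skip the broadcast if an earlier run already cached dst on this node.
        if os.path.exists(dst):
            return
        # dbcast copies src to the node-local dst on every allocated node.
        subprocess.run(['srun', '--nodes=%d' % nodes, '--ntasks-per-node=2',
                        'dbcast', src, dst], check=True)
]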
-[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/train_resized.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar > ${FILE_PREFIX}_2_output.txt -[ -d /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train ] || \ - srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/train_resized.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 -[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/val_resized.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar > ${FILE_PREFIX}_3_output.txt -[ -d /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val ] || \ - srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/val_resized.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 -[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar ] || \ - srun --nodes=128 --ntasks-per-node=2 dbcast /p/lscratchh/brainusr/datasets/ILSVRC2012/original/labels.tar /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar > ${FILE_PREFIX}_4_output.txt -[ -e /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/train.txt ] || \ - srun --nodes=128 --ntasks-per-node=1 tar xf /l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels.tar -C /l/ssd/lbannusr/datasets-resized/ILSVRC2012 -wait -echo "Done caching dataset..." - -# Experiment -srun --nodes=128 --ntasks-per-node=2 ${LBANN_DIR}/bamboo/compiler_tests/builds/catalyst_gcc-7.1.0_x86_64_mvapich2-2.2_openblas_rel/build/model_zoo/lbann --model=${LBANN_DIR}/model_zoo/models/alexnet/model_alexnet.prototext --optimizer=${LBANN_DIR}/model_zoo/optimizers/opt_sgd.prototext --reader=${LBANN_DIR}/model_zoo/data_readers/data_reader_imagenet.prototext --data_filedir_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/train/ --data_filename_train=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/train.txt --data_filedir_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/val/ --data_filename_test=/l/ssd/lbannusr/datasets-resized/ILSVRC2012/labels/val.txt diff --git a/bamboo/integration_tests/output/.gitignore b/bamboo/integration_tests/output/.gitignore deleted file mode 100644 index d6b7ef32c84..00000000000 --- a/bamboo/integration_tests/output/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/bamboo/integration_tests/test_integration_alexnet.py b/bamboo/integration_tests/test_integration_alexnet.py new file mode 100644 index 00000000000..1440d90c39e --- /dev/null +++ b/bamboo/integration_tests/test_integration_alexnet.py @@ -0,0 +1,223 @@ +import functools +import operator +import os +import os.path +import re +import sys +import numpy as np +import google.protobuf.text_format +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 5 +mini_batch_size = 256 +num_nodes = 4 +imagenet_fraction = 0.280994 # Train with 360K out of 1.28M samples + +# Top-5 classification accuracy (percent) +expected_train_accuracy_range = (9, 15) +expected_test_accuracy_range = (15, 24) + +# Average mini-batch time (in sec) for each LC system +expected_mini_batch_times = { + 'pascal': 
0.100, + 'lassen': 0.050, + 'ray': 0.075, +} + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.models + + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.AlexNet(1000)(images) + probs = lbann.Softmax(x) + cross_entropy = lbann.CrossEntropy([probs, labels]) + top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5) + layers = list(lbann.traverse_layer_graph(x)) + + # Setup objective function + l2_reg_weights = set() + for l in layers: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) + obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(top5, name='top-5 accuracy', unit='%')] + + # Construct model + return lbann.Model(mini_batch_size, + num_epochs, + layers=layers, + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.contrib.lc.paths + + # Load data readers from prototext + dirname = os.path.dirname + lbann_dir = dirname(dirname(dirname(os.path.realpath(__file__)))) + pb_file = os.path.join(lbann_dir, + 'model_zoo', + 'data_readers', + 'data_reader_imagenet.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(pb_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set location of ImageNet-1K data + message.reader[0].data_filedir = lbann.contrib.lc.paths.imagenet_dir(data_set='train') + message.reader[0].data_filename = lbann.contrib.lc.paths.imagenet_labels(data_set='train') + message.reader[1].data_filedir = lbann.contrib.lc.paths.imagenet_dir(data_set='val') + message.reader[1].data_filename = lbann.contrib.lc.paths.imagenet_labels(data_set='val') + + # We train on a subset of ImageNet + message.reader[0].percent_of_data_to_use = imagenet_fraction + + # Only evaluate on ImageNet validation set at end of training + message.reader[1].role = 'test' + + return message + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. 
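[Aside: as a condensed illustration of that parsing step, the augmented test boils down to a handful of re.search calls over the stdout log. The regexes below mirror the ones used in this file; the sample log lines are invented for the sketch.

    import re

    sample_log = [
        'training epoch 1 top-5 accuracy : 11.20%',                    # invented line
        'training epoch 1 mini-batch time statistics : 0.0980s mean',  # invented line
        'test top-5 accuracy : 20.50%',                                # invented line
    ]
    train_accuracy, test_accuracy, mini_batch_times = None, None, []
    for line in sample_log:
        m = re.search('training epoch [0-9]+ top-5 accuracy : ([0-9.]+)%', line)
        if m:
            train_accuracy = float(m.group(1))
        m = re.search('test top-5 accuracy : ([0-9.]+)%', line)
        if m:
            test_accuracy = float(m.group(1))
        m = re.search('mini-batch time statistics : ([0-9.]+)s mean', line)
        if m:
            mini_batch_times.append(float(m.group(1)))
    # Yields train_accuracy=11.2, test_accuracy=20.5, mini_batch_times=[0.098],
    # which the test then compares against the expected ranges above.
]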
+ + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. + + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname, weekly): + + # Skip test with nightly builds and on CPU systems + if not weekly: + pytest.skip('only run {} with weekly builds'.format(test_name)) + if cluster in ('catalyst', 'corona'): + pytest.skip('only run {} on GPU systems'.format(test_name)) + + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + test_accuracy = None + mini_batch_times = [] + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ top-5 accuracy : ([0-9.]+)%', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('test top-5 accuracy : ([0-9.]+)%', line) + if match: + test_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_train_accuracy_range[0] + < train_accuracy + < expected_train_accuracy_range[1]), \ + 'train accuracy is outside expected range' + + # Check if testing accuracy is within expected range + assert (expected_test_accuracy_range[0] + < test_accuracy + < expected_test_accuracy_range[1]), \ + 'test accuracy is outside expected range' + + # Check if mini-batch time is within expected range + # Note: Skip first epoch since its runtime is usually an outlier + mini_batch_times = mini_batch_times[1:] + mini_batch_time = sum(mini_batch_times) / len(mini_batch_times) + assert (0.75 * expected_mini_batch_times[cluster] + < mini_batch_time + < 1.25 * expected_mini_batch_times[cluster]), \ + 'average mini-batch time is outside expected range' + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + nodes=num_nodes): + globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/integration_tests/test_integration_autoencoders.py b/bamboo/integration_tests/test_integration_autoencoders.py deleted file mode 100644 index 1c89520dc74..00000000000 --- a/bamboo/integration_tests/test_integration_autoencoders.py +++ /dev/null @@ -1,102 +0,0 @@ -import pytest -import common_code - - -def error_if(f, f_symbol, data_field, actual_values, expected_values, - model_name, errors, all_values, frequency_str): - d = actual_values[data_field] - for model_id in sorted(d.keys()): - for epoch_id in sorted(d[model_id].keys()): - actual_value = d[model_id][epoch_id] - expected_value = expected_values[epoch_id][data_field + frequency_str] - - if actual_value is None: - errors.append('d[%s][%s] == None' % (model_id, epoch_id)) - if expected_value is None: - errors.append('d[%s]([%s] == None' % (model_id, epoch_id)) - - if f(actual_value, expected_value): - errors.append('%f %s %f %s Model %s Epoch %s %s' % ( - actual_value, 
f_symbol, expected_value, model_name, model_id, - epoch_id, data_field)) - all_values.append('%f %s Model %s Epoch %s %s' % ( - actual_value, model_name, model_id, epoch_id, data_field)) - - -def run_tests(actual_objective_functions, model_name, dir_name, cluster, - should_log, compiler_name, frequency_str=''): - expected_objective_functions = common_code.csv_to_dict( - '%s/bamboo/integration_tests/expected_values/%s/%s/expected_%s_objective_functions.csv' % (dir_name, cluster, compiler_name, model_name)) - errors = [] - all_values = [] - tolerance = 0.05 - # Are we within tolerance * expected_value? - outside_tolerance = lambda x, y: abs(x - y) > abs(tolerance * y) - error_if(outside_tolerance, '!=', 'training_objective_function', - actual_objective_functions, expected_objective_functions, - model_name, errors, all_values, frequency_str) - - if should_log: - print('All values for: %s %s (%d)' % (model_name, compiler_name, - len(all_values))) - for value in all_values: - print(value) - assert errors == [] - -DATA_FIELDS = [ - 'training_objective_function' -] - - -def skeleton_autoencoder_imagenet(cluster, dir_name, executables, compiler_name, - weekly, data_reader_percent): - if cluster in ['lassen', 'pascal']: - e = 'skeleton_autoencoder_imagenet: does not run on GPU' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_autoencoder_imagenet: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - model_folder = 'models/autoencoder_imagenet' - model_name = 'conv_autoencoder_imagenet' - should_log = False - actual_objective_functions = common_code.skeleton( - cluster, dir_name, executables[compiler_name], model_folder, model_name, - DATA_FIELDS, should_log, compiler_name=compiler_name, weekly=weekly, - data_reader_percent=data_reader_percent) - frequency_str = '_nightly' - if weekly: - frequency_str = '_weekly' - run_tests(actual_objective_functions, model_name, dir_name, cluster, - should_log, compiler_name, frequency_str) - - -def test_integration_autoencoder_imagenet_clang6(cluster, dirname, exes, - weekly, data_reader_percent): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'clang6', weekly, - data_reader_percent) - - -def test_integration_autoencoder_imagenet_gcc7(cluster, dirname, exes, weekly, - data_reader_percent): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'gcc7', weekly, - data_reader_percent) - - -def test_integration_autoencoder_imagenet_intel19(cluster, dirname, exes, - weekly, data_reader_percent): - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'intel19', weekly, - data_reader_percent) - - -# Run with python3 -m pytest -s test_integration_autoencoder.py -k 'test_integration_autoencoder_imagenet_exe' --exe= -def test_integration_autoencoder_imagenet_exe(cluster, dirname, exe, weekly, - data_reader_percent): - if exe is None: - e = 'test_integration_autoencoder_imagenet_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip() - exes = {'exe': exe} - skeleton_autoencoder_imagenet(cluster, dirname, exes, 'exe', weekly, - data_reader_percent) diff --git a/bamboo/integration_tests/test_integration_debug.py b/bamboo/integration_tests/test_integration_debug.py deleted file mode 100644 index 8edf10eb3cc..00000000000 --- a/bamboo/integration_tests/test_integration_debug.py +++ /dev/null @@ -1,141 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import common_code - - -def skeleton_mnist_debug(cluster, dir_name, executables, 
compiler_name, weekly, - debug_build, should_log=False, - data_reader_percent=None): - # If weekly or debug_build are true, then run the test. - if not (weekly or debug_build): - e = 'skeleton_mnist_debug: Not doing weekly or debug_build testing' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_mnist_debug: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - model_name = 'lenet_mnist' - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - # Allocation/Run Parameters - num_nodes=1, partition='pbatch', time_limit=100, - # LBANN Parameters - dir_name=dir_name, - data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST', - data_reader_name='mnist', - data_reader_percent=data_reader_percent, - model_folder='models/' + model_name, - model_name=model_name, num_epochs=5, optimizer_name='adagrad', - # Error/Output Redirect - error_file_name=error_file_name, - output_file_name=output_file_name, - # Misc. Parameters - weekly=weekly) - common_code.run_lbann(command, model_name, output_file_name, - error_file_name, should_log) - - -def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, - debug_build, should_log=False, - data_reader_percent=None): - # If weekly or debug_build are true, then run the test. - if not (weekly or debug_build): - e = 'skeleton_cifar_debug: Not doing weekly or debug_build testing' - print('Skip - ' + e) - pytest.skip(e) - if cluster == 'ray': - e = 'skeleton_cifar_debug: cifar not operational on Ray' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_cifar_debug: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - model_name = 'autoencoder_cifar10' - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - # Allocation/Run Parameters - num_nodes=1, partition='pbatch', time_limit=100, - # LBANN Parameters - dir_name=dir_name, - data_filename_train_default='/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin', - data_filename_test_default='/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin', - data_reader_name='cifar10', data_reader_percent=data_reader_percent, - model_folder='models/' + model_name, - model_name='conv_' + model_name, num_epochs=5, optimizer_name='adagrad', - # Error/Output Redirect - error_file_name=error_file_name, - output_file_name=output_file_name, - # Misc. 
Parameters - weekly=weekly) - common_code.run_lbann(command, model_name, output_file_name, - error_file_name, should_log) - - -def test_integration_mnist_clang6_debug(cluster, dirname, exes, weekly, - debug_build, data_reader_percent): - skeleton_mnist_debug(cluster, dirname, exes, 'clang6_debug', weekly, - debug_build, data_reader_percent) - - -def test_integration_cifar_clang6_debug(cluster, dirname, exes, weekly, - debug_build, data_reader_percent): - skeleton_cifar_debug(cluster, dirname, exes, 'clang6_debug', weekly, - debug_build, data_reader_percent) - - -def test_integration_mnist_gcc7_debug(cluster, dirname, exes, weekly, - debug_build, data_reader_percent): - skeleton_mnist_debug(cluster, dirname, exes, 'gcc7_debug', weekly, - debug_build, data_reader_percent) - - -def test_integration_cifar_gcc7_debug(cluster, dirname, exes, weekly, - debug_build, data_reader_percent): - skeleton_cifar_debug(cluster, dirname, exes, 'gcc7_debug', weekly, - debug_build, data_reader_percent) - - -def test_integration_mnist_intel19_debug(cluster, dirname, exes, weekly, - debug_build, data_reader_percent): - skeleton_mnist_debug(cluster, dirname, exes, 'intel19_debug', weekly, - debug_build, data_reader_percent) - - -def test_integration_cifar_intel19_debug(cluster, dirname, exes, weekly, - debug_build, data_reader_percent): - skeleton_cifar_debug(cluster, dirname, exes, 'intel19_debug', weekly, - debug_build, data_reader_percent) - - -# Run with python3 -m pytest -s test_integration_debug.py -k 'test_integration_mnist_exe' --exe= -def test_integration_mnist_exe(cluster, dirname, exe, weekly, - data_reader_percent): - if exe is None: - e = 'test_integration_mnist_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - debug_build = True - skeleton_mnist_debug(cluster, dirname, exes, 'exe', weekly, debug_build, - data_reader_percent=data_reader_percent) - - -# Run with python3 -m pytest -s test_integration_debug.py -k 'test_integration_cifar_exe' --exe= -def test_integration_cifar_exe(cluster, dirname, exe, weekly, - data_reader_percent): - if exe == None: - e = 'test_integration_cifar_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - debug_build=True - skeleton_cifar_debug(cluster, dirname, exes, 'exe', weekly, debug_build, - data_reader_percent=data_reader_percent) diff --git a/bamboo/integration_tests/test_integration_lenet.py b/bamboo/integration_tests/test_integration_lenet.py new file mode 100644 index 00000000000..b3ca8b70d1b --- /dev/null +++ b/bamboo/integration_tests/test_integration_lenet.py @@ -0,0 +1,204 @@ +import functools +import operator +import os +import os.path +import re +import sys +import numpy as np +import google.protobuf.text_format +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 5 +mini_batch_size = 64 +num_nodes = 2 + +# Classification accuracy (percent) +expected_train_accuracy_range = (98.75, 99.25) +expected_test_accuracy_range = (98, 99) + +# Average mini-batch time (in sec) for each LC system +expected_mini_batch_times = { + 'pascal': 0.0040, + 'catalyst': 0.0055, + 'lassen': 0.0020, + 'ray': 0.0025, + 'corona': 0.0075, +} + +# ============================================== +# Setup LBANN 
experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.SGD(learn_rate=0.01, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.models + + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.LeNet(10)(images) + probs = lbann.Softmax(x) + loss = lbann.CrossEntropy([probs, labels]) + acc = lbann.CategoricalAccuracy([probs, labels]) + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(acc, name='accuracy', unit='%')] + + # Construct model + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(input_), + objective_function=loss, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.contrib.lc.paths + + # Load data readers from prototext + dirname = os.path.dirname + lbann_dir = dirname(dirname(dirname(os.path.realpath(__file__)))) + pb_file = os.path.join(lbann_dir, + 'model_zoo', + 'data_readers', + 'data_reader_mnist.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(pb_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set location of MNIST data + for reader in message.reader: + reader.data_filedir = lbann.contrib.lc.paths.mnist_dir() + + # No validation set + message.reader[0].validation_percent = 0 + + return message + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. 
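[Aside: the late-binding behavior that the note in this docstring warns about is easy to reproduce in isolation; a minimal, self-contained demonstration (not part of the patch) of both the pitfall and the factory-function fix:

    # Naive loop: every closure reads i when it is called, so all of them
    # see the final value of i.
    naive = [lambda: i for i in range(3)]
    print([f() for f in naive])   # prints [2, 2, 2]

    # Factory function: each closure captures its own n at definition time.
    def make(n):
        return lambda: n

    fixed = [make(i) for i in range(3)]
    print([f() for f in fixed])   # prints [0, 1, 2]
]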
+ + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname): + + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + test_accuracy = None + mini_batch_times = [] + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ accuracy : ([0-9.]+)%', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('test accuracy : ([0-9.]+)%', line) + if match: + test_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_train_accuracy_range[0] + < train_accuracy + < expected_train_accuracy_range[1]), \ + 'train accuracy is outside expected range' + + # Check if testing accuracy is within expected range + assert (expected_test_accuracy_range[0] + < test_accuracy + < expected_test_accuracy_range[1]), \ + 'test accuracy is outside expected range' + + # Check if mini-batch time is within expected range + # Note: Skip first epoch since its runtime is usually an outlier + mini_batch_times = mini_batch_times[1:] + mini_batch_time = sum(mini_batch_times) / len(mini_batch_times) + assert (0.75 * expected_mini_batch_times[cluster] + < mini_batch_time + < 1.25 * expected_mini_batch_times[cluster]), \ + 'average mini-batch time is outside expected range' + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + nodes=num_nodes): + globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/integration_tests/test_integration_performance.py b/bamboo/integration_tests/test_integration_performance.py deleted file mode 100644 index b3c7c980070..00000000000 --- a/bamboo/integration_tests/test_integration_performance.py +++ /dev/null @@ -1,278 +0,0 @@ -import pytest -import operator, os -import common_code - - -def error_if(f, f_symbol, data_field, actual_values, expected_values, - model_name, errors, all_values, frequency_str): - d = actual_values[data_field] - if f_symbol == '<': - # Every time a value is smaller, update archive_value - archive_value = float('inf') - elif f_symbol == '>': - # Every time a value is greater, update archive_value - archive_value = float('-inf') - else: - raise Exception('Invalid Function Symbol %s' % f_symbol) - for model_id in sorted(d.keys()): - for epoch_id in sorted(d[model_id].keys()): - actual_value = d[model_id][epoch_id] - expected_value = expected_values[model_name + frequency_str][data_field] - - if actual_value is None: - errors.append('actual_value: d[%s][%s] is None' % (model_id, epoch_id)) - else: - print('actual_value={av}'.format(av=actual_value)) - if expected_value is None: - errors.append( - 'expected_value: d[%s]([%s] is None' % (model_id, epoch_id)) - else: - print('expected_value={ev}'.format(ev=expected_value)) - - if (actual_value is not None) and (expected_value is not None): - if f(actual_value, expected_value): - errors.append('%f %s %f %s Model %s Epoch %s %s' % ( - actual_value, f_symbol, expected_value, model_name, model_id, - epoch_id, data_field)) - all_values.append('%f %s Model %s Epoch %s %s' % ( - actual_value, model_name, model_id, epoch_id, data_field)) - - if 
f(actual_value, archive_value): - archive_value = actual_value - else: - print('archiving: either actual_value or expected_value is None.') - return archive_value - - -def run_tests(actual_performance, model_name, dir_name, should_log, - compiler_name, cluster, frequency_str=''): - expected_performance = common_code.csv_to_dict( - '%s/bamboo/integration_tests/expected_values/%s/%s/expected_performance.csv' % (dir_name, cluster, compiler_name)) - errors = [] - all_values = [] - greater_than = operator.gt - less_than = operator.lt - max_run_time = error_if(greater_than, '>', 'training_run_time', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_mean = error_if(greater_than, '>', 'training_mean', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_max = error_if(greater_than, '>', 'training_max', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_min = error_if(greater_than, '>', 'training_min', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - max_stdev = error_if(greater_than, '>', 'training_stdev', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - min_accuracy = error_if(less_than, '<', 'test_accuracy', actual_performance, expected_performance, model_name, errors, all_values, frequency_str) - - archival_string = '%s, %f, %f, %f, %f, %f, %f\n' % ( - os.environ['bamboo_buildNumber'], max_run_time, max_mean, max_max, max_min, - max_stdev, min_accuracy) - print('archival_string: ' + archival_string) - if os.environ['LOGNAME'] == 'lbannusr': - key = 'bamboo_planKey' - if key in os.environ: - plan = os.environ[key] - if plan in ['LBANN-NIGHTD', 'LBANN-WD']: - archive_file = '/usr/workspace/wsb/lbannusr/archives/%s/%s/%s/performance_%s.txt' % (plan, cluster, compiler_name, model_name) - print('Archive file: ' + archive_file) - with open(archive_file, 'a') as archive: - print('Archiving to file.') - archive.write(archival_string) - else: - print('The plan %s does not have archiving activated' % plan) - else: - print('%s is not in os.environ' % key) - else: - print('os.environ["LOGNAME"]=%s' % os.environ['LOGNAME']) - - if should_log: - print('All values for: %s %s (%d)' % ( - model_name, compiler_name, len(all_values))) - for value in all_values: - print(value) - assert errors == [] - -DATA_FIELDS = [ - 'training_run_time', - 'training_mean', - 'training_max', - 'training_min', - 'training_stdev', - 'test_accuracy' -] - - -def skeleton_performance_lenet_mnist(cluster, dir_name, executables, - compiler_name, weekly, - data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_performance_lenet_mnist: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - executable = executables[compiler_name] - model_name = 'lenet_mnist' - model_folder = 'models/' + model_name - should_log = True - actual_performance = common_code.skeleton( - cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, - should_log, compiler_name=compiler_name, weekly=weekly, - data_reader_percent=data_reader_percent) - frequency_str = '_nightly' - if weekly: - frequency_str = '_weekly' - run_tests(actual_performance, model_name, dir_name, should_log, - compiler_name, cluster, frequency_str=frequency_str) - - -def skeleton_performance_alexnet(cluster, dir_name, executables, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: 
- e = 'skeleton_performance_alexnet: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - executable = executables[compiler_name] - model_name = 'alexnet' - model_folder = 'models/' + model_name - should_log = True - actual_performance = common_code.skeleton( - cluster, dir_name, executable, model_folder, model_name, DATA_FIELDS, - should_log, compiler_name=compiler_name, weekly=weekly, - data_reader_percent=data_reader_percent) - frequency_str = '_nightly' - if weekly: - frequency_str = '_weekly' - run_tests(actual_performance, model_name, dir_name, should_log, - compiler_name, cluster, frequency_str=frequency_str) - - -def skeleton_performance_full_alexnet(cluster, dir_name, executables, - compiler_name, weekly, run, - data_reader_percent): - # `run` is False for calls to run.sh. - # `run` is True, in allocate_and_run.sh, if this is a Weekly test on Catalyst. - if not run: - e = 'skeleton_performance_full_alexnet: Ignored' - print('Skip - ' + e) - pytest.skip(e) - if not weekly: - e = 'skeleton_performance_full_alexnet: Not doing weekly testing' - print('Skip - ' + e) - pytest.skip(e) - if compiler_name not in executables: - e = 'skeleton_performance_full_alexnet: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - executable = executables[compiler_name] - if not os.path.exists(executable): - pytest.skip('Executable does not exist: %s' % executable) - model_name = 'full_alexnet' - should_log = True - output_file_name = '%s/bamboo/integration_tests/output/%s_%s_output.txt' %(dir_name, model_name, compiler_name) - error_file_name = '%s/bamboo/integration_tests/error/%s_%s_error.txt' %(dir_name, model_name, compiler_name) - # No use for data_reader_percent here. - # Keeping it as a parameter since a user may pass it in when - # running all exe tests. 
- if cluster in ['catalyst']: - command = 'salloc --nodes 128 %s/bamboo/integration_tests/%s.sh > %s 2> %s' % (dir_name, model_name, output_file_name, error_file_name) - elif cluster in ['lassen', 'pascal', 'ray']: - e = 'skeleton_performance_full_alexnet: Pascal, Ray are unsupported for skeleton_performance_full_alexnet' - print('Skip - ' + e) - pytest.skip(e) - else: - raise Exception('Unsupported Cluster %s' % cluster) - common_code.run_lbann(command, model_name, output_file_name, error_file_name, - should_log) # Don't need return value - actual_performance = common_code.extract_data(output_file_name, DATA_FIELDS, - should_log) - run_tests(actual_performance, model_name, dir_name, should_log, compiler_name, - cluster) - - -def test_integration_performance_lenet_mnist_clang6(cluster, dirname, exes, - weekly, data_reader_percent): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'clang6', weekly, - data_reader_percent) - - -def test_integration_performance_alexnet_clang6(cluster, dirname, exes, weekly, - data_reader_percent): - skeleton_performance_alexnet(cluster, dirname, exes, 'clang6', weekly, - data_reader_percent) - - -def test_integration_performance_full_alexnet_clang6(cluster, dirname, exes, - weekly, run, - data_reader_percent): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'clang6', weekly, - run, data_reader_percent) - - -def test_integration_performance_lenet_mnist_gcc7(cluster, dirname, exes, - weekly, data_reader_percent): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'gcc7', weekly, - data_reader_percent) - - -def test_integration_performance_alexnet_gcc7(cluster, dirname, exes, weekly, - data_reader_percent): - skeleton_performance_alexnet(cluster, dirname, exes, 'gcc7', weekly, - data_reader_percent) - - -def test_integration_performance_full_alexnet_gcc7(cluster, dirname, exes, - weekly, run, - data_reader_percent): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'gcc7', weekly, run, - data_reader_percent) - - -def test_integration_performance_lenet_mnist_intel19(cluster, dirname, exes, - weekly, - data_reader_percent): - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'intel19', weekly, - data_reader_percent) - - -def test_integration_performance_alexnet_intel19(cluster, dirname, exes, - weekly, data_reader_percent): - skeleton_performance_alexnet(cluster, dirname, exes, 'intel19', weekly, - data_reader_percent) - - -def test_integration_performance_full_alexnet_intel19(cluster, dirname, exes, - weekly, run, - data_reader_percent): - skeleton_performance_full_alexnet(cluster, dirname, exes, 'intel19', weekly, - run, data_reader_percent) - - -# Run with python3 -m pytest -s test_integration_performance.py -k 'test_integration_performance_lenet_mnist_exe' --exe= -def test_integration_performance_lenet_mnist_exe(cluster, dirname, exe, weekly, - data_reader_percent): - if exe is None: - e = 'test_integration_performance_lenet_mnist_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_performance_lenet_mnist(cluster, dirname, exes, 'exe', weekly, - data_reader_percent) - - -# Run with python3 -m pytest -s test_integration_performance.py -k 'test_integration_performance_alexnet_exe' --exe= -def test_integration_performance_alexnet_exe(cluster, dirname, exe, weekly, - data_reader_percent): - if exe is None: - e = 'stest_integration_performance_alexnet_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_performance_alexnet(cluster, 
dirname, exes, 'exe', weekly, - data_reader_percent) - - -# Run with python3 -m pytest -s test_integration_performance.py -k 'test_integration_performance_full_alexnet_exe' --weekly --run --exe= -def test_integration_performance_full_alexnet_exe(cluster, dirname, weekly, - run, exe, - data_reader_percent): - if exe is None: - e = 'test_integration_performance_full_alexnet_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_performance_full_alexnet(cluster, dirname, exes, 'exe', weekly, - run, data_reader_percent) diff --git a/bamboo/integration_tests/test_integration_resnet50.py b/bamboo/integration_tests/test_integration_resnet50.py new file mode 100644 index 00000000000..086afd351f3 --- /dev/null +++ b/bamboo/integration_tests/test_integration_resnet50.py @@ -0,0 +1,223 @@ +import functools +import operator +import os +import os.path +import re +import sys +import numpy as np +import google.protobuf.text_format +import pytest + +# Local files +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Options +# ============================================== + +# Training options +num_epochs = 5 +mini_batch_size = 256 +num_nodes = 4 +imagenet_fraction = 0.280994 # Train with 360K out of 1.28M samples + +# Top-5 classification accuracy (percent) +expected_train_accuracy_range = (45, 50) +expected_test_accuracy_range = (40, 55) + +# Average mini-batch time (in sec) for each LC system +expected_mini_batch_times = { + 'pascal': 0.25, + 'lassen': 0.10, + 'ray': 0.15, +} + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.SGD(learn_rate=0.1, momentum=0.9) + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.models + + # Layer graph + input_ = lbann.Input() + images = lbann.Identity(input_) + labels = lbann.Identity(input_) + x = lbann.models.ResNet50(1000, bn_statistics_group_size=-1)(images) + probs = lbann.Softmax(x) + cross_entropy = lbann.CrossEntropy([probs, labels]) + top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5) + layers = list(lbann.traverse_layer_graph(x)) + + # Setup objective function + l2_reg_weights = set() + for l in layers: + if type(l) == lbann.Convolution or type(l) == lbann.FullyConnected: + l2_reg_weights.update(l.weights) + l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4) + obj = lbann.ObjectiveFunction([cross_entropy, l2_reg]) + + # Objects for LBANN model + callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] + metrics = [lbann.Metric(top5, name='top-5 accuracy', unit='%')] + + # Construct model + return lbann.Model(mini_batch_size, + num_epochs, + layers=layers, + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # TODO (tym): Figure out how to switch between LBANN builds. See + # GitHub Issue #1289. + import lbann.contrib.lc.paths + + # Load data readers from prototext + dirname = os.path.dirname + lbann_dir = dirname(dirname(dirname(os.path.realpath(__file__)))) + pb_file = os.path.join(lbann_dir, + 'model_zoo', + 'data_readers', + 'data_reader_imagenet.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(pb_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set location of ImageNet-1K data + message.reader[0].data_filedir = lbann.contrib.lc.paths.imagenet_dir(data_set='train') + message.reader[0].data_filename = lbann.contrib.lc.paths.imagenet_labels(data_set='train') + message.reader[1].data_filedir = lbann.contrib.lc.paths.imagenet_dir(data_set='val') + message.reader[1].data_filename = lbann.contrib.lc.paths.imagenet_labels(data_set='val') + + # We train on a subset of ImageNet + message.reader[0].percent_of_data_to_use = imagenet_fraction + + # Only evaluate on ImageNet validation set at end of training + message.reader[1].role = 'test' + + return message + +# ============================================== +# Setup PyTest +# ============================================== + +def augment_test_func(test_func): + """Augment test function to parse log files. + + `tools.create_tests` creates functions that run an LBANN + experiment. This function creates augmented functions that parse + the log files after LBANN finishes running, e.g. to check metrics + or runtimes. + + Note: The naive approach is to define the augmented test functions + in a loop. However, Python closures are late binding. In other + words, the function would be overwritten every time we define it. + We get around this overwriting problem by defining the augmented + function in the local scope of another function. + + Args: + test_func (function): Test function created by + `tools.create_tests`. + + Returns: + function: Test that can interact with PyTest. 
+ + """ + test_name = test_func.__name__ + + # Define test function + def func(cluster, exes, dirname, weekly): + + # Skip test with nightly builds and on CPU systems + if not weekly: + pytest.skip('only run {} with weekly builds'.format(test_name)) + if cluster in ('catalyst', 'corona'): + pytest.skip('only run {} on GPU systems'.format(test_name)) + + # Run LBANN experiment + experiment_output = test_func(cluster, exes, dirname) + + # Parse LBANN log file + train_accuracy = None + test_accuracy = None + mini_batch_times = [] + with open(experiment_output['stdout_log_file']) as f: + for line in f: + match = re.search('training epoch [0-9]+ top-5 accuracy : ([0-9.]+)%', line) + if match: + train_accuracy = float(match.group(1)) + match = re.search('test top-5 accuracy : ([0-9.]+)%', line) + if match: + test_accuracy = float(match.group(1)) + match = re.search('training epoch [0-9]+ mini-batch time statistics : ([0-9.]+)s mean', line) + if match: + mini_batch_times.append(float(match.group(1))) + + # Check if training accuracy is within expected range + assert (expected_train_accuracy_range[0] + < train_accuracy + < expected_train_accuracy_range[1]), \ + 'train accuracy is outside expected range' + + # Check if testing accuracy is within expected range + assert (expected_test_accuracy_range[0] + < test_accuracy + < expected_test_accuracy_range[1]), \ + 'test accuracy is outside expected range' + + # Check if mini-batch time is within expected range + # Note: Skip first epoch since its runtime is usually an outlier + mini_batch_times = mini_batch_times[1:] + mini_batch_time = sum(mini_batch_times) / len(mini_batch_times) + assert (0.75 * expected_mini_batch_times[cluster] + < mini_batch_time + < 1.25 * expected_mini_batch_times[cluster]), \ + 'average mini-batch time is outside expected range' + + # Return test function from factory function + func.__name__ = test_name + return func + +# Create test functions that can interact with PyTest +for _test_func in tools.create_tests(setup_experiment, + __file__, + nodes=num_nodes): + globals()[_test_func.__name__] = augment_test_func(_test_func) diff --git a/bamboo/run.sh b/bamboo/run.sh index 45908a04f8e..e7e5fec75fe 100755 --- a/bamboo/run.sh +++ b/bamboo/run.sh @@ -39,12 +39,12 @@ module load cmake/3.9.2 $PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml cd .. -echo "Task: Integration Tests (Weekly only)" +echo "Task: Integration Tests" cd integration_tests if [ ${WEEKLY} -ne 0 ]; then $PYTHON -m pytest -s -vv --durations=0 --weekly --junitxml=results.xml -# else -# $PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml +else + $PYTHON -m pytest -s -vv --durations=0 --junitxml=results.xml fi cd .. 
diff --git a/python/lbann/contrib/lc/paths.py b/python/lbann/contrib/lc/paths.py
index 2ca67f2f4a1..75940153cd4 100644
--- a/python/lbann/contrib/lc/paths.py
+++ b/python/lbann/contrib/lc/paths.py
@@ -10,6 +10,8 @@ def parallel_file_system_path(system = system()):
     """Base path to parallel file system."""
     if system in ('lassen', 'sierra'):
         return '/p/gpfs1/'
+    elif system == 'ray':
+        return '/p/gscratchr/'
     else:
         return '/p/lustre2/'

From acd5cc9bdd39c041c72f29a92bf2b823a36805f0 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Fri, 25 Oct 2019 14:58:13 -0700
Subject: [PATCH 368/634] Fix compilation error in softmax layer (#1326)

std::sqrt is not constexpr
---
 src/layers/activations/softmax.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/layers/activations/softmax.cpp b/src/layers/activations/softmax.cpp
index 236409bf825..7c10914f064 100644
--- a/src/layers/activations/softmax.cpp
+++ b/src/layers/activations/softmax.cpp
@@ -33,7 +33,7 @@ namespace {

 #ifdef LBANN_ENABLE_SOFTMAX_THRESHOLD
 /** Minimum output value to avoid denormalized floats */
-constexpr DataType threshold_val = std::sqrt(std::numeric_limits<DataType>::min());
+const DataType threshold_val = std::sqrt(std::numeric_limits<DataType>::min());
 #endif // LBANN_ENABLE_SOFTMAX_THRESHOLD

 void fp(lbann_comm& comm,

From 21ef49ffea73015df44a9b8731d5b42d4ed7a47c Mon Sep 17 00:00:00 2001
From: davidHysom
Date: Mon, 28 Oct 2019 14:03:53 -0700
Subject: [PATCH 369/634] Data spill and checkpoint (#1313)

* This PR adds checkpoint capability to data_store_conduit. Each data_store
  can checkpoint its local partition of the global data, along with other
  required state, to local (or non-local) disk.
* Checkpointing is activated via the methods: write_checkpoint(directory)
  and load_checkpoint(directory, generic_data_reader*)
* There is also a built-in test in which the data store is checkpointed
  after the first epoch; variables are cleared; and the checkpoint is read
  back in. This is activated via the cmd line flag:
  --data_store_test_checkpoint=<dir>. On lassen, <dir> can be "lassen", in
  which case an attempt will be made to use the burst buffers. Otherwise it
  can be any <dir>/<subdir>. In this case <dir> must already exist;
  <subdir> may also exist, but if not it will (hopefully) be created.
* This PR does not implement data spilling, due to behavioral differences
  between preloading and explicit loading. Data spilling will be addressed
  in a subsequent PR.
* This PR passes bamboo, and has also been independently tested on lassen
  and pascal (via the --data_store_test_checkpoint=<dir> flag), for both
  imagenet and JAG data.
* This PR replaces most of the cout and cerr printing with optional writes
  to file. Printing is triggered via the cmd line flags --data_store_profile
  (P_0 writes summary info to file) and --data_store_debug (all ranks write
  possibly extensive info to file). Calls to std::cout and std::cerr have
  been replaced with the variadic methods: PROFILE(...) and DEBUG(...)
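[Editor's note: to make the checkpoint bullet concrete, here is a minimal sketch, not part of the patch, of how the API added by this PR is driven. Only write_checkpoint() and load_checkpoint() come from the patch; the function name, the already-populated `store`, its `reader`, and the directory path are placeholders.]

```cpp
// Hedged sketch of the checkpoint round trip this PR describes.
// Assumes `store` has already been populated by a data reader.
#include "lbann/data_store/data_store_conduit.hpp"

void checkpoint_round_trip(lbann::data_store_conduit &store,
                           lbann::generic_data_reader *reader) {
  const std::string dir = "/p/lustre2/ckpt"; // hypothetical directory
  // Write this rank's partition of the samples, plus bookkeeping
  // state (owner map, sample sizes, etc.), to disk.
  store.write_checkpoint(dir);
  // ... later, or after a restart: rebuild the store from the same
  // directory; passing the reader re-attaches the store to it.
  store.load_checkpoint(dir, reader);
}
```

The built-in --data_store_test_checkpoint test exercises exactly this sequence: write, clear state, reload, then verify that the owner map survives the round trip.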
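[Editor's note: the PROFILE(...)/DEBUG(...) methods mentioned in the last bullet use the recursive variadic-template idiom. A standalone toy version, with a plain ofstream standing in for m_profile and the rank/flag checks omitted, could look like this:]

```cpp
// Hedged sketch of the variadic logging pattern: the template overload
// peels off one argument per call and streams it; the zero-argument
// overload terminates the recursion with a newline and a flush.
#include <fstream>

std::ofstream g_log("profile.txt"); // stands in for m_profile

void PROFILE() {          // base case: end the line and flush
  g_log << std::endl;
  g_log.flush();
}

template <typename T, typename... Types>
void PROFILE(T var1, Types... var2) {
  g_log << var1 << " ";   // write one token of any streamable type...
  PROFILE(var2...);       // ...then recurse on the remaining arguments
}

int main() {
  PROFILE("num bytes per node:", 1024, "epoch:", 2); // mixed types OK
}
```

This is why the class declares zero-argument PROFILE() and DEBUG() overloads alongside the templates: they are the recursion terminators.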
--- .../lbann/data_store/data_store_conduit.hpp | 300 ++++-- src/data_readers/data_reader.cpp | 5 +- src/data_readers/data_reader_image.cpp | 1 - src/data_readers/data_reader_jag_conduit.cpp | 1 - .../data_reader_numpy_npz_conduit.cpp | 1 - src/data_store/data_store_conduit.cpp | 865 +++++++++++++----- src/models/model.cpp | 8 +- src/utils/lbann_library.cpp | 2 +- src/utils/options.cpp | 23 +- 9 files changed, 918 insertions(+), 288 deletions(-) diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 63e56e31585..43719250bf5 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -32,6 +32,7 @@ #include "lbann/base.hpp" #include "lbann/comm.hpp" +#include "lbann/utils/exception.hpp" #include "conduit/conduit_node.hpp" #include #include @@ -74,8 +75,8 @@ class data_store_conduit { //! convenience handle void set_shuffled_indices(const std::vector *indices); - /// for use during development and debugging - size_t get_num_indices() const; + /** @brief Returns the number of samples summed over all ranks */ + size_t get_num_global_indices() const; void setup(int mini_batch_size); @@ -90,7 +91,8 @@ class data_store_conduit { /// get_empty_node(). In some operating modes this saves us from copying the node void set_conduit_node(int data_id, conduit::Node &node, bool already_have = false); - void set_preloaded_conduit_node(int data_id, conduit::Node &node); + void set_preloaded_conduit_node(int data_id, const conduit::Node &node); + void spill_preloaded_conduit_node(int data_id, const conduit::Node &node); const conduit::Node & get_random_node() const; @@ -99,9 +101,7 @@ class data_store_conduit { /// returns an empty node conduit::Node & get_empty_node(int data_id); - /// As of this writing, will be called if cmd line includes: --preload_data_store - /// This may change in the future; TODO revisit - void set_preload(); + void set_is_preloaded(); bool is_preloaded() { return m_preload; } @@ -134,7 +134,8 @@ class data_store_conduit { /// only used for debugging; pass --debug on cmd line to get /// each data store to print to a different file. 
This is made
 /// public so data readers can also print to the file
-  mutable std::ofstream *m_output = nullptr;
+  std::ofstream *m_debug = nullptr;
+  std::ofstream *m_profile = nullptr;

   /// for use during development and debugging
   int get_data_size() { return m_data.size(); }

   /// made public for debugging during development
   void copy_members(const data_store_conduit& rhs, const std::vector<int>& = std::vector<int>());

@@ -142,13 +143,118 @@
+  /** @brief Closes then reopens the debug logging file
+   *
+   * Debug logging is enabled on all ranks via the cmd line flag: --data_store_debug
+   */
   void flush_debug_file();

-protected :
-  double m_exchange_time = 0;
+  /** @brief Closes then reopens the profile logging file
+   *
+   * Profile logging is enabled on P_0 via the cmd line flag: --data_store_profile
+   */
+  void flush_profile_file();
+
+  /** @brief Writes object's state to file */
+  void write_checkpoint(std::string dir_name);
+
+  /** @brief Loads object's state from file */
+  void load_checkpoint(std::string dir_name, generic_data_reader *reader = nullptr);
+
+private :
+
+  /** @brief The number of samples that this processor owns */
+  size_t m_my_num_indices = 0;
+
+  /** @brief if true, then we are spilling (offloading) samples to disk */
+  bool m_spill = false;
+
+  /** @brief if true, then all samples have been spilled */
+  bool m_is_spilled = false;
+
+  /** During spilling, the conduit file pathnames are written to this file */
+  std::ofstream m_metadata;
+
+  /** @brief Base directory for spilling (offloading) conduit nodes */
+  std::string m_spill_dir_base;
+
+  /** @brief Used to form the directory path for spilling conduit nodes */
+  int m_cur_spill_dir_integer = -1;
+
+  /** @brief Current directory for spilling (writing to file) conduit nodes
+   *
+   * m_cur_spill_dir = m_spill_dir_base/<m_cur_spill_dir_integer>
+   */
+  std::string m_cur_spill_dir;
+
+  /** @brief The directory to use for testing checkpointing
+   *
+   * Testing is activated by passing the cmd flag: --data_store_test_checkpoint=<dir>
+   */
+  std::string m_test_dir;
+
+  /** @brief Contains the number of conduit nodes that have been written to m_cur_spill_dir
+   *
+   * When m_num_files_in_cur_spill_dir == m_max_files_per_directory,
+   * m_cur_spill_dir_integer is incremented and a new m_cur_spill_dir is created
+   */
+  int m_num_files_in_cur_spill_dir;
+
+  /** @brief maps data_id to m_cur_spill_dir_integer */
+  std::unordered_map<int, int> m_spilled_nodes;
+
+  /// used in set_conduit_node(...)
+  std::mutex m_mutex;
+
+  /// for use in local cache mode
+  char *m_mem_seg = 0;
+  size_t m_mem_seg_length = 0;
+  std::string m_seg_name;
+
+  const std::string m_debug_filename_base = "debug";
+  std::string m_debug_filename;
+
+  const std::string m_profile_filename_base = "data_store_profile";
+  std::string m_profile_filename;
+
+  bool m_was_loaded_from_file = false;
+  const std::string m_cereal_fn = "data_store_cereal";
+
+  /// used in spill_to_file
+  /// (actually, conduit::Node.save() writes both a
+  /// json file and a binary file, so double this number)
+  const int m_max_files_per_directory = 500;
+
+  //===========================================================
+  // timers for profiling exchange_data
+  //===========================================================
+
+  // applicable to imagenet; NA for JAG
+  double m_exchange_sample_sizes_time = 0;
+
+  // time from beginning of exchange_data_by_sample to wait_all
+  double m_start_snd_rcv_time = 0;
+
+  // time for wait_all
+  double m_wait_all_time = 0;
+
+  // time to unpack nodes received from other ranks
   double m_rebuild_time = 0;

+  // total time for exchange_mini_batch_data
+  double m_exchange_time = 0;
+
+  // sanity check:
+  //   m_start_snd_rcv_time + m_wait_all_time + m_rebuild_time
+  // should be only slightly less than m_exchange_time;
+  // Note that, for imagenet, the first call to exchange_data_by_sample
+  // involves additional communication for exchanging sample sizes
+
+  //===========================================================
+  // END: timers for profiling exchange_data
+  //===========================================================
+
   int m_cur_epoch = 0;

   bool m_is_setup = false;
@@ -178,36 +284,38 @@

   generic_data_reader *m_reader;

-  lbann_comm *m_comm;
+  lbann_comm *m_comm = nullptr;

-  /// convenience handle
+  /// convenience handles
   bool m_world_master;
-
-  /// convenience handle
   bool m_trainer_master;
-
-  /// rank in the trainer; convenience handle
   int m_rank_in_trainer;
-
-  /// number of procs in the trainer; convenience handle
+  int m_rank_in_world = -1; // -1 for debugging
   int m_np_in_trainer;

-  /// maps an index to the processor that owns the associated data
-  mutable std::unordered_map<int, int> m_owner;
+  /** @brief Maps an index to the processor that owns the associated data
+   *
+   * Must be mutable since rhs.m_owner may be modified in copy_members,
+   * in which rhs is const.
+   */
+  //TODO: make unordered map; for development want map() for ordered printing
+  mutable std::map<int, int> m_owner;

   /// convenience handle
   const std::vector<int> *m_shuffled_indices;

-  void exchange_data_by_sample(size_t current_pos, size_t mb_size);
+  /** @brief Contains the conduit nodes that are "owned" by this rank
+   *
+   * Map data_id -> conduit::Node.
+   * Must be mutable since rhs.m_data may be modified in copy_members,
+   * in which rhs is const.
+   */
+  mutable std::unordered_map<int, conduit::Node> m_data;

   /// Contains the list of data IDs that will be received
   std::vector<int> m_recv_data_ids;
   std::unordered_map<int, int> m_recv_sample_sizes;

-  /// contains the Nodes that this processor owns;
-  /// maps data_id to conduit::Node
-  mutable std::unordered_map<int, conduit::Node> m_data;

   /// This vector contains Nodes that this processor needs for
   /// the current minibatch; this is filled in by exchange_data()
   std::unordered_map<int, conduit::Node> m_minibatch_data;
@@ -221,9 +329,27 @@ protected :
   std::vector<int> m_outgoing_msg_sizes;
   std::vector<int> m_incoming_msg_sizes;

-  /// used in exchange_data_by_super_node(); contains the super_nodes,
-  /// after they have been converted from compacted format
-  std::vector<conduit::Node> m_reconstituted;
+  /// for use when conduit Nodes have non-uniform size, e.g, imagenet
+  std::unordered_map<int, size_t> m_sample_sizes;
+
+  /// maps processor id -> set of indices (whose associated samples)
+  /// this proc needs to send. (formerly called "proc_to_indices");
+  /// this is filled in by build_indices_i_will_send()
+  std::vector<std::unordered_set<int>> m_indices_to_send;
+
+  /// maps processor id -> set of indices (whose associated samples)
+  /// this proc needs to recv from others. (formerly called "needed")
+  std::vector<std::unordered_set<int>> m_indices_to_recv;
+
+  /// offset at which the raw image will be stored in a shared memory segment;
+  /// for use in local cache mode; maps data_id to offset
+  std::unordered_map<int, size_t> m_image_offsets;
+
+  //=========================================================================
+  // methods follow
+  //=========================================================================
+
+  void exchange_data_by_sample(size_t current_pos, size_t mb_size);

   void setup_data_store_buffers();

   /// called by exchange_data
   void build_node_for_sending(const conduit::Node &node_in, conduit::Node &node_out);

   /// fills in m_owner, which maps index -> owning processor
   void exchange_owner_maps();

-  /// for use when conduit Nodes have non-uniform size, e.g, imagenet,
-  /// and when running in non-super_node mode
+  /// for use when conduit Nodes have non-uniform size, e.g, imagenet
   void exchange_sample_sizes();

-  /// maps processor id -> set of indices (whose associated samples)
-  /// this proc needs to send. (formerly called "proc_to_indices");
-  /// this is filled in by build_indices_i_will_send()
-  std::vector<std::unordered_set<int>> m_indices_to_send;

   /// fills in m_indices_to_send and returns the number of samples
   /// that will be sent
   int build_indices_i_will_send(int current_pos, int mb_size);

-  /// maps processor id -> set of indices (whose associated samples)
-  /// this proc needs to recv from others. (formerly called "needed")
-  std::vector<std::unordered_set<int>> m_indices_to_recv;

   /// fills in m_indices_to_recv and returns the number of samples
   /// that will be received
   int build_indices_i_will_recv(int current_pos, int mb_size);

   void error_check_compacted_node(const conduit::Node &nd, int data_id);

-  /// for use when conduit Nodes have non-uniform size, e.g, imagenet
-  std::unordered_map<int, size_t> m_sample_sizes;
-
-  /// used in set_conduit_node(...)
-  std::mutex m_mutex;

   /// Currently only used for imagenet.
On return, 'sizes' maps a sample_id to image size, and indices[p] contains the sample_ids that P_p owns
   /// for use in local cache mode
   void get_image_sizes(std::unordered_map<int, size_t> &sizes, std::vector<std::vector<int>> &indices);

-  /// offset at which the raw image will be stored in a shared memory segment;
-  /// for use in local cache mode; maps data_id to offset
-  std::unordered_map<int, size_t> m_image_offsets;

   /// fills in m_image_offsets for use in local cache mode
   void compute_image_offsets(std::unordered_map<int, size_t> &sizes, std::vector<std::vector<int>> &indices);
@@ -287,12 +394,101 @@ protected :

   /// for use in local cache mode
   void fillin_shared_images(const std::vector<char> &images, size_t offset);

-  /// for use in local cache mode
-  char *m_mem_seg = 0;
-  size_t m_mem_seg_length = 0;
-  std::string m_seg_name;
+  /** @brief For testing during development
+   *
+   * At the beginning of the 2nd epoch, calls write_checkpoint(),
+   * clears some variables, calls load_checkpoint then continues.
+   * To activate this test use cmd flag: --data_store_test_checkpoint=<dir>
+   */
+  void test_checkpoint(const std::string&);
+
+  /** @brief Called by test_checkpoint */
+  void print_variables();
+
+  /** @brief Called by test_checkpoint */
+  void print_partial_owner_map(int n);
+
+  std::string get_conduit_dir() const;
+  std::string get_cereal_fn() const;
+  std::string get_metadata_fn() const;
+
+
+  /** @brief Creates the directory if it does not already exist */
+  void make_dir_if_it_doesnt_exist(const std::string &dir);
+
+  /** @brief Writes conduit node to file */
+  void spill_conduit_node(const conduit::Node &node, int data_id);
+
+  /** @brief Loads conduit nodes from file into m_data */
+  void load_spilled_conduit_nodes();
+
+  /** @brief Creates directory structure, opens metadata file for output, etc
+   *
+   * This method is called for both --data_store_spill and
+   * --data_store_test_checkpoint
+   */
+  void setup_spill(const std::string &dir);
+
+  /** @brief Saves this object's state to file
+   *
+   * Here, "state" is all data, except for conduit nodes, that is
+   * needed to reload from checkpoint
+   */
+  void save_state();
+
+  /** @brief Optionally open debug and profiling files
+   *
+   * A debug file is opened for every <rank, role> pair;
+   * files are opened if the cmd flag --data_store_debug is passed.
+   * A profiling file is opened only by <world_master, role>
+   * pairs; files are opened if the cmd flag --data_store_profile is passed.
+   */
+  void open_informational_files();
+
+  /** @brief Creates a directory for spilling conduit nodes */
+  void open_next_conduit_spill_directory();
+
+  //=========================================================================
+  // functions and templates for optional profiling and debug files follow
+  //=========================================================================
+
+  void PROFILE() {
+    if (!m_profile) {
+      return;
+    }
+    (*m_profile) << std::endl;
+    flush_profile_file();
+  }
+
+  template <typename T, typename... Types>
+  void PROFILE(T var1, Types... var2) {
+    if (!m_world_master) {
+      return;
+    }
+    if (!m_profile) {
+      return;
+    }
+    (*m_profile) << var1 << " ";
+    PROFILE(var2...);
+  }
+
+  void DEBUG() {
+    if (!m_debug) {
+      return;
+    }
+    (*m_debug) << std::endl;
+    flush_debug_file();
+  }
+
+  template <typename T, typename... Types>
+  void DEBUG(T var1, Types... var2) {
+    if (!m_debug) {
+      return;
+    }
+    (*m_debug) << var1 << " ";
+    DEBUG(var2...)
; + } - std::string m_debug_filename; }; } // namespace lbann diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index a914f192c6f..dbd1af895e2 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -688,7 +688,7 @@ double generic_data_reader::get_use_percent() const { void generic_data_reader::instantiate_data_store(const std::vector& local_list_sizes) { double tm1 = get_time(); options *opts = options::get(); - if (! (opts->get_bool("use_data_store") || opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache"))) { + if (! (opts->get_bool("use_data_store") || opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache") || opts->has_string("data_store_spill"))) { if (m_data_store != nullptr) { delete m_data_store; m_data_store = nullptr; @@ -724,11 +724,12 @@ void generic_data_reader::instantiate_data_store(const std::vector& local_l m_data_store->build_preloaded_owner_map(local_list_sizes); } preload_data_store(); + m_data_store->set_is_preloaded(); if(is_master()) { std::cout << "Preload complete; time: " << get_time() - tm2 << std::endl; } - size_t n = m_data_store->get_num_indices(); + size_t n = m_data_store->get_num_global_indices(); if (n != m_shuffled_indices.size()) { LBANN_ERROR("num samples loaded: ", n, " != shuffled-indices.size(): ", m_shuffled_indices.size()); } diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index d948a9915c3..38cb5fc7ac6 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -208,7 +208,6 @@ void read_raw_data(const std::string &filename, std::vector &data) { void image_data_reader::preload_data_store() { - m_data_store->set_preload(); options *opts = options::get(); if (is_master()) std::cout << "Starting image_data_reader::preload_data_store; num indices: " << m_shuffled_indices.size() << std::endl; diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index 9b277cc1d9d..916555ebca6 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -922,7 +922,6 @@ void data_reader_jag_conduit::load() { void data_reader_jag_conduit::preload_data_store() { - m_data_store->set_preload(); conduit::Node work; const std::string key; // key = "" is intentional diff --git a/src/data_readers/data_reader_numpy_npz_conduit.cpp b/src/data_readers/data_reader_numpy_npz_conduit.cpp index f5c6b5d049f..30e80a7146f 100644 --- a/src/data_readers/data_reader_numpy_npz_conduit.cpp +++ b/src/data_readers/data_reader_numpy_npz_conduit.cpp @@ -141,7 +141,6 @@ void numpy_npz_conduit_reader::preload_data_store() { LBANN_ERROR("numpy_npz_conduit_reader currently assumes you are using 100% of the data set; you specified get_absolute_sample_count() = ", count, " and get_use_percent() = ", use_percent, "; please ask Dave Hysom to modify the code, if you want to use less than 100%"); } - m_data_store->set_preload(); int rank = m_comm->get_rank_in_trainer(); std::unordered_set label_classes; diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index ae21fa47e01..bf7cda64d1c 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -32,21 +32,29 @@ #include "lbann/utils/exception.hpp" #include "lbann/utils/options.hpp" #include "lbann/utils/timer.hpp" +#include "lbann/utils/file_utils.hpp" #include #include #include +#include 
#include #include #include #include #include +#include +#include + + +#include +#include +#include namespace lbann { data_store_conduit::data_store_conduit( generic_data_reader *reader) : m_reader(reader) { - m_comm = m_reader->get_comm(); if (m_comm == nullptr) { LBANN_ERROR(" m_comm is nullptr"); @@ -55,38 +63,74 @@ data_store_conduit::data_store_conduit( m_world_master = m_comm->am_world_master(); m_trainer_master = m_comm->am_trainer_master(); m_rank_in_trainer = m_comm->get_rank_in_trainer(); + m_rank_in_world = m_comm->get_rank_in_world(); m_np_in_trainer = m_comm->get_procs_per_trainer(); + open_informational_files(); + options *opts = options::get(); - if (opts->get_bool("debug")) { - std::stringstream ss; - ss << "debug_" << m_reader->get_role() << "." << m_comm->get_rank_in_world(); - m_output = new std::ofstream(ss.str().c_str()); - m_debug_filename = ss.str(); - if (m_world_master) { - std::cerr << "opened " << ss.str() << " for writing\n"; + std::string spill_dir; + + // error check for a single "spill" flag + if (opts->has_string("data_store_test_checkpoint") + && opts->has_string("data_store_spill")) { + LBANN_ERROR("you passed both --data_store_test_checkpoint and --data_store_spill; please use one or the other or none, but not both"); + } + + // error check if running in checkpoint test mode + if (opts->has_string("data_store_test_checkpoint")) { + std::string c = opts->get_string("data_store_test_checkpoint"); + if (c == "1") { + LBANN_ERROR("--data_store_test_checkpoint=1; you probably forgot to specify the spill directory; you must specify --data_store_test_checkpoint='"); + } else { + if (c == "lassen") { + char * val = std::getenv("BBPATH"); + if (val == NULL) { + LBANN_ERROR("std::getenv(\"BBPATH\") returned NULL; unable to use burst buffer"); + } + std::string cc(val); + c = cc + "/data_store"; + } + spill_dir = c; + m_test_dir = c; + } + } + + // error check if running in spill mode + if (opts->has_string("data_store_spill")) { + const std::string c = opts->get_string("data_store_spill"); + if (c == "1") { + LBANN_ERROR("--data_store_spill=1; you probably forgot to specify the spill directory; you must specify --data_store_spill='"); + } else { + spill_dir = c; } + } + + if (spill_dir != "") { + m_spill_dir_base = spill_dir; } + // error check: if running in local cache mode, must preload + // TODO: future work -- modify so preload is not necessary m_is_local_cache = opts->get_bool("data_store_cache"); - m_preload = opts->get_bool("preload_data_store"); - if (m_is_local_cache && !m_preload) { + if (m_is_local_cache && !opts->get_bool("preload_data_store")) { LBANN_ERROR("data_store_cache is currently only implemented for preload mode; this will change in the future. 
For now, pleas pass both flags: data_store_cache and --preload_data_store"); } - if (m_world_master) { - if (m_is_local_cache) { - std::cerr << "data_store_conduit is running in local_cache mode\n"; - } else { - std::cerr << "data_store_conduit is running in multi-message mode\n"; - } + if (m_is_local_cache) { + PROFILE("data_store_conduit is running in local_cache mode"); + } else { + PROFILE("data_store_conduit is running in multi-message mode"); } } data_store_conduit::~data_store_conduit() { - if (m_output) { - m_output->close(); + if (m_debug) { + m_debug->close(); + } + if (m_profile) { + m_profile->close(); } if (m_is_local_cache && m_mem_seg) { int sanity = shm_unlink(m_seg_name.c_str()); @@ -119,15 +163,9 @@ data_store_conduit& data_store_conduit::operator=(const data_store_conduit& rhs) void data_store_conduit::set_data_reader_ptr(generic_data_reader *reader) { m_reader = reader; - if (options::get()->get_bool("debug")) { - std::stringstream ss; - ss << "debug_" << m_reader->get_role() << "." << m_comm->get_rank_in_world(); - m_output = new std::ofstream(ss.str().c_str()); - m_debug_filename = ss.str(); - if (m_world_master) { - std::cerr << "data_store_conduit::set_data_reader_ptr; opened " << ss.str() << " for writing\n"; - } - } + m_debug = 0; + m_profile = 0; + open_informational_files(); } void data_store_conduit::copy_members(const data_store_conduit& rhs, const std::vector& ds_sample_move_list) { @@ -139,7 +177,7 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: m_is_local_cache = rhs.m_is_local_cache; m_node_sizes_vary = rhs.m_node_sizes_vary; m_have_sample_sizes = rhs.m_have_sample_sizes; - m_reader = rhs.m_reader; + //m_reader = rhs.m_reader; m_comm = rhs.m_comm; m_world_master = rhs.m_world_master; m_trainer_master = rhs.m_trainer_master; @@ -152,12 +190,16 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: m_mem_seg_length = rhs.m_mem_seg_length; m_seg_name = rhs.m_seg_name; m_image_offsets = rhs.m_image_offsets; - if (m_output) { - LBANN_ERROR("m_output should be nullptr"); - } + + m_spill = rhs.m_spill; + m_is_spilled = rhs.m_is_spilled; + m_spill_dir_base = rhs.m_spill_dir_base; + m_cur_spill_dir_integer = rhs.m_cur_spill_dir_integer; + m_cur_spill_dir = rhs.m_cur_spill_dir; + m_num_files_in_cur_spill_dir = rhs.m_num_files_in_cur_spill_dir; /// This block needed when carving a validation set from the training set - //if (options::get()->get_bool("debug") && !m_output) { + m_my_num_indices = 0; if(ds_sample_move_list.size() == 0) { m_data = rhs.m_data; } else { @@ -178,6 +220,7 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: n2[names[0]][t] = rhs.m_data[i]["data"][names[0]][t]; } build_node_for_sending(n2, m_data[i]); + ++m_my_num_indices; } rhs.m_data.erase(i); @@ -204,33 +247,25 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: m_outgoing_msg_sizes = rhs.m_outgoing_msg_sizes; m_incoming_msg_sizes = rhs.m_incoming_msg_sizes; m_compacted_sample_size = rhs.m_compacted_sample_size; - m_reconstituted = rhs.m_reconstituted; m_indices_to_send = rhs.m_indices_to_send; m_indices_to_recv = rhs.m_indices_to_recv; + + open_informational_files(); } void data_store_conduit::setup(int mini_batch_size) { - if (m_world_master) { - std::cerr << "starting data_store_conduit::setup() for role: " << m_reader->get_role() << "\n"; - if (m_is_local_cache) { - std::cerr << "data store mode: local cache\n"; - } else { - std::cerr << "data store mode: 
exchange_data via individual samples\n"; - } - } - double tm1 = get_time(); - m_owner_map_mb_size = mini_batch_size; + PROFILE("starting setup()"); - m_is_setup = true; + m_owner_map_mb_size = mini_batch_size; if (m_is_local_cache && m_preload) { preload_local_cache(); } + m_is_setup = true; - if (m_world_master) { - std::cerr << "TIME for data_store_conduit setup: " << get_time() - tm1 << "\n"; - } + PROFILE("time for data_store_conduit setup: ", (get_time()-tm1), + " (will be insignificant unless running in local cache mode)"); } void data_store_conduit::setup_data_store_buffers() { @@ -242,33 +277,64 @@ void data_store_conduit::setup_data_store_buffers() { m_outgoing_msg_sizes.resize(m_np_in_trainer); m_incoming_msg_sizes.resize(m_np_in_trainer); m_recv_buffer.resize(m_np_in_trainer); - m_reconstituted.resize(m_np_in_trainer); } -void data_store_conduit::set_preloaded_conduit_node(int data_id, conduit::Node &node) { +void data_store_conduit::spill_preloaded_conduit_node(int data_id, const conduit::Node &node) { // note: at this point m_data[data_id] = node - if (m_output) { - (*m_output) << "set_preloaded_conduit_node: " << data_id << std::endl; + conduit::Node n3 = node; + { + std::lock_guard lock(m_mutex); + build_node_for_sending(node, n3); + } + if (!m_node_sizes_vary) { + error_check_compacted_node(n3, data_id); + } else { + std::lock_guard lock(m_mutex); + m_sample_sizes[data_id] = n3.total_bytes_compact(); + } + + { + std::lock_guard lock(m_mutex); + spill_conduit_node(node, data_id); + m_spilled_nodes[data_id] = m_cur_spill_dir_integer; + m_data.erase(data_id); + } +} + +void data_store_conduit::set_preloaded_conduit_node(int data_id, const conduit::Node &node) { + // note: at this point m_data[data_id] = node + + { + std::lock_guard lock(m_mutex); + ++m_my_num_indices; + } + + if (m_spill) { + spill_preloaded_conduit_node(data_id, node); + return; + } + + { + conduit::Node n2 = node; + std::lock_guard lock(m_mutex); + build_node_for_sending(n2, m_data[data_id]); } - conduit::Node n2 = node; - m_mutex.lock(); - build_node_for_sending(n2, m_data[data_id]); - m_mutex.unlock(); if (!m_node_sizes_vary) { error_check_compacted_node(m_data[data_id], data_id); } else { - m_mutex.lock(); + std::lock_guard lock(m_mutex); m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); - m_mutex.unlock(); } } void data_store_conduit::error_check_compacted_node(const conduit::Node &nd, int data_id) { + if (m_node_sizes_vary) { + return; + } + std::lock_guard lock(m_mutex); if (m_compacted_sample_size == 0) { m_compacted_sample_size = nd.total_bytes_compact(); - if (m_world_master) { - std::cout << "num bytes for nodes to be transmitted: " << nd.total_bytes_compact() << " per node" << std::endl; - } + PROFILE("num bytes for nodes to be transmitted: ", nd.total_bytes_compact(), " per node"); } else if (m_compacted_sample_size != nd.total_bytes_compact() && !m_node_sizes_vary) { LBANN_ERROR("Conduit node being added data_id: ", data_id, " is not the same size as existing nodes in the data_store ", @@ -287,64 +353,77 @@ void data_store_conduit::error_check_compacted_node(const conduit::Node &nd, int } +//n.b. 
Do not put any PROFILE or DEBUG statements in this method, +// since the threading from the data_reader will cause you grief void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool already_have) { - - if (m_output) { - (*m_output) << "set_conduit_node: " << data_id << std::endl; - } + std::lock_guard lock(m_mutex); + // TODO: test whether having multiple mutexes below is better than + // locking this entire call with a single mutex + ++m_my_num_indices; if (m_is_local_cache && m_preload) { LBANN_ERROR("you called data_store_conduit::set_conduit_node, but you're running in local cache mode with preloading; something is broken; please contact Dave Hysom"); } - //m_mutex.lock(); - if (already_have == false && m_data.find(data_id) != m_data.end()) { - LBANN_ERROR("duplicate data_id: ", data_id, " in data_store_conduit::set_conduit_node; role: ", m_reader->get_role()); - } + { + //std::lock_guard lock(m_mutex); + if (already_have == false && m_data.find(data_id) != m_data.end()) { + LBANN_ERROR("duplicate data_id: ", data_id, " in data_store_conduit::set_conduit_node; role: ", m_reader->get_role()); + } + } if (already_have && is_local_cache()) { if (m_data.find(data_id) == m_data.end()) { LBANN_ERROR("you claim the passed node was obtained from this data_store, but the data_id (", data_id, ") doesn't exist in m_data"); } - //m_mutex.unlock(); return; } if (is_local_cache()) { - m_mutex.lock(); m_data[data_id] = node; - m_mutex.unlock(); } else { - m_mutex.lock(); - m_owner[data_id] = m_rank_in_trainer; - build_node_for_sending(node, m_data[data_id]); - error_check_compacted_node(m_data[data_id], data_id); - m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); - m_mutex.unlock(); + if (m_spill) { + PROFILE("spill!\n"); + + //TODO: rethink how we go about exchanging sample sizes. + //currently, we exchange sample sizes a single time, and + //the exchange is for all samples. To make this work with + //spilling we need to compute the sample size by building + //a node_for_sending (below), then we throw it away. + //Also, see not in copy_members() about problems with the + //schema that cause us to rebuild the node_for_sending after + //copying or loading from disk. I need to revisit this and + //figure out what's going on. + conduit::Node n2; + build_node_for_sending(node, n2); + error_check_compacted_node(n2, data_id); + { + // std::lock_guard lock(m_mutex); + LBANN_ERROR("NOT YET IMPLEMENTED"); + m_owner[data_id] = m_rank_in_trainer; + m_sample_sizes[data_id] = n2.total_bytes_compact(); + spill_conduit_node(node, data_id); + m_spilled_nodes[data_id] = m_cur_spill_dir_integer; + } + } + + else { + { + // std::lock_guard lock(m_mutex); + m_owner[data_id] = m_rank_in_trainer; + build_node_for_sending(node, m_data[data_id]); + m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact(); + } + error_check_compacted_node(m_data[data_id], data_id); + } } } +//n.b. Do not put any PROFILE or DEBUG statements in this method, +// since the threading from the data_reader will cause you grief const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { - if (m_output) { - (*m_output) << "get_conduit_node: " << data_id << std::endl; - } - /** - * dah: commenting this out since it gives a false positive for test - * case with unshuffled indices. Since we currently send samples - * to ourselves, they should be in m_minibatch_data. 
The following - * block is only useful if, at some future time, we do not send - * indices to ourself - std::unordered_map::const_iterator t = m_data.find(data_id); - if (t != m_data.end()) { - if(m_super_node) { - return t->second; - } else { - return t->second["data"]; - } - } - */ if (is_local_cache()) { std::unordered_map::const_iterator t3 = m_data.find(data_id); if (t3 == m_data.end()) { @@ -362,13 +441,6 @@ const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { return t3->second["data"]; } LBANN_ERROR("failed to find data_id: ", data_id, " in m_minibatch_data; m_minibatch_data.size: ", m_minibatch_data.size(), " and also failed to find it in m_data; m_data.size: ", m_data.size(), "; role: ", m_reader->get_role()); - if (m_output) { - (*m_output) << "failed to find data_id: " << data_id << " in m_minibatch_data; my m_minibatch_data indices: "; - for (auto t : m_minibatch_data) { - (*m_output) << t.first << " "; - } - (*m_output) << std::endl; - } } return t2->second; @@ -415,20 +487,25 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s LBANN_ERROR("setup(mb_size) has not been called"); } + double tm5 = get_time(); + /// exchange sample sizes if they are non-uniform (imagenet); /// this will only be called once, during the first call to /// exchange_data_by_sample at the beginning of the 2nd epoch, /// or during the first call th exchange_data_by_sample() during /// the first epoch if preloading if (m_node_sizes_vary && !m_have_sample_sizes) { + double tm3 = get_time(); exchange_sample_sizes(); + m_exchange_sample_sizes_time += (get_time() - tm3); } - if (m_output) { - (*m_output) << "starting data_store_conduit::exchange_data_by_sample; mb_size: " << mb_size << std::endl; + int num_send_req = build_indices_i_will_send(current_pos, mb_size); + if (m_spill) { + // TODO + load_spilled_conduit_nodes(); } - int num_send_req = build_indices_i_will_send(current_pos, mb_size); int num_recv_req = build_indices_i_will_recv(current_pos, mb_size); m_send_requests.resize(num_send_req); @@ -468,10 +545,6 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s sz = m_sample_sizes[index]; } - if (m_output) { - (*m_output) << "sending " << index << " size: " << sz << " to " << p << std::endl; - } - m_comm->nb_tagged_send(s, sz, p, index, m_send_requests[ss++], m_comm->get_trainer_comm()); } } @@ -513,16 +586,20 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s LBANN_ERROR("m_recv_requests.size != m_recv_buffer.size; m_recv_requests: ", m_recv_requests.size(), " m_recv_buffer.size: ", m_recv_buffer.size()); } + m_start_snd_rcv_time += (get_time() - tm5); + // wait for all msgs to complete + tm5 = get_time(); m_comm->wait_all(m_send_requests); m_comm->wait_all(m_recv_requests); + m_wait_all_time += (get_time() - tm5); //======================================================================== //part 3: construct the Nodes needed by me for the current minibatch + tm5 = get_time(); conduit::Node nd; m_minibatch_data.clear(); - double tm2 = get_time(); for (size_t j=0; j < m_recv_buffer.size(); j++) { conduit::uint8 *n_buff_ptr = (conduit::uint8*)m_recv_buffer[j].data_ptr(); conduit::Node n_msg; @@ -538,7 +615,12 @@ void data_store_conduit::exchange_data_by_sample(size_t current_pos, size_t mb_s int data_id = m_recv_data_ids[j]; m_minibatch_data[data_id].set_external(n_msg["data"]); } - m_rebuild_time += (get_time() - tm2); + m_rebuild_time += (get_time() - tm5); + + if (m_spill) { + // 
TODO + m_data.clear(); + } } int data_store_conduit::build_indices_i_will_recv(int current_pos, int mb_size) { @@ -560,20 +642,22 @@ int data_store_conduit::build_indices_i_will_send(int current_pos, int mb_size) m_indices_to_send.clear(); m_indices_to_send.resize(m_np_in_trainer); int k = 0; - if (m_output) { - (*m_output) << "build_indices_i_will_send; cur pos: " << current_pos << " mb_size: " << mb_size << " m_data.size: " << m_data.size() << "\n"; - } + DEBUG("build_indices_i_will_send; cur pos: ", current_pos, " mb_size: ", mb_size, " m_data.size: ", m_data.size()); for (int i = current_pos; i < current_pos + mb_size; i++) { auto index = (*m_shuffled_indices)[i]; /// If this rank owns the index send it to the (i%m_np)'th rank + bool is_mine = false; if (m_data.find(index) != m_data.end()) { + is_mine = true; + } else if (m_spilled_nodes.find(index) != m_spilled_nodes.end()) { + is_mine = true; + } + if (is_mine) { m_indices_to_send[(i % m_owner_map_mb_size) % m_np_in_trainer].insert(index); // Sanity check if (m_owner[index] != m_rank_in_trainer) { - std::stringstream s; - s << "error for i: "<& per_r } } -#if 0 -void data_store_conduit::build_owner_map(int mini_batch_size) { - if (m_world_master) std::cerr << "starting data_store_conduit::build_owner_map for role: " << m_reader->get_role() << " with mini_batch_size: " << mini_batch_size << " num indices: " << m_shuffled_indices->size() << "\n"; - if (mini_batch_size == 0) { - LBANN_ERROR("mini_batch_size == 0; can't build owner_map"); - } - m_owner.clear(); - m_owner_map_mb_size = mini_batch_size; - for (size_t i = 0; i < m_shuffled_indices->size(); i++) { - auto index = (*m_shuffled_indices)[i]; - /// To compute the owner index first find its position inside of - /// the mini-batch (mod mini-batch size) and then find how it is - /// striped across the ranks in the trainer - m_owner[index] = (i % m_owner_map_mb_size) % m_np_in_trainer; - } -} -#endif - const conduit::Node & data_store_conduit::get_random_node() const { size_t sz = m_data.size(); @@ -640,9 +706,7 @@ conduit::Node & data_store_conduit::get_empty_node(int data_id) { } void data_store_conduit::purge_unused_samples(const std::vector& indices) { - if (m_output) { - (*m_output) << " starting purge_unused_samples; indices.size(): " << indices.size() << " data.size(): " << m_data.size() << std::endl; - } + DEBUG(" starting purge_unused_samples; indices.size(): ", indices.size(), " data.size(): ", m_data.size()); /// Remove unused indices from the data and owner maps for(auto&& i : indices) { if(m_data.find(i) != m_data.end()){ @@ -652,9 +716,7 @@ void data_store_conduit::purge_unused_samples(const std::vector& indices) { m_owner.erase(i); } } - if (m_output) { - (*m_output) << " leaving purge_unused_samples; indices.size(): " << indices.size() << " data.size(): " << m_data.size() << std::endl; - } + DEBUG("leaving purge_unused_samples; indices.size(): ", indices.size(), " data.size(): ", m_data.size()); } void data_store_conduit::compact_nodes() { @@ -672,11 +734,7 @@ void data_store_conduit::compact_nodes() { int data_store_conduit::get_index_owner(int idx) { if (m_owner.find(idx) == m_owner.end()) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << " idx: " << idx << " was not found in the m_owner map;" - << " map size: " << m_owner.size(); - throw lbann_exception(err.str()); + LBANN_ERROR(" idx: ", idx, " was not found in the m_owner map; map size: ", m_owner.size()); } return m_owner[idx]; } @@ -830,9 +888,6 @@ void 
data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string bool data_store_conduit::has_conduit_node(int data_id) const { std::unordered_map::const_iterator t = m_data.find(data_id); - if (m_output) { - (*m_output) << "has_conduit_node( " << data_id << " ) = " << (t == m_data.end()) << std::endl; - } return t != m_data.end(); } @@ -841,17 +896,14 @@ void data_store_conduit::set_shuffled_indices(const std::vector *indices) { } void data_store_conduit::exchange_sample_sizes() { - if (m_output) { - (*m_output) << "starting data_store_conduit::exchange_sample_sizes" << std::endl; - } - + DEBUG("starting data_store_conduit::exchange_sample_sizes"); int my_count = m_sample_sizes.size(); std::vector all_counts(m_np_in_trainer); m_comm->all_gather(&my_count, 1, all_counts.data(), 1, m_comm->get_trainer_comm()); - if (m_output) { + if (m_debug) { for (size_t h=0; h other_sizes; + std::vector others; for (int k=0; kbroadcast(k, my_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); } else { - m_comm->broadcast(k, other_sizes.data(), all_counts[k]*2, m_comm->get_trainer_comm()); - - for (size_t i=0; ibroadcast(k, others.data(), all_counts[k]*2, m_comm->get_trainer_comm()); + + for (size_t i=0; i &work, std::unordered_map< } work.resize(n); - if (m_output) { - (*m_output) << "data_store_conduit::read_files; requested work size: " << n << std::endl; - } + DEBUG("data_store_conduit::read_files; requested work size: ", n); //get the list of images from the data reader image_data_reader *image_reader = dynamic_cast(m_reader); @@ -1180,66 +1226,131 @@ void data_store_conduit::exchange_images(std::vector &work, std::unordered } void data_store_conduit::exchange_owner_maps() { - if (m_output) { - (*m_output) << "\nstarting data_store_conduit::exchange_owner_maps\n\n"; + PROFILE("starting exchange_owner_maps;", + "my owner map size: ", m_owner.size()); + DEBUG("starting exchange_owner_maps;", + "size: ", m_owner.size()); + if (m_reader->get_role() == "validate" && m_debug) { + (*m_debug) << "\nmy owner map:\n"; + for (auto t : m_owner) { + (*m_debug) << " " << t.first << " is owned by " << t.second << std::endl; + } } - int my_count = m_owner.size(); + + int my_count = m_my_num_indices; std::vector all_counts(m_np_in_trainer); m_comm->all_gather(&my_count, 1, all_counts.data(), 1, m_comm->get_trainer_comm()); - std::vector my_sizes(m_owner.size()); + std::vector my_sizes(m_my_num_indices); size_t j = 0; for (auto t : m_owner) { my_sizes[j++] = t.first; } - std::vector other_sizes; + std::vector others; for (int k=0; kbroadcast(k, my_sizes.data(), all_counts[k], m_comm->get_trainer_comm()); } else { - m_comm->broadcast(k, other_sizes.data(), all_counts[k], m_comm->get_trainer_comm()); - for (size_t i=0; ibroadcast(k, others.data(), all_counts[k], m_comm->get_trainer_comm()); + for (size_t i=0; iget_role(), "; m_owner[",other_sizes[i],"] = ", m_owner[other_sizes[i]]); + LBANN_ERROR("duplicate data_id: ", others[i], " role: ", m_reader->get_role(), "; m_owner[", others[i],"] = ", m_owner[others[i]], " for role: ", m_reader->get_role(), " m_owner.size: ", m_owner.size(), " m_data.size(): ", m_data.size()); } - m_owner[other_sizes[i]] = k; + m_owner[others[i]] = k; } } } + PROFILE("leaving data_store_conduit::exchange_owner_maps\n", + "my owner map size: ", m_owner.size()); } void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_size) { - double tm1 = get_time(); if (is_local_cache()) { return; } + double tm1 = get_time(); + if (m_reader->at_new_epoch()) { - if 
(m_world_master && m_cur_epoch > 0) { - std::cout << "time for exchange_mini_batch_data calls: " - << m_exchange_time << std::endl - << "time for constructing conduit Nodes: " << m_rebuild_time - << std::endl; - std::cout << std::endl; - m_exchange_time = 0.; + PROFILE("At new epoch; m_cur_epoch: ", m_cur_epoch); + if (m_cur_epoch > 0) { + PROFILE( + "\n", + "Exchange Data Timing:\n", + " exchange_mini_batch_data: ", m_exchange_time, "\n", + " exchange sample sizes: ", m_exchange_sample_sizes_time, "\n", + " start sends and rcvs: ", m_start_snd_rcv_time, "\n", + " wait alls: ", m_wait_all_time, "\n", + " unpacking rcvd nodes: ", m_rebuild_time, "\n\n"); + + if (options::get()->get_bool("data_store_min_max_timing")) { + std::vector send; + static int count = 5; + send.reserve(count); + send.push_back(m_exchange_time); + send.push_back(m_exchange_sample_sizes_time); + send.push_back(m_start_snd_rcv_time); + send.push_back(m_wait_all_time); + send.push_back(m_rebuild_time); + if (m_trainer_master) { + std::vector rcv_max(count); + std::vector rcv_min(count); + m_comm->trainer_reduce(send.data(), count, rcv_max.data(), El::mpi::MAX); + m_comm->trainer_reduce(send.data(), count, rcv_min.data(), El::mpi::MIN); + PROFILE( + "Exchange Data MAX Timing:\n", + " exchange_mini_batch_data: ", rcv_max[0], "\n", + " exchange sample sizes: ", rcv_max[1], "\n", + " start sends and rcvs: ", rcv_max[2], "\n", + " wait alls: ", rcv_max[3], "\n", + " unpacking rcvd nodes: ", rcv_max[4], "\n\n"); + PROFILE( + "Exchange Data MIN Timing:\n", + " exchange_mini_batch_data: ", rcv_min[0], "\n", + " exchange sample sizes: ", rcv_min[1], "\n", + " start sends and rcvs: ", rcv_min[2], "\n", + " wait alls: ", rcv_min[3], "\n", + " unpacking rcvd nodes: ", rcv_min[4], "\n\n"); + } else { + m_comm->trainer_reduce(send.data(), count, 0, El::mpi::MAX); + m_comm->trainer_reduce(send.data(), count, 0, El::mpi::MIN); + } + } + + m_exchange_sample_sizes_time = 0.; + m_start_snd_rcv_time = 0.; + m_wait_all_time = 0.; m_rebuild_time = 0.; + m_exchange_time = 0.; } ++m_cur_epoch; } - if (m_reader->at_new_epoch() && !m_preload && !m_is_local_cache && m_cur_epoch == 1) { + // when not running in preload mode, exchange owner maps after the 1st epoch + if (m_reader->at_new_epoch() && !options::get()->get_bool("preload_data_store") && !is_local_cache() && m_cur_epoch == 1) { + PROFILE("calling exchange_owner_maps"); exchange_owner_maps(); + /* + * TODO + if (m_spill) { + m_is_spilled = true; + m_metadata.close(); + save_state(); + } + */ + } + + if (m_test_dir != "" && m_reader->at_new_epoch() && !is_local_cache() && m_cur_epoch == 1) { + test_checkpoint(m_test_dir); } exchange_data_by_sample(current_pos, mb_size); @@ -1247,17 +1358,349 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ } void data_store_conduit::flush_debug_file() { - if (!m_output) { + if (!m_debug) { return; } - m_output->close(); - m_output->open(m_debug_filename.c_str(), std::ios::app); + m_debug->close(); + m_debug->open(m_debug_filename.c_str(), std::ios::app); } -size_t data_store_conduit::get_num_indices() const { - size_t num = m_data.size(); - size_t n = m_comm->trainer_allreduce(num); - return n; +void data_store_conduit::flush_profile_file() { + if (!m_profile) { + return; + } + m_profile->close(); + m_profile->open(m_profile_filename.c_str(), std::ios::app); +} + +size_t data_store_conduit::get_num_global_indices() const { + return m_comm->trainer_allreduce(m_my_num_indices); +} + +void 
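// ----------------------------------------------------------------------
// Aside: test_checkpoint() below exercises a write / clear / reload /
// compare cycle against a saved "sanity" copy of the owner map. The
// verification step in isolation, as a sketch (hypothetical helper, not
// part of this patch):
#include <map>
#include <stdexcept>

void verify_owner_map_roundtrip(const std::map<int,int> &before,
                                const std::map<int,int> &after) {
  // every entry written out must come back with the same owner
  for (const auto &t : before) {
    auto it = after.find(t.first);
    if (it == after.end() || it->second != t.second) {
      throw std::runtime_error("owner map changed across checkpoint round-trip");
    }
  }
}
// ----------------------------------------------------------------------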
data_store_conduit::test_checkpoint(const std::string &checkpoint_dir) {
+  if (m_world_master) {
+    std::cerr << "starting data_store_conduit::test_checkpoint for role: "
+              << m_reader->get_role() << std::endl;
+    print_partial_owner_map(10);
+    std::cerr << "\nHere are some private variables before clearing them:\n";
+    print_variables();
+    std::cerr << "\nCalling write_checkpoint()" << std::endl;
+  }
+  write_checkpoint(checkpoint_dir);
+
+  // clear or reset private variables
+  auto sanity = m_owner;
+  m_owner.clear();
+  m_sample_sizes.clear();
+  m_data.clear();
+  m_cur_epoch = -1;
+
+  m_is_setup = false;
+  m_preload = false;
+  m_explicit_loading = true;
+  m_owner_map_mb_size = 0;
+  m_compacted_sample_size = 0;
+  m_node_sizes_vary = true;
+
+  if (m_world_master) {
+    std::cerr << "\nHere are some private variables after clearing them:\n";
+    print_variables();
+  }
+
+  if (m_world_master) {
+    std::cerr << "Cleared the owner map; m_owner.size(): " << m_owner.size()
+              << std::endl
+              << "Calling load_checkpoint" << std::endl;
+  }
+  load_checkpoint(checkpoint_dir, nullptr);
+  if (m_world_master) {
+    std::cerr << "Here is part of the re-loaded owner map; map.size(): " << m_owner.size() << std::endl;
+    print_partial_owner_map(10);
+    std::cerr << "\nHere are some private variables after reloading:\n";
+    print_variables();
+  }
+
+  // check that the owner map was correctly loaded
+  for (auto t : m_owner) {
+    if (sanity.find(t.first) == sanity.end()) {
+      LBANN_ERROR("sanity.find(t.first) == sanity.end() for t.first= ", t.first);
+    } else if (sanity[t.first] != m_owner[t.first]) {
+      LBANN_ERROR("sanity[t.first] != m_owner[t.first] for t.first= ", t.first, " and m_owner[t.first]= ", m_owner[t.first]);
+    }
+  }
+
+  m_comm->global_barrier();
+}
+
+void data_store_conduit::make_dir_if_it_doesnt_exist(const std::string &dir_name) {
+  int node_rank = m_comm->get_rank_in_node();
+  if (node_rank == 0) {
+    bool exists = file::directory_exists(dir_name);
+    if (!exists) {
+      if (m_world_master) {
+        std::cerr << "data_store_conduit; the directory '" << dir_name << "' doesn't exist; creating it\n";
+      }
+      file::make_directory(dir_name);
+    }
+  }
+}
+
+void data_store_conduit::setup_spill(const std::string &base_dir) {
+  m_spill_dir_base = base_dir;
+  m_spill = true;
+  m_cur_spill_dir_integer = -1;
+  m_num_files_in_cur_spill_dir = m_max_files_per_directory;
+  PROFILE("base directory for spilling: ", m_spill_dir_base);
+
+  // create directory structure for spilling data
+  make_dir_if_it_doesnt_exist(m_spill_dir_base);
+  m_comm->trainer_barrier();
+  make_dir_if_it_doesnt_exist(get_conduit_dir());
+  PROFILE("base directory for spilling conduit nodes: ", get_conduit_dir());
+
+  // open metadata file; this will contain the file pathnames of spilled
+  // conduit nodes
+  const std::string fnn = get_metadata_fn();
+  m_metadata.open(fnn.c_str());
+  if (!m_metadata) {
+    LBANN_ERROR("failed to open ", fnn, " for writing");
+  }
+  PROFILE("will write metadata to file: ", get_metadata_fn());
+
+  // n.b.
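// ----------------------------------------------------------------------
// Aside: spilled nodes are sharded across numbered subdirectories so
// that no single directory ever holds more than m_max_files_per_directory
// files (see open_next_conduit_spill_directory() further down). The path
// arithmetic in isolation, as a sketch with illustrative names:
#include <string>

std::string spill_path_for(int file_counter, int max_files_per_directory,
                           const std::string &conduit_dir, int data_id) {
  const int subdir = file_counter / max_files_per_directory;
  return conduit_dir + "/" + std::to_string(subdir) + "/" + std::to_string(data_id);
}
// ----------------------------------------------------------------------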
must do this here, instead of only in spill_conduit_node(),
+  // in case a reader (e.g., validation reader) has no data
+  open_next_conduit_spill_directory();
+}
+
+void data_store_conduit::write_checkpoint(std::string dir_name) {
+  // if we're spilling data, everything has already been written to file
+  if (m_is_spilled) {
+    return;
+  }
+  double tm1 = get_time();
+  setup_spill(dir_name);
+
+  // cerealize all non-conduit::Node variables
+  save_state();
+
+  // save conduit Nodes
+  m_metadata << get_conduit_dir() << "\n";
+  DEBUG("m_data.size: ", m_data.size());
+  for (auto t : m_data) {
+    spill_conduit_node(t.second["data"], t.first);
+  }
+  m_metadata.close();
+  PROFILE("time to write checkpoint: ", (get_time() - tm1));
+}
+
+void data_store_conduit::save_state() {
+  // checkpoint remaining state using cereal
+  const std::string fn = get_cereal_fn();
+  std::ofstream os(fn);
+  if (!os) {
+    LBANN_ERROR("failed to open ", fn, " for writing");
+  }
+
+  {
+    cereal::XMLOutputArchive archive(os);
+    archive(CEREAL_NVP(m_cur_epoch),
+            CEREAL_NVP(m_is_setup),
+            CEREAL_NVP(m_preload),
+            CEREAL_NVP(m_explicit_loading),
+            CEREAL_NVP(m_owner_map_mb_size),
+            CEREAL_NVP(m_compacted_sample_size),
+            CEREAL_NVP(m_is_local_cache),
+            CEREAL_NVP(m_node_sizes_vary),
+            CEREAL_NVP(m_have_sample_sizes),
+            CEREAL_NVP(m_owner),
+            CEREAL_NVP(m_sample_sizes));
+  }
+  os.close();
+}
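// ----------------------------------------------------------------------
// Aside: save_state() above and load_checkpoint() below rely on cereal's
// XML archives for everything except the conduit Nodes. A minimal
// self-contained round-trip sketch (assumes the cereal headers are
// available; toy_state and its fields are illustrative only):
#include <cereal/archives/xml.hpp>
#include <cereal/types/unordered_map.hpp>
#include <fstream>
#include <string>
#include <unordered_map>

struct toy_state {
  int cur_epoch = 3;
  std::unordered_map<int,int> owner{{0, 0}, {1, 1}};
  template <class Archive> void serialize(Archive &ar) {
    ar(CEREAL_NVP(cur_epoch), CEREAL_NVP(owner));
  }
};

void roundtrip(const std::string &fn) {
  toy_state a;
  {
    std::ofstream os(fn);
    cereal::XMLOutputArchive oar(os);  // writes named XML fields
    oar(a);
  }
  toy_state b;
  {
    std::ifstream is(fn);
    cereal::XMLInputArchive iar(is);   // reads them back by name
    iar(b);
  }
  // b now matches a, field for field
}
// ----------------------------------------------------------------------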
+
+void data_store_conduit::load_checkpoint(std::string dir_name, generic_data_reader *reader) {
+  double tm1 = get_time();
+  PROFILE("starting data_store_conduit::load_checkpoint");
+
+  // Sanity check that checkpoint directories exist
+  m_spill_dir_base = dir_name;
+  bool exists = file::directory_exists(m_spill_dir_base);
+  if (!exists) {
+    LBANN_ERROR("cannot load data_store from file, since the specified directory ", dir_name, " doesn't exist");
+  }
+  const std::string conduit_dir = get_conduit_dir();
+  exists = file::directory_exists(conduit_dir);
+  if (!exists) {
+    LBANN_ERROR("cannot load data_store from file, since the specified directory '", conduit_dir, "' doesn't exist");
+  }
+
+  // Read checkpoint for all essential variables except conduit Nodes
+  const std::string fn = get_cereal_fn();
+  std::ifstream in(fn);
+  if (!in) {
+    LBANN_ERROR("failed to open ", fn, " for reading");
+  }
+  cereal::XMLInputArchive iarchive(in);
+  iarchive(m_cur_epoch, m_is_setup,
+           m_preload, m_explicit_loading,
+           m_owner_map_mb_size,
+           m_compacted_sample_size, m_is_local_cache,
+           m_node_sizes_vary, m_have_sample_sizes,
+           m_owner, m_sample_sizes);
+
+  if (reader != nullptr) {
+    m_reader = reader;
+    m_comm = m_reader->get_comm();
+    m_shuffled_indices = &(m_reader->get_shuffled_indices());
+    m_world_master = m_comm->am_world_master();
+    m_trainer_master = m_comm->am_trainer_master();
+    m_rank_in_trainer = m_comm->get_rank_in_trainer();
+    m_rank_in_world = m_comm->get_rank_in_world();
+    m_np_in_trainer = m_comm->get_procs_per_trainer();
+  }
+
+  // Open the metadata file; this is an index of the checkpointed conduit filenames
+  const std::string metadata_fn = get_metadata_fn();
+  std::ifstream metadata(metadata_fn);
+  if (!metadata) {
+    LBANN_ERROR("failed to open ", metadata_fn, " for reading");
+  }
+
+  // Error check that the conduit base directory name is correct
+  std::string base_dir;
+  getline(metadata, base_dir);
+  if (conduit_dir != base_dir) {
+    LBANN_ERROR("conduit_dir != base_dir (", conduit_dir, ", ", base_dir, ")");
+  }
+
+  // Load conduit Nodes
+  std::string tmp;
+  int sample_id;
+  while (metadata >> tmp >> sample_id) {
+    if (tmp.size() > 2) {
+      const std::string fn2 = base_dir + "/" + tmp;
+      conduit::Node nd;
+      nd.load(fn2);
+      build_node_for_sending(nd, m_data[sample_id]);
+    }
+  }
+  metadata.close();
+
+  m_was_loaded_from_file = true;
+  PROFILE("time to load checkpoint: ", (get_time() - tm1));
+}
+
+void data_store_conduit::print_variables() {
+  if (!m_world_master) {
+    return;
+  }
+  std::cerr << "m_cur_epoch: " << m_cur_epoch << std::endl
+            << "m_is_setup: " << m_is_setup << std::endl
+            << "m_preload: " << m_preload << std::endl
+            << "m_explicit_loading: " << m_explicit_loading << std::endl
+            << "m_owner_map_mb_size: " << m_owner_map_mb_size << std::endl
+            << "m_compacted_sample_size: " << m_compacted_sample_size << std::endl
+            << "m_node_sizes_vary: " << m_node_sizes_vary << std::endl;
+}
+
+std::string data_store_conduit::get_conduit_dir() const {
+  return m_spill_dir_base + "/conduit_" + m_reader->get_role() + "_" + std::to_string(m_rank_in_world);
+}
+
+std::string data_store_conduit::get_cereal_fn() const {
+  return m_spill_dir_base + '/' + m_cereal_fn + "_" + m_reader->get_role() + "_" + std::to_string(m_rank_in_world) + ".xml";
+}
+
+std::string data_store_conduit::get_metadata_fn() const {
+  return m_spill_dir_base + "/metadata_" + m_reader->get_role() + "_" + std::to_string(m_rank_in_world);
+}
+
+void data_store_conduit::open_next_conduit_spill_directory() {
+  if (m_num_files_in_cur_spill_dir != m_max_files_per_directory) {
+    return;
+  }
+  m_num_files_in_cur_spill_dir = 0;
+  m_cur_spill_dir_integer += 1;
+  m_cur_spill_dir = get_conduit_dir() + "/" + std::to_string(m_cur_spill_dir_integer);
+  DEBUG("calling file::directory_exists(", m_cur_spill_dir, ")");
+  bool exists = file::directory_exists(m_cur_spill_dir);
+  DEBUG("exists? ", exists);
+  if (!exists) {
+    file::make_directory(m_cur_spill_dir);
+  }
+}
+
+void data_store_conduit::spill_conduit_node(const conduit::Node &node, int data_id) {
+  if (!m_metadata.is_open()) {
+    LBANN_ERROR("metadata file is not open");
+  }
+
+  std::lock_guard<std::mutex> lock(m_mutex);
+  if (m_num_files_in_cur_spill_dir == m_max_files_per_directory) {
+    open_next_conduit_spill_directory();
+  }
+
+  const std::string fn = m_cur_spill_dir + "/" + std::to_string(data_id);
+  node.save(fn);
+  m_metadata << m_cur_spill_dir_integer << "/" << data_id << " " << data_id << std::endl;
+  m_spilled_nodes[data_id] = m_cur_spill_dir_integer;
+  ++m_num_files_in_cur_spill_dir;
+}
+
+void data_store_conduit::load_spilled_conduit_nodes() {
+  m_data.clear();
+
+  for (const auto &v : m_indices_to_send) {
+    for (const auto &id : v) {
+      std::unordered_map<int, int>::const_iterator it = m_spilled_nodes.find(id);
+      if (it == m_spilled_nodes.end()) {
+        LBANN_ERROR("it == m_spilled_nodes.end() for sample_id: ", id, "; m_spilled_nodes.size: ", m_spilled_nodes.size());
+      }
+      const std::string fn = get_conduit_dir() + "/" + std::to_string(it->second) + "/" + std::to_string(id);
+      //PROFILE("loading conduit file: ", fn);
+      conduit::Node node;
+      node.load(fn);
+      build_node_for_sending(node, m_data[id]);
+    }
+  }
+}
+
+void data_store_conduit::open_informational_files() {
+  options *opts = options::get();
+  if (m_comm == nullptr) {
+    LBANN_ERROR("m_comm == nullptr");
+  }
+
+  // optionally, each rank opens a debug file
+  if (opts->get_bool("data_store_debug") && !m_debug && m_reader != nullptr) {
+    m_debug_filename = m_debug_filename_base + "_" + m_reader->get_role() + "."
+      + std::to_string(m_comm->get_rank_in_world()) + ".txt";
+    m_debug = new std::ofstream(m_debug_filename.c_str());
+    if (!m_debug) {
+      LBANN_ERROR("failed to open ", m_debug_filename, " for writing");
+    }
+  }
+
+  // optionally, the world master opens a profile file
+  if (opts->get_bool("data_store_profile") && m_world_master && !m_profile && m_reader != nullptr) {
+    m_profile_filename = m_profile_filename_base + "_" + m_reader->get_role() + ".txt";
+    m_profile = new std::ofstream(m_profile_filename.c_str());
+    if (!m_profile) {
+      LBANN_ERROR("failed to open ", m_profile_filename, " for writing");
+    }
+  }
+}
+
+void data_store_conduit::print_partial_owner_map(int n) {
+  std::cerr << "\nHere is part of the owner map; m_owner.size(): " << m_owner.size() << std::endl;
+  std::map<int, int> m;
+  for (auto t : m_owner) {
+    m[t.first] = t.second;
+  }
+  int j = 0;
+  for (auto t : m) {
+    std::cerr << "  sample_id: " << t.first << " owner: " << t.second << std::endl;
+    if (j++ >= n) break;
+  }
+}

} // namespace lbann
diff --git a/src/models/model.cpp b/src/models/model.cpp
index cecc496fc3c..bf590be7445 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -918,19 +918,25 @@ void model::collect_background_data_fetch(execution_mode mode) {
   }
 }
 
+// only used in callbacks/ltfb.cpp; from that file:
+// "Note that this is a temporary fix
+// for the current use of the tournament"
 void model::make_data_store_preloaded(execution_mode mode) {
   for (El::Int i = 0; i < get_num_layers(); ++i) {
     auto *input = dynamic_cast(&get_layer(i));
     if (input != nullptr) {
       auto *data_store = input->get_data_reader(mode)->get_data_store_ptr();
       if(data_store != nullptr && !data_store->is_preloaded()) {
-        input->get_data_reader(mode)->get_data_store_ptr()->set_preload();
+        input->get_data_reader(mode)->get_data_store_ptr()->set_is_preloaded();
+        input->get_data_reader(mode)->get_data_store_ptr()->set_explicit_loading(false);
       }
     }
   }
 }
 
+// only used in callbacks/ltfb.cpp; from that file:
+// "Note that this is a temporary fix
+// for the current use of the tournament"
 void model::mark_data_store_explicitly_loading(execution_mode mode) {
   for (El::Int i = 0; i < get_num_layers(); ++i) {
     auto *input = dynamic_cast(&get_layer(i));
diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp
index 3617de3c6a5..8388117226c 100644
--- a/src/utils/lbann_library.cpp
+++ b/src/utils/lbann_library.cpp
@@ -276,7 +276,7 @@ std::unique_ptr build_model_from_prototext(
   // Setup models
   ret_model->setup();
 
-  if (opts->get_bool("use_data_store") || opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache")) {
+  if (opts->get_bool("use_data_store") || opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache") || opts->has_string("data_store_spill")) {
     if (master) {
       std::cout << "\nUSING DATA STORE!\n\n";
     }
diff --git a/src/utils/options.cpp b/src/utils/options.cpp
index 1bfb0814fbb..3caea90d0f7 100644
--- a/src/utils/options.cpp
+++ b/src/utils/options.cpp
@@ -1,5 +1,6 @@
 #include "mpi.h"
 #include "lbann/utils/options.hpp"
+#include "lbann/utils/exception.hpp"
 #include
 #include
 #include
@@ -135,11 +136,7 @@ int options::get_int(std::string option)
 {
   int result;
   if (!m_test_int(option, result)) {
-    std::stringstream err;
-    err << __FILE__ << " " << __LINE__
-        << " :: options::get_int() - failed to find option: " << option
-        << ", or to convert to int";
-    throw std::runtime_error(err.str());
+    LBANN_ERROR("options::get_int() - failed to find option: ", option, ", or to convert to int");
   }
   return result;
 }
@@ -157,11 +154,7
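// ----------------------------------------------------------------------
// Aside: the options.cpp hunks here replace hand-rolled stringstream
// error messages with the variadic LBANN_ERROR(...) macro. A sketch of
// how such a variadic message builder can be written in C++17; this is
// an assumption-level illustration, not LBANN's actual implementation:
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>

template <typename... Args>
std::string build_message(Args&&... args) {
  std::ostringstream s;
  (s << ... << std::forward<Args>(args));  // fold expression streams every arg
  return s.str();
}

template <typename... Args>
[[noreturn]] void throw_error(Args&&... args) {
  throw std::runtime_error(build_message(std::forward<Args>(args)...));
}
// Usage sketch: throw_error("failed to find option: ", option_name);
// ----------------------------------------------------------------------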
@@ double options::get_double(std::string option) { double result; if (!m_test_double(option, result)) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ - << " :: options::get_double() - failed to find option: " << option - << ", or to convert the value to double"; - throw std::runtime_error(err.str()); + LBANN_ERROR("options::get_double() - failed to find option: ", option, ", or to convert the value to double"); } return result; } @@ -179,10 +172,7 @@ std::string options::get_string(std::string option) { std::string result; if (!m_test_string(option, result)) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ - << " :: options::get_string() - failed to find option: " << option; - throw std::runtime_error(err.str()); + LBANN_ERROR("options::get_string() - failed to find option: ", option); } return result; } @@ -286,10 +276,7 @@ void options::m_parse_file(std::string fn) std::ifstream in(fn.c_str()); if (!in.is_open()) { if (!m_rank) { - std::stringstream err; - err << __FILE__ << " " << __LINE__ - << " :: failed to open file for reading: " << fn; - throw std::runtime_error(err.str()); + LBANN_ERROR("failed to open file for reading: ", fn); } } From 3ef64a2131494d8ce730fa40315f629749bd241a Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Thu, 31 Oct 2019 09:05:47 +0100 Subject: [PATCH 370/634] Fix protobuf version in superbuild --- superbuild/protobuf/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbuild/protobuf/CMakeLists.txt b/superbuild/protobuf/CMakeLists.txt index 3bfba417c4a..020353fee49 100644 --- a/superbuild/protobuf/CMakeLists.txt +++ b/superbuild/protobuf/CMakeLists.txt @@ -31,7 +31,7 @@ else () endif () # ... then the tag. -set(PROTOBUF_TAG "master" +set(PROTOBUF_TAG "v3.10.1" CACHE STRING "The git tag or hash to checkout for PROTOBUF") # Where to install PROTOBUF From d517d90277204040cb652072574aacd78dd789cb Mon Sep 17 00:00:00 2001 From: davidHysom Date: Thu, 31 Oct 2019 07:53:27 -0700 Subject: [PATCH 371/634] Data store move preload (#1327) * Moving call to preload_data_store() from generic_data_reader::instantiate_data_store() to generic_data_reader::setup_data_store(). * Preloading is now done independently by the train and validate reader. Previously only the train reader was preloaded, then, later, samples were moved from the train to the validate reader. 
* This PR removes all portions of the code related to the ds_sample_move_list vector, since it's no longer needed --- .../lbann/data_readers/data_reader_image.hpp | 4 +- .../data_readers/data_reader_imagenet.hpp | 4 -- .../data_readers/data_reader_jag_conduit.hpp | 3 +- .../lbann/data_store/data_store_conduit.hpp | 2 +- src/data_readers/data_reader.cpp | 28 +++++++------ src/data_readers/data_reader_image.cpp | 21 +--------- src/data_readers/data_reader_imagenet.cpp | 6 --- src/data_readers/data_reader_jag_conduit.cpp | 13 +------ src/data_store/data_store_conduit.cpp | 39 +------------------ src/proto/proto_common.cpp | 10 +---- 10 files changed, 26 insertions(+), 104 deletions(-) diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index 4780407efb0..466487b0aa2 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -41,8 +41,6 @@ class image_data_reader : public generic_data_reader { image_data_reader(bool shuffle = true); image_data_reader(const image_data_reader&); - image_data_reader(const image_data_reader&, const std::vector& ds_sample_move_list); - image_data_reader(const image_data_reader&, const std::vector& ds_sample_move_list, std::string role); image_data_reader& operator=(const image_data_reader&); /** Set up imagenet specific input parameters @@ -99,7 +97,7 @@ class image_data_reader : public generic_data_reader { void preload_data_store() override; protected: - void copy_members(const image_data_reader &rhs, const std::vector& ds_sample_move_list = std::vector()); + void copy_members(const image_data_reader &rhs); /// Set the default values for the width, the height, the number of channels, and the number of labels of an image virtual void set_defaults(); diff --git a/include/lbann/data_readers/data_reader_imagenet.hpp b/include/lbann/data_readers/data_reader_imagenet.hpp index ed674ff0d19..7f226f965de 100644 --- a/include/lbann/data_readers/data_reader_imagenet.hpp +++ b/include/lbann/data_readers/data_reader_imagenet.hpp @@ -35,10 +35,6 @@ namespace lbann { class imagenet_reader : public image_data_reader { public: imagenet_reader(bool shuffle = true); - imagenet_reader(const imagenet_reader&, - const std::vector& ds_sample_move_list); - imagenet_reader(const imagenet_reader&, - const std::vector& ds_sample_move_list, std::string role); imagenet_reader(const imagenet_reader&) = default; imagenet_reader& operator=(const imagenet_reader&) = default; ~imagenet_reader() override; diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index 7e6f52ea052..486900fc670 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -89,7 +89,6 @@ class data_reader_jag_conduit : public generic_data_reader { data_reader_jag_conduit(bool shuffle = true); data_reader_jag_conduit(const data_reader_jag_conduit&); - data_reader_jag_conduit(const data_reader_jag_conduit&, const std::vector& ds_sample_move_list); data_reader_jag_conduit& operator=(const data_reader_jag_conduit&); ~data_reader_jag_conduit() override; data_reader_jag_conduit* copy() const override { return new data_reader_jag_conduit(*this); } @@ -258,7 +257,7 @@ class data_reader_jag_conduit : public generic_data_reader { void preload_data_store() override; virtual void set_defaults(); - virtual void copy_members(const data_reader_jag_conduit& rhs, const 
std::vector& ds_sample_move_list = std::vector()); + virtual void copy_members(const data_reader_jag_conduit& rhs); /// add data type for independent variable void add_independent_variable_type(const variable_t independent); diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 43719250bf5..5381194ea6a 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -141,7 +141,7 @@ class data_store_conduit { int get_data_size() { return m_data.size(); } /// made public for debugging during development - void copy_members(const data_store_conduit& rhs, const std::vector& = std::vector()); + void copy_members(const data_store_conduit& rhs); /** @brief Closes then reopens the debug logging file * diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index dbd1af895e2..46154f751f6 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -714,15 +714,28 @@ void generic_data_reader::instantiate_data_store(const std::vector& local_l m_data_store->set_shuffled_indices(&m_shuffled_indices); + if (opts->get_bool("preload_data_store") && !opts->get_bool("data_store_cache")) { + if (local_list_sizes.size() != 0) { + m_data_store->build_preloaded_owner_map(local_list_sizes); + } + } + + if (is_master()) { + std::cout << "generic_data_reader::instantiate_data_store time: : " << (get_time() - tm1) << std::endl; + } +} + +void generic_data_reader::setup_data_store(int mini_batch_size) { + if (m_data_store == nullptr) { + LBANN_ERROR("m_data_store == nullptr; you shouldn't be here"); + } // optionally preload the data store + options *opts = options::get(); if (opts->get_bool("preload_data_store") && !opts->get_bool("data_store_cache")) { if(is_master()) { std::cerr << "generic_data_reader::instantiate_data_store - Starting the preload" << std::endl; } double tm2 = get_time(); - if (local_list_sizes.size() != 0) { - m_data_store->build_preloaded_owner_map(local_list_sizes); - } preload_data_store(); m_data_store->set_is_preloaded(); if(is_master()) { @@ -734,15 +747,6 @@ void generic_data_reader::instantiate_data_store(const std::vector& local_l LBANN_ERROR("num samples loaded: ", n, " != shuffled-indices.size(): ", m_shuffled_indices.size()); } } - if (is_master()) { - std::cout << "generic_data_reader::instantiate_data_store time: : " << (get_time() - tm1) << std::endl; - } -} - -void generic_data_reader::setup_data_store(int mini_batch_size) { - if (m_data_store == nullptr) { - LBANN_ERROR("m_data_store == nullptr; you shouldn't be here"); - } m_data_store->setup(mini_batch_size); } diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index 38cb5fc7ac6..2842f65a18a 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -48,19 +48,6 @@ image_data_reader::image_data_reader(const image_data_reader& rhs) copy_members(rhs); } -image_data_reader::image_data_reader(const image_data_reader& rhs,const std::vector& ds_sample_move_list, std::string role) - : generic_data_reader(rhs) -{ - set_role(role); - copy_members(rhs, ds_sample_move_list); -} - -image_data_reader::image_data_reader(const image_data_reader& rhs,const std::vector& ds_sample_move_list) - : generic_data_reader(rhs) -{ - copy_members(rhs, ds_sample_move_list); -} - image_data_reader& image_data_reader::operator=(const image_data_reader& rhs) { generic_data_reader::operator=(rhs); m_image_dir 
= rhs.m_image_dir; @@ -74,14 +61,10 @@ image_data_reader& image_data_reader::operator=(const image_data_reader& rhs) { return (*this); } -void image_data_reader::copy_members(const image_data_reader &rhs, const std::vector& ds_sample_move_list) { +void image_data_reader::copy_members(const image_data_reader &rhs) { if(rhs.m_data_store != nullptr) { - if(ds_sample_move_list.size() == 0) { - m_data_store = new data_store_conduit(rhs.get_data_store()); - } else { - m_data_store = new data_store_conduit(rhs.get_data_store(), ds_sample_move_list); - } + m_data_store = new data_store_conduit(rhs.get_data_store()); m_data_store->set_data_reader_ptr(this); } diff --git a/src/data_readers/data_reader_imagenet.cpp b/src/data_readers/data_reader_imagenet.cpp index 8acad24ab9c..0d83fc679ad 100644 --- a/src/data_readers/data_reader_imagenet.cpp +++ b/src/data_readers/data_reader_imagenet.cpp @@ -37,12 +37,6 @@ imagenet_reader::imagenet_reader(bool shuffle) set_defaults(); } -imagenet_reader::imagenet_reader(const imagenet_reader& rhs, const std::vector& ds_sample_move_list, std::string role) - : image_data_reader(rhs, ds_sample_move_list, role) {} - -imagenet_reader::imagenet_reader(const imagenet_reader& rhs, const std::vector& ds_sample_move_list) - : image_data_reader(rhs, ds_sample_move_list) {} - imagenet_reader::~imagenet_reader() {} void imagenet_reader::set_defaults() { diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index 916555ebca6..80db910cec5 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -138,7 +138,7 @@ data_reader_jag_conduit::data_reader_jag_conduit(bool shuffle) set_defaults(); } -void data_reader_jag_conduit::copy_members(const data_reader_jag_conduit& rhs, const std::vector& ds_sample_move_list) { +void data_reader_jag_conduit::copy_members(const data_reader_jag_conduit& rhs) { m_independent = rhs.m_independent; m_independent_groups = rhs.m_independent_groups; m_dependent = rhs.m_dependent; @@ -185,11 +185,7 @@ void data_reader_jag_conduit::copy_members(const data_reader_jag_conduit& rhs, c m_list_per_model = rhs.m_list_per_model; if(rhs.m_data_store != nullptr) { - if(ds_sample_move_list.size() == 0) { - m_data_store = new data_store_conduit(rhs.get_data_store()); - } else { - m_data_store = new data_store_conduit(rhs.get_data_store(), ds_sample_move_list); - } + m_data_store = new data_store_conduit(rhs.get_data_store()); m_data_store->set_data_reader_ptr(this); } } @@ -199,11 +195,6 @@ data_reader_jag_conduit::data_reader_jag_conduit(const data_reader_jag_conduit& copy_members(rhs); } -data_reader_jag_conduit::data_reader_jag_conduit(const data_reader_jag_conduit& rhs, const std::vector& ds_sample_move_list) - : generic_data_reader(rhs) { - copy_members(rhs, ds_sample_move_list); -} - data_reader_jag_conduit& data_reader_jag_conduit::operator=(const data_reader_jag_conduit& rhs) { // check for self-assignment if (this == &rhs) { diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index bf7cda64d1c..3fc0c3a1339 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -148,9 +148,6 @@ data_store_conduit::data_store_conduit(const data_store_conduit& rhs) { copy_members(rhs); } -data_store_conduit::data_store_conduit(const data_store_conduit& rhs, const std::vector& ds_sample_move_list) { - copy_members(rhs, ds_sample_move_list); -} data_store_conduit& 
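// ----------------------------------------------------------------------
// Aside: copy_members() above deep-copies the owned data store and then
// re-points it at the new reader via set_data_reader_ptr(this); the
// operator= below guards against self-assignment. The ownership pattern
// in isolation, with hypothetical stand-in types:
struct store { int payload = 0; };   // stand-in for data_store_conduit

struct reader {                      // stand-in for image_data_reader
  store *m_store = nullptr;
  reader() = default;
  reader(const reader &rhs)
    : m_store(rhs.m_store ? new store(*rhs.m_store) : nullptr) {}  // deep copy, never alias
  reader &operator=(const reader &rhs) {
    if (this == &rhs) { return *this; }  // self-assignment guard
    delete m_store;
    m_store = rhs.m_store ? new store(*rhs.m_store) : nullptr;
    return *this;
  }
  ~reader() { delete m_store; }
};
// ----------------------------------------------------------------------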
data_store_conduit::operator=(const data_store_conduit& rhs) { // check for self-assignment @@ -168,7 +165,7 @@ void data_store_conduit::set_data_reader_ptr(generic_data_reader *reader) { open_informational_files(); } -void data_store_conduit::copy_members(const data_store_conduit& rhs, const std::vector& ds_sample_move_list) { +void data_store_conduit::copy_members(const data_store_conduit& rhs) { m_is_setup = rhs.m_is_setup; m_preload = rhs.m_preload; m_explicit_loading = rhs.m_explicit_loading; @@ -198,40 +195,6 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs, const std:: m_cur_spill_dir = rhs.m_cur_spill_dir; m_num_files_in_cur_spill_dir = rhs.m_num_files_in_cur_spill_dir; - /// This block needed when carving a validation set from the training set - m_my_num_indices = 0; - if(ds_sample_move_list.size() == 0) { - m_data = rhs.m_data; - } else { - /// Move indices on the list from the data and owner maps in the RHS data store to the new data store - for(auto&& i : ds_sample_move_list) { - - if(rhs.m_data.find(i) != rhs.m_data.end()){ - /// Repack the nodes because they don't seem to copy correctly - // - //dah - previously this code block only contained the line: - // build_node_for_sending(rhs.m_data[i]["data"], m_data[i]); - //However, this resulted in errors in the schema; not sure why, - //as it used to work; some change in the conduit library? - conduit::Node n2; - const std::vector &names = rhs.m_data[i]["data"].child_names(); - const std::vector &names2 = rhs.m_data[i]["data"][names[0]].child_names(); - for (auto t : names2) { - n2[names[0]][t] = rhs.m_data[i]["data"][names[0]][t]; - } - build_node_for_sending(n2, m_data[i]); - ++m_my_num_indices; - } - rhs.m_data.erase(i); - - /// Removed migrated nodes from the original data store's owner list - if(rhs.m_owner.find(i) != rhs.m_owner.end()) { - m_owner[i] = rhs.m_owner[i]; - rhs.m_owner.erase(i); - } - } - } - /// Clear the pointer to the data reader, this cannot be copied m_reader = nullptr; m_shuffled_indices = nullptr; diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index f71ef31542f..0738485fe69 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -422,7 +422,7 @@ void init_data_readers( } else if (name == "numpy_npz_conduit_reader") { reader_validation = new numpy_npz_conduit_reader(*dynamic_cast(reader)); } else if (name == "imagenet") { - reader_validation = new imagenet_reader(*dynamic_cast(reader), reader->get_unused_indices()); + reader_validation = new imagenet_reader(*dynamic_cast(reader)); } else if (name == "multihead_siamese") { reader_validation = new data_reader_multihead_siamese(*dynamic_cast(reader)); } else if (name == "jag") { @@ -450,7 +450,7 @@ void init_data_readers( reader_jag_conduit->set_leading_reader(leader); } } else { - reader_validation = new data_reader_jag_conduit(*dynamic_cast(reader), reader->get_unused_indices()); + reader_validation = new data_reader_jag_conduit(*dynamic_cast(reader)); const std::string role = "validate"; auto reader_jag_conduit = dynamic_cast(reader_validation); reader_jag_conduit->set_leading_reader(reader_jag_conduit); @@ -502,12 +502,6 @@ void init_data_readers( reader_validation->get_data_store_ptr()->compact_nodes(); } - /// At this point clean up any unused samples from the main data store - if(reader->get_data_store_ptr() != nullptr) { - auto&& data_store = reader->get_data_store_ptr(); - data_store->purge_unused_samples(reader->get_unused_indices()); - } - if (master) { size_t num_train = 
reader->get_num_data(); size_t num_validate = reader_validation->get_num_data(); From fe148493a069b14176897984868bb13879d7d0f4 Mon Sep 17 00:00:00 2001 From: davidHysom Date: Thu, 31 Oct 2019 07:57:52 -0700 Subject: [PATCH 372/634] Data store reloading: Template Method design pattern (#1328) * Modified preload_data_store() to use the Template Method design pattern, as suggested by Tom. * Moved a code block (that was identical in the three readers that preload) into generic_data_reader. --- include/lbann/data_readers/data_reader.hpp | 18 ++++----- .../lbann/data_readers/data_reader_image.hpp | 2 +- .../data_readers/data_reader_jag_conduit.hpp | 2 +- .../data_reader_numpy_npz_conduit.hpp | 2 +- .../lbann/data_store/data_store_conduit.hpp | 29 ++++++++++---- src/data_readers/data_reader.cpp | 39 ++++++++++++++----- src/data_readers/data_reader_image.cpp | 22 +---------- src/data_readers/data_reader_jag_conduit.cpp | 21 +--------- .../data_reader_numpy_npz_conduit.cpp | 21 +--------- src/data_store/data_store_conduit.cpp | 15 +------ 10 files changed, 69 insertions(+), 102 deletions(-) diff --git a/include/lbann/data_readers/data_reader.hpp b/include/lbann/data_readers/data_reader.hpp index cfae9cee33c..e7f38081d2b 100644 --- a/include/lbann/data_readers/data_reader.hpp +++ b/include/lbann/data_readers/data_reader.hpp @@ -621,17 +621,9 @@ class generic_data_reader { /// until later. void setup_data_store(int mini_batch_size); - void instantiate_data_store(const std::vector& local_list_sizes = std::vector()); + void instantiate_data_store(); - // note: don't want to make this virtual, since then all derived classes - // would have to override. But, this should only be called from within - // derived classes where it makes sense to do so. - // Once the sample_list class and file formats are generalized and - // finalized, it should (may?) be possible to code a single - // preload_data_store method. - virtual void preload_data_store() { - LBANN_ERROR("you should not be here"); - } + virtual void preload_data_store(); void set_gan_labelling(bool has_gan_labelling) { m_gan_labelling = has_gan_labelling; @@ -801,6 +793,12 @@ class generic_data_reader { friend class data_reader_merge_features; friend class data_reader_merge_samples; +private: + + virtual void do_preload_data_store() { + LBANN_ERROR("Not implemented."); + } + protected : //var to support GAN bool m_gan_labelling; //boolean flag of whether its GAN binary label, default is false diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp index 466487b0aa2..f4cf92a0e36 100644 --- a/include/lbann/data_readers/data_reader_image.hpp +++ b/include/lbann/data_readers/data_reader_image.hpp @@ -94,7 +94,7 @@ class image_data_reader : public generic_data_reader { return m_image_list.at(idx); } - void preload_data_store() override; + void do_preload_data_store() override; protected: void copy_members(const image_data_reader &rhs); diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index 486900fc670..ecb59c58c3c 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -254,7 +254,7 @@ class data_reader_jag_conduit : public generic_data_reader { /// once the sample_list class and file formats are generalized and /// finalized, it should (may?) be possible to code a single /// preload_data_store method. 
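// ----------------------------------------------------------------------
// Aside: the Template Method refactor in this patch keeps the public
// entry point and its invariants in the base class, while derived
// readers override only a private hook (the real hook throws
// LBANN_ERROR("Not implemented.") by default). A distilled sketch of
// the shape; class and member names are simplified stand-ins:
class base_reader {
public:
  void preload_data_store() {    // fixed skeleton, deliberately non-virtual
    build_owner_map();           // shared pre-work lives in one place
    do_preload_data_store();     // the customization point
    set_is_preloaded();          // shared post-work, impossible to forget
  }
private:
  virtual void do_preload_data_store() {}  // derived readers override this
  void build_owner_map() {}
  void set_is_preloaded() {}
};

class image_reader : public base_reader {
private:
  void do_preload_data_store() override { /* reader-specific loading */ }
};
// ----------------------------------------------------------------------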
- void preload_data_store() override; + void do_preload_data_store() override; virtual void set_defaults(); virtual void copy_members(const data_reader_jag_conduit& rhs); diff --git a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp index dbaec7d043c..57473224f9f 100644 --- a/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp +++ b/include/lbann/data_readers/data_reader_numpy_npz_conduit.hpp @@ -75,7 +75,7 @@ class numpy_npz_conduit_reader : public generic_data_reader { const std::vector get_data_dims() const override { return m_data_dims; } protected: - void preload_data_store() override; + void do_preload_data_store() override; bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 5381194ea6a..6c5f5c8b57a 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -80,8 +80,6 @@ class data_store_conduit { void setup(int mini_batch_size); - void preload_local_cache(); - void check_mem_capacity(lbann_comm *comm, const std::string sample_list_file, size_t stride, size_t offset); /// returns the conduit node @@ -112,9 +110,6 @@ class data_store_conduit { /// fills in m_owner, which maps index -> owning processor void build_preloaded_owner_map(const std::vector& per_rank_list_sizes); - /// Removed nodes corresponding from the indices vector from the data store - void purge_unused_samples(const std::vector& indices); - /// Recompact the nodes because they are not copied properly when instantiating /// using the copy constructor void compact_nodes(); @@ -123,8 +118,23 @@ class data_store_conduit { /// with the index int get_index_owner(int idx); + /** @brief Returns "true" is running in local cache mode + * + * In local cache mode, each node contains a complete copy + * of the data set. This is stored in a shared memory segment, + * but part of the set may be spilled to disk if memory is + * insufficient. Local cache mode is activated via the cmd line + * flag: --data_store_cache + */ bool is_local_cache() const { return m_is_local_cache; } + /** @brief Read the data set into memory + * + * Each rank reads a portion of the data set, then + * bcasts to all other ranks. + */ + void preload_local_cache(); + void exchange_mini_batch_data(size_t current_pos, size_t mb_size); void set_node_sizes_vary() { m_node_sizes_vary = true; } @@ -405,14 +415,17 @@ private : /** @brief Called by test_checkpoint */ void print_variables(); - /** @brief Called by test_checkpoint */ + /** @brief Called by test_checkpoint + * + * For testing and development. Prints the first 'n' entries from + * the owner map * (which maps sample_id -> owning rank) to std::cout + */ void print_partial_owner_map(int n); std::string get_conduit_dir() const; std::string get_cereal_fn() const; std::string get_metadata_fn() const; - /** @brief Creates the directory if it does not already exist */ void make_dir_if_it_doesnt_exist(const std::string &dir); @@ -470,6 +483,7 @@ private : } (*m_profile) << var1 << " "; PROFILE(var2...) ; + flush_profile_file(); } void DEBUG() { @@ -487,6 +501,7 @@ private : } (*m_debug) << var1 << " "; DEBUG(var2...) 
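// ----------------------------------------------------------------------
// Aside: the DEBUG()/PROFILE() helpers in this header peel one argument
// per recursive call, with the zero-argument overload terminating the
// recursion and flushing the stream. The same pattern in isolation
// (illustration only, writing to std::cout instead of a member stream):
#include <iostream>

void log_items() { std::cout << std::endl; }  // base case ends the line

template <typename T, typename... Rest>
void log_items(const T &first, const Rest&... rest) {
  std::cout << first << ' ';
  log_items(rest...);                          // recurse on the tail
}
// Usage sketch: log_items("m_data.size:", 42, "role:", "train");
// ----------------------------------------------------------------------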
; + flush_debug_file(); } }; diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index 46154f751f6..41a77988a60 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -547,7 +547,6 @@ void generic_data_reader::use_unused_index_set() { if(m_data_store != nullptr) { /// Update the data store's pointer to the shuffled indices m_data_store->set_shuffled_indices(&m_shuffled_indices); - m_data_store->purge_unused_samples(m_unused_indices); } m_unused_indices.clear(); std::vector().swap(m_unused_indices); // Trick to force memory reallocation @@ -685,7 +684,7 @@ double generic_data_reader::get_use_percent() const { return m_use_percent; } -void generic_data_reader::instantiate_data_store(const std::vector& local_list_sizes) { +void generic_data_reader::instantiate_data_store() { double tm1 = get_time(); options *opts = options::get(); if (! (opts->get_bool("use_data_store") || opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache") || opts->has_string("data_store_spill"))) { @@ -714,12 +713,6 @@ void generic_data_reader::instantiate_data_store(const std::vector& local_l m_data_store->set_shuffled_indices(&m_shuffled_indices); - if (opts->get_bool("preload_data_store") && !opts->get_bool("data_store_cache")) { - if (local_list_sizes.size() != 0) { - m_data_store->build_preloaded_owner_map(local_list_sizes); - } - } - if (is_master()) { std::cout << "generic_data_reader::instantiate_data_store time: : " << (get_time() - tm1) << std::endl; } @@ -731,13 +724,13 @@ void generic_data_reader::setup_data_store(int mini_batch_size) { } // optionally preload the data store options *opts = options::get(); - if (opts->get_bool("preload_data_store") && !opts->get_bool("data_store_cache")) { + + if (opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache")) { if(is_master()) { std::cerr << "generic_data_reader::instantiate_data_store - Starting the preload" << std::endl; } double tm2 = get_time(); preload_data_store(); - m_data_store->set_is_preloaded(); if(is_master()) { std::cout << "Preload complete; time: " << get_time() - tm2 << std::endl; } @@ -817,4 +810,30 @@ void generic_data_reader::set_role(std::string role) { } } +void generic_data_reader::preload_data_store() { + if (m_data_store->is_local_cache()) { + m_data_store->preload_local_cache(); + } else { + std::vector local_list_sizes; + int np = m_comm->get_procs_per_trainer(); + int base_files_per_rank = m_shuffled_indices.size() / np; + int extra = m_shuffled_indices.size() - (base_files_per_rank*np); + if (extra > np) { + LBANN_ERROR("extra > np"); + } + local_list_sizes.resize(np, 0); + for (int j=0; jbuild_preloaded_owner_map(local_list_sizes); + } + + do_preload_data_store(); + m_data_store->set_is_preloaded(); +} + + } // namespace lbann diff --git a/src/data_readers/data_reader_image.cpp b/src/data_readers/data_reader_image.cpp index 2842f65a18a..c5ea5aaeed1 100644 --- a/src/data_readers/data_reader_image.cpp +++ b/src/data_readers/data_reader_image.cpp @@ -152,25 +152,8 @@ void image_data_reader::load() { std::iota(m_shuffled_indices.begin(), m_shuffled_indices.end(), 0); resize_shuffled_indices(); - std::vector local_list_sizes; - if (opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache")) { - int np = m_comm->get_procs_per_trainer(); - int base_files_per_rank = m_shuffled_indices.size() / np; - int extra = m_shuffled_indices.size() - (base_files_per_rank*np); - if (extra > np) { - LBANN_ERROR("extra > np"); - } - 
local_list_sizes.resize(np, 0); - for (int j=0; jset_option("node_sizes_vary", 1); - instantiate_data_store(local_list_sizes); + instantiate_data_store(); select_subset_of_data(); } @@ -190,10 +173,9 @@ void read_raw_data(const std::string &filename, std::vector &data) { } -void image_data_reader::preload_data_store() { +void image_data_reader::do_preload_data_store() { options *opts = options::get(); - if (is_master()) std::cout << "Starting image_data_reader::preload_data_store; num indices: " << m_shuffled_indices.size() << std::endl; int rank = m_comm->get_rank_in_trainer(); bool threaded = ! options::get()->get_bool("data_store_no_thread"); diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index 80db910cec5..8cc08991b03 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -890,29 +890,12 @@ void data_reader_jag_conduit::load() { std::cout << "Lists have been gathered" << std::endl; } - std::vector local_list_sizes; - if (opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache")) { - int np = m_comm->get_procs_per_trainer(); - int base_files_per_rank = m_shuffled_indices.size() / np; - int extra = m_shuffled_indices.size() - (base_files_per_rank*np); - if (extra > np) { - LBANN_ERROR("extra > np"); - } - local_list_sizes.resize(np, 0); - for (int j=0; j local_list_sizes; - if (opts->get_bool("preload_data_store")) { - int np = m_comm->get_procs_per_trainer(); - int base_files_per_rank = m_filenames.size() / np; - int extra = m_filenames.size() - (base_files_per_rank*np); - if (extra > np) { - LBANN_ERROR("extra > np"); - } - local_list_sizes.resize(np, 0); - for (int j=0; j& per_rank_list_sizes) { + PROFILE("starting data_store_conduit::build_preloaded_owner_map"); m_owner.clear(); int owning_rank = 0; size_t per_rank_list_range_start = 0; @@ -668,20 +669,6 @@ conduit::Node & data_store_conduit::get_empty_node(int data_id) { return m_data[data_id]; } -void data_store_conduit::purge_unused_samples(const std::vector& indices) { - DEBUG(" starting purge_unused_samples; indices.size(): ", indices.size(), " data.size(): ", m_data.size()); - /// Remove unused indices from the data and owner maps - for(auto&& i : indices) { - if(m_data.find(i) != m_data.end()){ - m_data.erase(i); - } - if(m_owner.find(i) != m_owner.end()) { - m_owner.erase(i); - } - } - DEBUG("leaving purge_unused_samples; indices.size(): ", indices.size(), " data.size(): ", m_data.size()); -} - void data_store_conduit::compact_nodes() { for(auto&& j : *m_shuffled_indices) { if(m_data.find(j) != m_data.end()){ From 25267fae0646f6c70708126c9e090a6f4294dd17 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Thu, 31 Oct 2019 10:06:05 -0700 Subject: [PATCH 373/634] Pin cereal and libjpeg-turbo --- superbuild/cereal/CMakeLists.txt | 2 +- superbuild/jpeg-turbo/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/superbuild/cereal/CMakeLists.txt b/superbuild/cereal/CMakeLists.txt index 40ce193d16d..ea726d0d1d0 100644 --- a/superbuild/cereal/CMakeLists.txt +++ b/superbuild/cereal/CMakeLists.txt @@ -13,7 +13,7 @@ else () CACHE STRING "The URL from which to clone CEREAL.") endif () -set(CEREAL_TAG "master" CACHE STRING "The git tag or hash to checkout for CEREAL") +set(CEREAL_TAG "v1.3.0" CACHE STRING "The git tag or hash to checkout for CEREAL") # Where to install CEREAL set(CEREAL_CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" diff --git 
a/superbuild/jpeg-turbo/CMakeLists.txt b/superbuild/jpeg-turbo/CMakeLists.txt index 9971400e95e..689fc2cad91 100644 --- a/superbuild/jpeg-turbo/CMakeLists.txt +++ b/superbuild/jpeg-turbo/CMakeLists.txt @@ -16,7 +16,7 @@ else () CACHE STRING "The URL from which to clone LIBJPEG-TURBO") endif () -set(JPEG-TURBO_TAG "master" +set(JPEG-TURBO_TAG "2.0.3" CACHE STRING "The git tag to checkout for LIBJPEG-TURBO") # Where to install LIBJPEG-TURBO From 637eee41a5ab9303bbcaf9962ef1f83ada7c95e2 Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Thu, 31 Oct 2019 12:00:27 -0700 Subject: [PATCH 374/634] quick documentation fixes; RTD is broken --- docs/lbann.rst | 4 ++-- include/lbann/layers/learning/embedding.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/lbann.rst b/docs/lbann.rst index 81b5eb73b88..30438e05af2 100644 --- a/docs/lbann.rst +++ b/docs/lbann.rst @@ -1,6 +1,6 @@ -******************** +************************************************** LBANN Software Architecture and Class Overview -******************** +************************************************** Trainers (i.e. execution environment) ****************************************** diff --git a/include/lbann/layers/learning/embedding.hpp b/include/lbann/layers/learning/embedding.hpp index 9a53c4706e5..e5650a72e8d 100644 --- a/include/lbann/layers/learning/embedding.hpp +++ b/include/lbann/layers/learning/embedding.hpp @@ -41,7 +41,7 @@ namespace lbann { * output is a vector of zeros. * * The embedding vectors are stored in an - * @f$ \text{embedding_dim} \times \text{num_embeddings} @f$ + * @f$ \text{embedding\_dim} \times \text{num\_embeddings} @f$ * weights matrix. Note that this is the transpose of the weights in * the PyTorch embedding layer. */ From 7528da56d92c85ca4ba552f999915b71fc8d7f95 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 1 Nov 2019 10:28:22 -0700 Subject: [PATCH 375/634] Convert Bamboo layer unit tests to use Python frontend (#1325) * Add Bamboo utility function for L2 norm squared with NumPy Make sure NumPy implementations in layer unit tests are computed with 64-bit floats. 
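The 64-bit requirement matters because squaring and summing float32 values loses precision quickly as the vector grows. A C++ sketch of the same precision guard, for illustration only (the patch itself adds the NumPy helper numpy_l2norm2 shown further down):

#include <vector>

double l2norm2(const std::vector<float> &x) {
  double acc = 0.0;  // accumulate in double on purpose
  for (float v : x) {
    acc += static_cast<double>(v) * static_cast<double>(v);
  }
  return acc;
}
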
* Implement all Bamboo layer tests with Python frontend * Hack to get around bug in I/O buffer An error happens if the mini-batch size is too small --- bamboo/common_python/tools.py | 36 +- .../unit_tests/test_unit_datareader_python.py | 12 +- bamboo/unit_tests/test_unit_layer_argmax.py | 12 +- bamboo/unit_tests/test_unit_layer_argmin.py | 17 +- .../test_unit_layer_channelwise_scale_bias.py | 25 +- bamboo/unit_tests/test_unit_layer_clamp.py | 219 ++++++++++-- .../unit_tests/test_unit_layer_convolution.py | 55 ++-- .../unit_tests/test_unit_layer_covariance.py | 311 +++++++++++++++--- .../test_unit_layer_cross_entropy.py | 45 ++- bamboo/unit_tests/test_unit_layer_elu.py | 217 ++++++++++-- .../unit_tests/test_unit_layer_embedding.py | 21 +- ...nit_layer_entrywise_batch_normalization.py | 16 +- .../test_unit_layer_entrywise_scale_bias.py | 39 +-- .../test_unit_layer_fully_connected.py | 35 +- bamboo/unit_tests/test_unit_layer_identity.py | 233 ++++++++++--- bamboo/unit_tests/test_unit_layer_l1_norm.py | 236 ++++++++++--- bamboo/unit_tests/test_unit_layer_l2_norm2.py | 51 --- .../unit_tests/test_unit_layer_leaky_relu.py | 239 +++++++++++--- .../unit_tests/test_unit_layer_log_sigmoid.py | 238 +++++++++++--- .../unit_tests/test_unit_layer_log_softmax.py | 32 +- .../test_unit_layer_mean_absolute_error.py | 255 +++++++++++--- .../test_unit_layer_mean_squared_error.py | 44 +-- bamboo/unit_tests/test_unit_layer_one_hot.py | 2 +- bamboo/unit_tests/test_unit_layer_relu.py | 234 ++++++++++--- bamboo/unit_tests/test_unit_layer_selu.py | 250 +++++++++++--- bamboo/unit_tests/test_unit_layer_sigmoid.py | 239 +++++++++++--- bamboo/unit_tests/test_unit_layer_slice.py | 33 +- bamboo/unit_tests/test_unit_layer_softmax.py | 30 +- bamboo/unit_tests/test_unit_layer_softplus.py | 235 ++++++++++--- bamboo/unit_tests/test_unit_layer_softsign.py | 238 +++++++++++--- .../test_unit_layer_squared_difference.py | 252 +++++++++++--- .../unit_tests/test_unit_layer_tessellate.py | 244 +++++++++++--- bamboo/unit_tests/test_unit_layer_variance.py | 288 +++++++++++++--- .../model_channelwise_mean.prototext | 93 ------ .../tests/layer_tests/model_clamp.prototext | 123 ------- .../layer_tests/model_covariance.prototext | 127 ------- .../tests/layer_tests/model_elu.prototext | 115 ------- .../layer_tests/model_identity.prototext | 99 ------ .../tests/layer_tests/model_l1_norm.prototext | 99 ------ .../layer_tests/model_l2_norm2.prototext | 79 ----- .../layer_tests/model_leaky_relu.prototext | 115 ------- .../layer_tests/model_log_sigmoid.prototext | 99 ------ .../layer_tests/model_log_softmax.prototext | 99 ------ .../model_mean_absolute_error.prototext | 115 ------- .../tests/layer_tests/model_relu.prototext | 99 ------ .../tests/layer_tests/model_selu.prototext | 99 ------ .../tests/layer_tests/model_sigmoid.prototext | 99 ------ .../layer_tests/model_softplus.prototext | 99 ------ .../layer_tests/model_softsign.prototext | 99 ------ .../model_squared_difference.prototext | 114 ------- .../layer_tests/model_tessellate.prototext | 121 ------- .../layer_tests/model_variance.prototext | 111 ------- 52 files changed, 3368 insertions(+), 3069 deletions(-) delete mode 100644 bamboo/unit_tests/test_unit_layer_l2_norm2.py delete mode 100644 model_zoo/tests/layer_tests/model_channelwise_mean.prototext delete mode 100644 model_zoo/tests/layer_tests/model_clamp.prototext delete mode 100644 model_zoo/tests/layer_tests/model_covariance.prototext delete mode 100644 model_zoo/tests/layer_tests/model_elu.prototext delete mode 100644 
model_zoo/tests/layer_tests/model_identity.prototext delete mode 100644 model_zoo/tests/layer_tests/model_l1_norm.prototext delete mode 100644 model_zoo/tests/layer_tests/model_l2_norm2.prototext delete mode 100644 model_zoo/tests/layer_tests/model_leaky_relu.prototext delete mode 100644 model_zoo/tests/layer_tests/model_log_sigmoid.prototext delete mode 100644 model_zoo/tests/layer_tests/model_log_softmax.prototext delete mode 100644 model_zoo/tests/layer_tests/model_mean_absolute_error.prototext delete mode 100644 model_zoo/tests/layer_tests/model_relu.prototext delete mode 100644 model_zoo/tests/layer_tests/model_selu.prototext delete mode 100644 model_zoo/tests/layer_tests/model_sigmoid.prototext delete mode 100644 model_zoo/tests/layer_tests/model_softplus.prototext delete mode 100644 model_zoo/tests/layer_tests/model_softsign.prototext delete mode 100644 model_zoo/tests/layer_tests/model_squared_difference.prototext delete mode 100644 model_zoo/tests/layer_tests/model_tessellate.prototext delete mode 100644 model_zoo/tests/layer_tests/model_variance.prototext diff --git a/bamboo/common_python/tools.py b/bamboo/common_python/tools.py index 699d78c0a61..c2f25c782b2 100644 --- a/bamboo/common_python/tools.py +++ b/bamboo/common_python/tools.py @@ -1,7 +1,9 @@ +import collections.abc import math import os import re import sys +import numpy as np import pytest @@ -667,7 +669,7 @@ def create_tests(setup_func, test_name_base=None, nodes=1, procs_per_node=None): - """Create functions that can interact with PyTest. + """Create functions that can interact with PyTest This function creates tests that involve running an LBANN experiment with the Python frontend. `setup_func` should be a @@ -793,7 +795,7 @@ def create_python_data_reader(lbann, num_samples_function_name, sample_dims_function_name, execution_mode): - """Create protobuf message for Python data reader. + """Create protobuf message for Python data reader A Python data reader gets data by importing a Python module and calling functions in its scope. @@ -831,3 +833,33 @@ def create_python_data_reader(lbann, reader.python.sample_dims_function = sample_dims_function_name return reader + + +def numpy_l2norm2(x): + """Square of L2 norm, computed with NumPy + + The computation is performed with 64-bit floats. + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + x = x.reshape(-1) + return np.inner(x, x) + + +def make_iterable(obj): + """Convert to an iterable object + + Simply returns `obj` if it is alredy iterable. Otherwise returns a + 1-tuple containing `obj`. 
+ + """ + if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + return obj + else: + return (obj,) + + +def str_list(it): + """Convert an iterable object to a space-separated string""" + return ' '.join([str(i) for i in make_iterable(it)]) diff --git a/bamboo/unit_tests/test_unit_datareader_python.py b/bamboo/unit_tests/test_unit_datareader_python.py index fcf05875b7c..8a9d49a0d52 100644 --- a/bamboo/unit_tests/test_unit_datareader_python.py +++ b/bamboo/unit_tests/test_unit_datareader_python.py @@ -3,7 +3,7 @@ import sys import numpy as np -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) @@ -57,17 +57,17 @@ def construct_model(lbann): # Layer graph x = lbann.Input() - obj = lbann.L2Norm2(x) + y = lbann.L2Norm2(x) layers = list(lbann.traverse_layer_graph(x)) - metric = lbann.Metric(obj, name='obj') + metric = lbann.Metric(y, name='obj') callbacks = [] # Compute expected value with NumPy vals = [] for i in range(num_samples()): - x = get_sample(i) - obj = np.inner(x, x) - vals.append(obj) + x = get_sample(i).astype(np.float64) + y = tools.numpy_l2norm2(x) + vals.append(y) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps callbacks.append(lbann.CallbackCheckMetric( diff --git a/bamboo/unit_tests/test_unit_layer_argmax.py b/bamboo/unit_tests/test_unit_layer_argmax.py index 86d496faefc..71d984c90df 100644 --- a/bamboo/unit_tests/test_unit_layer_argmax.py +++ b/bamboo/unit_tests/test_unit_layer_argmax.py @@ -5,7 +5,7 @@ import sys import numpy as np -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) @@ -61,17 +61,13 @@ def construct_model(lbann): """ - # Convenience function to convert list to a space-separated string - def str_list(it): - return ' '.join([str(i) for i in it]) - # Convenience function to compute L2 norm squared with NumPy def l2_norm2(x): x = x.reshape(-1) return np.inner(x, x) # LBANN implementation - x = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) y = lbann.Argmax(x, device='cpu') z = lbann.L2Norm2(y) @@ -84,9 +80,9 @@ def l2_norm2(x): # Get expected metric value from NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i).reshape(_sample_dims) + x = get_sample(i).reshape(_sample_dims).astype(np.float64) y = np.argmax(x) - z = l2_norm2(y) + z = tools.numpy_l2norm2(y) vals.append(z) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps diff --git a/bamboo/unit_tests/test_unit_layer_argmin.py b/bamboo/unit_tests/test_unit_layer_argmin.py index 0a6c90b12df..8d0299f44c6 100644 --- a/bamboo/unit_tests/test_unit_layer_argmin.py +++ b/bamboo/unit_tests/test_unit_layer_argmin.py @@ -5,7 +5,7 @@ import sys import numpy as np -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) @@ -61,17 +61,8 @@ def construct_model(lbann): """ - # Convenience function to convert list to a space-separated string - def str_list(it): - return ' '.join([str(i) for i in it]) - - # Convenience function to compute L2 norm squared with NumPy - def l2_norm2(x): - x = x.reshape(-1) - return np.inner(x, x) - # LBANN 
implementation - x = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) y = lbann.Argmin(x, device='cpu') z = lbann.L2Norm2(y) @@ -84,9 +75,9 @@ def l2_norm2(x): # Get expected metric value from NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i).reshape(_sample_dims) + x = get_sample(i).reshape(_sample_dims).astype(np.float64) y = np.argmin(x) - z = l2_norm2(y) + z = tools.numpy_l2norm2(y) vals.append(z) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps diff --git a/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py index a9b72dda5e0..748be5b7e0c 100644 --- a/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py +++ b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py @@ -5,7 +5,7 @@ import sys import numpy as np -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) @@ -59,15 +59,6 @@ def construct_model(lbann): """ - # Convenience function to convert list to a space-separated string - def str_list(it): - return ' '.join([str(i) for i in it]) - - # Convenience function to compute L2 norm squared with NumPy - def l2_norm2(x): - x = x.reshape(-1) - return np.inner(x, x) - # Input data # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. @@ -75,13 +66,13 @@ def l2_norm2(x): initializer=lbann.ConstantInitializer(value=0.0), name='input_weights') x0 = lbann.WeightsLayer(weights=x_weights, - dims=str_list(_sample_dims)) - x1 = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + dims=tools.str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) x = lbann.Sum([x0, x1]) # Apply channel-wise scale/bias - scale_values = str_list(np.nditer(_scale)) - bias_values = str_list(np.nditer(_bias)) + scale_values = tools.str_list(np.nditer(_scale)) + bias_values = tools.str_list(np.nditer(_bias)) scalebias_weights = lbann.Weights( optimizer=lbann.SGD(), initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values, @@ -100,9 +91,9 @@ def l2_norm2(x): # Get expected metric value from NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i).reshape(_sample_dims) - y = _scale * x + _bias - z = l2_norm2(y) + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = _scale.astype(np.float64) * x + _bias.astype(np.float64) + z = tools.numpy_l2norm2(y) vals.append(z) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py b/bamboo/unit_tests/test_unit_layer_clamp.py index d1fda5ac057..cb439755f9e 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -1,48 +1,195 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and 
calls +# the functions below to ingest data. + +# Data +# Note: The clamp function is not differentiable at the interval +# boundaries, so we make sure values are well inside or well outside +# the interval. +np.random.seed(201910241) +_num_samples = 27 +_sample_size = 11 +_samples = np.random.choice([-193.0,-4.0,-1.0,1.0,3.0,5.0,2003.0], + size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Clamp(x, min=-2, max=2, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.clip(x, -2, 2) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Clamp(x, min=0, max=4, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel output')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.clip(x, 0, 4) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def skeleton_layer_clamp(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_clamp: default_exes[%s] does 
not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_clamp_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_clamp_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='clamp', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) + # ------------------------------------------ + # Construct model + # ------------------------------------------ + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_clamp_clang6(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_clamp(cluster, exes, dirname, 'clang6', weekly, data_reader_percent) +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_clamp_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_clamp(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_clamp_intel19(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_clamp(cluster, exes, dirname, 'intel19', weekly, data_reader_percent) + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
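The clamp test above follows the metric-tolerance pattern shared by all of these rewritten tests: LBANN evaluates the metric in 32-bit floats while the NumPy reference runs in 64-bit, so the check allows a few ulps of float32 slack, scaled by the value itself. A standalone sketch of the arithmetic (illustrative only, not part of the patch; `val` stands in for the NumPy-computed mean):

    import numpy as np

    val = 42.0                                # stand-in for the NumPy reference value
    tol = 8 * val * np.finfo(np.float32).eps  # float32 eps is about 1.19e-07
    lower_bound, upper_bound = val - tol, val + tol
    # CallbackCheckMetric raises an error unless the LBANN metric lands
    # inside [lower_bound, upper_bound] during the test phase.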
+
+    message = lbann.reader_pb2.DataReader()
+    message.reader.extend([
+        tools.create_python_data_reader(
+            lbann,
+            current_file,
+            'get_sample',
+            'num_samples',
+            'sample_dims',
+            'train'
+        )
+    ])
+    message.reader.extend([
+        tools.create_python_data_reader(
+            lbann,
+            current_file,
+            'get_sample',
+            'num_samples',
+            'sample_dims',
+            'test'
+        )
+    ])
+    return message
+
+# ==============================================
+# Setup PyTest
+# ==============================================
 
-# Run with python3 -m pytest -s test_unit_layer_clamp.py -k 'test_unit_layer_clamp_exe' --exe=
-def test_unit_layer_clamp_exe(cluster, dirname, exe, weekly, data_reader_percent):
-    if exe is None:
-        e = 'test_unit_layer_clamp_exe: Non-local testing'
-        print('Skip - ' + e)
-        pytest.skip(e)
-    exes = {'exe': exe}
-    skeleton_layer_clamp(cluster, exes, dirname, 'exe', weekly, data_reader_percent)
+# Create test functions that can interact with PyTest
+for test in tools.create_tests(setup_experiment, __file__):
+    globals()[test.__name__] = test
diff --git a/bamboo/unit_tests/test_unit_layer_convolution.py b/bamboo/unit_tests/test_unit_layer_convolution.py
index 1962a83e4d3..dbf92edc585 100644
--- a/bamboo/unit_tests/test_unit_layer_convolution.py
+++ b/bamboo/unit_tests/test_unit_layer_convolution.py
@@ -6,7 +6,7 @@
 import sys
 import numpy as np
 
-# Local files
+# Bamboo utilities
 current_file = os.path.realpath(__file__)
 current_dir = os.path.dirname(current_file)
 sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python'))
@@ -70,15 +70,21 @@ def pytorch_convolution(data,
 
     """
 
-    # Convert input data to PyTorch tensors
+    # Convert input data to PyTorch tensors with 64-bit floats
     import torch
     import torch.nn.functional
     if type(data) is np.ndarray:
-        data = torch.from_numpy(data.astype(np.float64))
+        data = torch.from_numpy(data)
     if type(kernel) is np.ndarray:
-        kernel = torch.from_numpy(kernel.astype(np.float64))
+        kernel = torch.from_numpy(kernel)
     if type(bias) is np.ndarray:
-        bias = torch.from_numpy(bias.astype(np.float64))
+        bias = torch.from_numpy(bias)
+    if data.dtype is not torch.float64:
+        data = data.to(torch.float64)
+    if kernel.dtype is not torch.float64:
+        kernel = kernel.to(torch.float64)
+    if bias.dtype is not torch.float64:
+        bias = bias.to(torch.float64)
 
     # Perform convolution with PyTorch
     output = None
@@ -125,15 +131,6 @@ def construct_model(lbann):
 
     """
 
-    # Convenience function to convert list to a space-separated string
-    def str_list(it):
-        return ' '.join([str(i) for i in it])
-
-    # Convenience function to compute L2 norm squared with NumPy
-    def l2_norm2(x):
-        x = x.reshape(-1).astype(np.float64)
-        return np.inner(x, x)
-
     # Input data
     # Note: Sum with a weights layer so that gradient checking will
     # verify that error signals are correct.
@@ -141,8 +138,8 @@ def l2_norm2(x): initializer=lbann.ConstantInitializer(value=0.0), name='input_weights') x0 = lbann.WeightsLayer(weights=x_weights, - dims=str_list(_sample_dims)) - x1 = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + dims=tools.str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) x = lbann.Sum([x0, x1]) x_lbann = x @@ -167,12 +164,12 @@ def l2_norm2(x): # Apply convolution kernel_weights = lbann.Weights( optimizer=lbann.SGD(), - initializer=lbann.ValueInitializer(values=str_list(np.nditer(kernel))), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(kernel))), name='kernel1' ) bias_weights = lbann.Weights( optimizer=lbann.SGD(), - initializer=lbann.ValueInitializer(values=str_list(np.nditer(bias))), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(bias))), name='bias1' ) x = x_lbann @@ -181,10 +178,10 @@ def l2_norm2(x): num_dims=3, num_output_channels=kernel_dims[0], has_vectors=True, - conv_dims=str_list(kernel_dims[2:]), - conv_strides=str_list(strides), - conv_pads=str_list(pads), - conv_dilations=str_list(dilations), + conv_dims=tools.str_list(kernel_dims[2:]), + conv_strides=tools.str_list(strides), + conv_pads=tools.str_list(pads), + conv_dilations=tools.str_list(dilations), has_bias=True) z = lbann.L2Norm2(y) obj.append(z) @@ -197,7 +194,7 @@ def l2_norm2(x): x, kernel, bias=bias, stride=strides, padding=pads, dilation=dilations ) - z = l2_norm2(y) / _num_samples + z = tools.numpy_l2norm2(y) / _num_samples val = z except: # Precomputed value @@ -225,7 +222,7 @@ def l2_norm2(x): # Apply convolution kernel_weights = lbann.Weights( optimizer=lbann.SGD(), - initializer=lbann.ValueInitializer(values=str_list(np.nditer(kernel))), + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(kernel))), name='kernel2' ) x = x_lbann @@ -234,10 +231,10 @@ def l2_norm2(x): num_dims=3, num_output_channels=kernel_dims[0], has_vectors=True, - conv_dims=str_list(kernel_dims[2:]), - conv_strides=str_list(strides), - conv_pads=str_list(pads), - conv_dilations=str_list(dilations), + conv_dims=tools.str_list(kernel_dims[2:]), + conv_strides=tools.str_list(strides), + conv_pads=tools.str_list(pads), + conv_dilations=tools.str_list(dilations), num_groups=num_groups, has_bias=False) z = lbann.L2Norm2(y) @@ -252,7 +249,7 @@ def l2_norm2(x): stride=strides, padding=pads, dilation=dilations, groups=num_groups ) - z = l2_norm2(y) / _num_samples + z = tools.numpy_l2norm2(y) / _num_samples val = z except: # Precomputed value diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py index fc9961a222a..bf5c78eddf4 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -1,55 +1,264 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
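As the note above says, the data reader imports the test file as a module and calls the three sample-access functions that follow. Conceptually, the consuming side behaves like the sketch below (an approximation for orientation only; the real reader is implemented inside LBANN, not in Python, and the module name is just this test file's):

    import importlib

    module = importlib.import_module('test_unit_layer_covariance')
    dims = module.sample_dims()              # dimensions of one flattened sample
    for i in range(module.num_samples()):
        sample = module.get_sample(i)        # 1-D numpy array for sample i
        assert sample.shape == dims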
+ +# Data +np.random.seed(201910242) +_samples = np.random.normal(size=(27,2,5)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + slice_size = _samples.shape[-1] + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, slice_size, 2*slice_size])) + x0 = lbann.Sum([x_slice, + lbann.WeightsLayer(weights=x0_weights, + dims=str(slice_size))]) + x1 = lbann.Sum([x_slice, + lbann.WeightsLayer(weights=x1_weights, + dims=str(slice_size))]) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance([x0, x1], data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = np.cov(np.stack((x0,x1), axis=0), bias=False)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance([x0, x1], data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:slice_size] + x1 = x[slice_size:] + y = np.cov(np.stack((x0,x1), axis=0), bias=False)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, biased + # ------------------------------------------ + + # LBANN implementation 
+ x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance([x0, x1], biased=True, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + x0 = x[:slice_size].astype(np.float64) + x1 = x[slice_size:].astype(np.float64) + y = np.cov(np.stack((x0,x1), axis=0), bias=True)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, biased + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.Covariance([x0, x1], biased=True, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i) + x0 = x[:slice_size].astype(np.float64) + x1 = x[slice_size:].astype(np.float64) + y = np.cov(np.stack((x0,x1), axis=0), bias=True)[0,1] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
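The four metric sections above differ only in layout and in np.cov's bias flag, which switches the divisor between n-1 and n. A small self-contained check of that equivalence (values chosen arbitrarily):

    import numpy as np

    x0 = np.array([1.0, 2.0, 4.0])
    x1 = np.array([0.5, 1.5, 3.5])
    n = x0.size
    s = np.dot(x0 - x0.mean(), x1 - x1.mean())   # sum of deviation products
    assert np.isclose(s / (n - 1), np.cov(np.stack((x0, x1)), bias=False)[0, 1])
    assert np.isclose(s / n, np.cov(np.stack((x0, x1)), bias=True)[0, 1])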
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_covariance(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_covariance: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_covariance_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_covariance_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='covariance', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_covariance_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_covariance(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_covariance_gcc7(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_covariance(cluster, exes, dirname, 'gcc7', - weekly, data_reader_percent) - - -def test_unit_layer_covariance_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_covariance(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_covariance_exe' --exe= -def test_unit_layer_covariance_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_covariance_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_covariance(cluster, exes, dirname, 'exe', - weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_cross_entropy.py b/bamboo/unit_tests/test_unit_layer_cross_entropy.py index c692b2b4f34..03272f733d8 100644 --- a/bamboo/unit_tests/test_unit_layer_cross_entropy.py +++ b/bamboo/unit_tests/test_unit_layer_cross_entropy.py @@ -5,7 +5,7 @@ import sys import numpy as np -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) @@ -38,6 +38,26 @@ def num_samples(): def sample_dims(): return (2*_samples.shape[-1],) +# ============================================== +# NumPy cross entropy +# ============================================== + +def numpy_cross_entropy(x, xhat): + """Cross entropy between two distributions, computed with NumPy + + The computation is performed with 64-bit floats. 
+ + Args: + x: Estimated distribution + xhat: True distribution + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + if xhat.dtype is not np.float64: + xhat = xhat.astype(np.float64) + return -np.inner(xhat, np.log(x)) + # ============================================== # Setup LBANN experiment # ============================================== @@ -63,15 +83,6 @@ def construct_model(lbann): """ - # Convenience function to convert list to a space-separated string - def str_list(it): - return ' '.join([str(i) for i in it]) - - # Convenience function to compute L2 norm squared with NumPy - def l2_norm2(x): - x = x.reshape(-1) - return np.inner(x, x) - # Input data # Note: Sum with weights layers so that gradient checking will # verify that error signals are correct. @@ -83,7 +94,7 @@ def l2_norm2(x): initializer=lbann.ConstantInitializer(value=0.0), name='input1_weights') x_slice = lbann.Slice(lbann.Input(), - slice_points=str_list([0, slice_size, 2*slice_size])) + slice_points=tools.str_list([0, slice_size, 2*slice_size])) x0 = lbann.Sum([x_slice, lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))]) @@ -108,16 +119,16 @@ def l2_norm2(x): y = lbann.CrossEntropy([x0, x1], data_layout='data_parallel') z = lbann.L2Norm2(y) obj.append(z) - metrics.append(lbann.Metric(z, name='data-parallel output')) + metrics.append(lbann.Metric(z, name='data-parallel layout')) # NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i) + x = get_sample(i).astype(np.float64) x0 = x[:slice_size] x1 = x[slice_size:] y = -np.inner(x1, np.log(x0)) - z = l2_norm2(y) + z = tools.numpy_l2norm2(y) vals.append(z) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps @@ -138,16 +149,16 @@ def l2_norm2(x): y = lbann.CrossEntropy([x0, x1], data_layout='model_parallel') z = lbann.L2Norm2(y) obj.append(z) - metrics.append(lbann.Metric(z, name='model-parallel output')) + metrics.append(lbann.Metric(z, name='model-parallel layout')) # NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i) + x = get_sample(i).astype(np.float64) x0 = x[:slice_size] x1 = x[slice_size:] y = -np.inner(x1, np.log(x0)) - z = l2_norm2(y) + z = tools.numpy_l2norm2(y) vals.append(z) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index 4ab8c576325..2f2e94d8ee3 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -1,48 +1,193 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: ELU is not differentiable at 0, so we make sure values +# are away from 0. 
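The construction described in the note keeps every sample at least 0.5 away from the non-differentiable point: the base magnitudes are exactly 1 and the jitter never exceeds 0.5 in magnitude. A quick standalone check of that invariant (any seed):

    import numpy as np

    base = np.random.choice([-1.0, 1.0], size=10000)
    jitter = np.random.uniform(-0.5, 0.5, size=10000)
    assert np.abs(base + jitter).min() >= 0.5   # |±1 + j| >= 0.5 whenever |j| <= 0.5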
+np.random.seed(201910243) +_num_samples = 37 +_sample_size = 8 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Elu(x, alpha=1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x < 0, np.expm1(x), x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Elu(x, alpha=0.5, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x < 0, 0.5*np.expm1(x), x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) -def skeleton_layer_elu(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_elu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_elu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = 
'%s/bamboo/unit_tests/error/layer_elu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='elu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) + # ------------------------------------------ + # Construct model + # ------------------------------------------ + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) -def test_unit_layer_elu_clang6(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_elu(cluster, exes, dirname, 'clang6', weekly, data_reader_percent) +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + The Python data reader will import the current Python file to + access the sample access functions. -def test_unit_layer_elu_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_elu(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) + Args: + lbann (module): Module for LBANN Python frontend + """ -def test_unit_layer_elu_intel19(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_elu(cluster, exes, dirname, 'intel19', weekly, data_reader_percent) + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
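Each rewritten test registers the same sample functions twice, once per execution mode, because of the issue-1098 workaround noted above. A more compact but equivalent formulation of the two registrations that follow would loop over the modes; a sketch, assuming the module-level `tools` and `current_file` defined at the top of each test file (the patch itself spells out both calls):

    def construct_data_reader(lbann):
        message = lbann.reader_pb2.DataReader()
        for mode in ('train', 'test'):   # 'train' is the issue-1098 placeholder
            message.reader.extend([
                tools.create_python_data_reader(
                    lbann,
                    current_file,
                    'get_sample',
                    'num_samples',
                    'sample_dims',
                    mode,
                )
            ])
        return message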
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message +# ============================================== +# Setup PyTest +# ============================================== -# Run with python3 -m pytest -s test_unit_layer_elu.py -k 'test_unit_layer_elu_exe' --exe= -def test_unit_layer_elu_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_elu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_elu(cluster, exes, dirname, 'exe', weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_embedding.py b/bamboo/unit_tests/test_unit_layer_embedding.py index 4677ea76b67..30c7ac991ae 100644 --- a/bamboo/unit_tests/test_unit_layer_embedding.py +++ b/bamboo/unit_tests/test_unit_layer_embedding.py @@ -5,7 +5,7 @@ import sys import numpy as np -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) @@ -58,15 +58,6 @@ def construct_model(lbann): """ - # Convenience function to convert list to a space-separated string - def str_list(it): - return ' '.join([str(i) for i in it]) - - # Convenience function to compute L2 norm squared with NumPy - def l2_norm2(x): - x = x.reshape(-1) - return np.inner(x, x) - # Input data x = lbann.Identity(lbann.Input()) x_lbann = x @@ -88,7 +79,7 @@ def l2_norm2(x): # LBANN implementation embedding_weights = lbann.Weights( optimizer=lbann.SGD(), - initializer=lbann.ValueInitializer(values=str_list(np.nditer(embeddings))) + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(embeddings))) ) x = x_lbann y = lbann.Embedding(x, @@ -105,7 +96,7 @@ def l2_norm2(x): for i in range(num_samples()): x = get_sample(i)[0] y = embeddings[x] - z = l2_norm2(y) + z = tools.numpy_l2norm2(y) vals.append(z) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps @@ -131,7 +122,7 @@ def l2_norm2(x): # is set. Avoid gradient checking by not using an optimizer. 
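The padding semantics exercised below reduce to a few lines of NumPy; this hypothetical helper (`embedding_lookup` is not part of the patch) restates the reference logic the test inlines:

    import numpy as np

    def embedding_lookup(idx, embeddings, padding_idx=None):
        """Row lookup where the padding index always maps to a zero vector."""
        if padding_idx is not None and idx == padding_idx:
            return np.zeros(embeddings.shape[1])
        return embeddings[idx]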
embedding_weights = lbann.Weights( optimizer=None, - initializer=lbann.ValueInitializer(values=str_list(np.nditer(embeddings))) + initializer=lbann.ValueInitializer(values=tools.str_list(np.nditer(embeddings))) ) x = x_lbann y = lbann.Embedding(x, @@ -149,10 +140,10 @@ def l2_norm2(x): for i in range(num_samples()): x = get_sample(i)[0] if x == padding_idx: - y = np.zeros(shape=embedding_dim, dtype=np.float32) + y = np.zeros(shape=embedding_dim) else: y = embeddings[x] - z = l2_norm2(y) + z = tools.numpy_l2norm2(y) vals.append(z) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py index 2e9997a81ae..77fd72ae22b 100644 --- a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py +++ b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py @@ -5,7 +5,7 @@ import sys import numpy as np -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) @@ -57,23 +57,19 @@ def construct_model(lbann): """ - # Convenience function to convert list to a space-separated string - def str_list(it): - return ' '.join([str(i) for i in it]) - # Input data # Note: We want to use gradient checking to verify that error # signals are correct. To do this, we zero-initialize a weights # object, construct a zero-valued tensor, and add it to the # input. To make sure that batchnorm is non-trivial, we multiply # the zero-valued tensor by the mini-batch index. - x = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) x_weights = lbann.Weights(optimizer=lbann.SGD(), initializer=lbann.ConstantInitializer(value=0.0)) x0 = lbann.WeightsLayer(weights=x_weights, - dims=str_list(_sample_dims)) + dims=tools.str_list(_sample_dims)) x1 = lbann.Divide([lbann.MiniBatchIndex(), lbann.MiniBatchSize()]) - x1 = lbann.Tessellate(lbann.Reshape(x1, dims='1 1 1'), dims=str_list(_sample_dims)) + x1 = lbann.Tessellate(lbann.Reshape(x1, dims='1 1 1'), dims=tools.str_list(_sample_dims)) x = lbann.Sum([x, lbann.Multiply([x0, x1])]) x_lbann = x @@ -96,7 +92,7 @@ def str_list(it): data_layout='data_parallel') z = lbann.L2Norm2(y) obj.append(z) - metrics.append(lbann.Metric(z, name='data-parallel output')) + metrics.append(lbann.Metric(z, name='data-parallel layout')) # ------------------------------------------ # Model-parallel layout @@ -112,7 +108,7 @@ def str_list(it): data_layout='model_parallel') z = lbann.L2Norm2(y) obj.append(z) - metrics.append(lbann.Metric(z, name='model-parallel output')) + metrics.append(lbann.Metric(z, name='model-parallel layout')) # ------------------------------------------ # Gradient checking diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py index e3fd0382af8..75e2cdd5bde 100644 --- a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py +++ b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py @@ -5,7 +5,7 @@ import sys import numpy as np -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) @@ -59,23 +59,14 @@ def construct_model(lbann): """ - # Convenience function to convert list to a 
space-separated string - def str_list(it): - return ' '.join([str(i) for i in it]) - - # Convenience function to compute L2 norm squared with NumPy - def l2_norm2(x): - x = x.reshape(-1) - return np.inner(x, x) - # Input data # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. x_weights = lbann.Weights(optimizer=lbann.SGD(), initializer=lbann.ConstantInitializer(value=0.0)) x0 = lbann.WeightsLayer(weights=x_weights, - dims=str_list(_sample_dims)) - x1 = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + dims=tools.str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) x = lbann.Sum([x0, x1]) x_lbann = x @@ -89,8 +80,8 @@ def l2_norm2(x): # ------------------------------------------ # LBANN implementation - scale_values = str_list(np.nditer(_scale)) - bias_values = str_list(np.nditer(_bias)) + scale_values = tools.str_list(np.nditer(_scale)) + bias_values = tools.str_list(np.nditer(_bias)) scalebias_weights = lbann.Weights( optimizer=lbann.SGD(), initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values, @@ -101,14 +92,14 @@ def l2_norm2(x): data_layout='data_parallel') z = lbann.L2Norm2(y) obj.append(z) - metrics.append(lbann.Metric(z, name='data-parallel output')) + metrics.append(lbann.Metric(z, name='data-parallel layout')) # NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i).reshape(_sample_dims) - y = _scale * x + _bias - z = l2_norm2(y) + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = _scale.astype(np.float64) * x + _bias.astype(np.float64) + z = tools.numpy_l2norm2(y) vals.append(z) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps @@ -124,8 +115,8 @@ def l2_norm2(x): # ------------------------------------------ # LBANN implementation - scale_values = str_list(np.nditer(_scale)) - bias_values = str_list(np.nditer(_bias)) + scale_values = tools.str_list(np.nditer(_scale)) + bias_values = tools.str_list(np.nditer(_bias)) scalebias_weights = lbann.Weights( optimizer=lbann.SGD(), initializer=lbann.ValueInitializer(values='{} {}'.format(scale_values, @@ -136,14 +127,14 @@ def l2_norm2(x): data_layout='model_parallel') z = lbann.L2Norm2(y) obj.append(z) - metrics.append(lbann.Metric(z, name='model-parallel output')) + metrics.append(lbann.Metric(z, name='model-parallel layout')) # NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i).reshape(_sample_dims) - y = _scale * x + _bias - z = l2_norm2(y) + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = _scale.astype(np.float64) * x + _bias.astype(np.float64) + z = tools.numpy_l2norm2(y) vals.append(z) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps diff --git a/bamboo/unit_tests/test_unit_layer_fully_connected.py b/bamboo/unit_tests/test_unit_layer_fully_connected.py index f7bc06a52cd..47469c1d499 100644 --- a/bamboo/unit_tests/test_unit_layer_fully_connected.py +++ b/bamboo/unit_tests/test_unit_layer_fully_connected.py @@ -5,7 +5,7 @@ import sys import numpy as np -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) @@ -57,10 +57,6 @@ def construct_model(lbann): """ - # Convenience function to convert list to a space-separated string - def str_list(it): - return ' '.join([str(i) for i in it]) - # Input data # Note: Sum with a weights layer so that gradient checking will 
# verify that error signals are correct. @@ -80,25 +76,20 @@ def str_list(it): # Compute expected metric values with NumPy # ------------------------------------------ - # Convenience function to compute L2 norm squared with NumPy - def l2_norm2(x): - x = x.reshape(-1) - return np.inner(x, x) - # Weight values linearity = np.random.normal(size=(_output_size,_input_size)).astype(np.float32) bias = np.random.normal(size=(_output_size,1)).astype(np.float32) # With bias - x = _samples.transpose() - y = np.matmul(linearity, x) + bias - z = l2_norm2(y) / _num_samples + x = _samples.transpose().astype(np.float64) + y = np.matmul(linearity.astype(np.float64), x) + bias.astype(np.float64) + z = tools.numpy_l2norm2(y) / _num_samples val_with_bias = z # Without bias - x = _samples.transpose() - y = np.matmul(linearity, x) - z = l2_norm2(y) / _num_samples + x = _samples.transpose().astype(np.float64) + y = np.matmul(linearity.astype(np.float64), x) + z = tools.numpy_l2norm2(y) / _num_samples val_without_bias = z # ------------------------------------------ @@ -109,13 +100,13 @@ def l2_norm2(x): linearity_weights = lbann.Weights( optimizer=lbann.SGD(), initializer=lbann.ValueInitializer( - values=str_list(np.nditer(linearity, order='F')) + values=tools.str_list(np.nditer(linearity, order='F')) ) ) bias_weights = lbann.Weights( optimizer=lbann.SGD(), initializer=lbann.ValueInitializer( - values=str_list(np.nditer(bias)) + values=tools.str_list(np.nditer(bias)) ) ) x = x_lbann @@ -147,13 +138,13 @@ def l2_norm2(x): linearity_weights = lbann.Weights( optimizer=lbann.SGD(), initializer=lbann.ValueInitializer( - values=str_list(np.nditer(linearity, order='F')) + values=tools.str_list(np.nditer(linearity, order='F')) ) ) bias_weights = lbann.Weights( optimizer=lbann.SGD(), initializer=lbann.ValueInitializer( - values=str_list(np.nditer(bias)) + values=tools.str_list(np.nditer(bias)) ) ) x = x_lbann @@ -185,7 +176,7 @@ def l2_norm2(x): linearity_weights = lbann.Weights( optimizer=lbann.SGD(), initializer=lbann.ValueInitializer( - values=str_list(np.nditer(linearity, order='C')) + values=tools.str_list(np.nditer(linearity, order='C')) ) ) x = x_lbann @@ -217,7 +208,7 @@ def l2_norm2(x): linearity_weights = lbann.Weights( optimizer=lbann.SGD(), initializer=lbann.ValueInitializer( - values=str_list(np.nditer(linearity, order='C')) + values=tools.str_list(np.nditer(linearity, order='C')) ) ) x = x_lbann diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py index b2e7d2058cb..0bc14a7dcb1 100644 --- a/bamboo/unit_tests/test_unit_layer_identity.py +++ b/bamboo/unit_tests/test_unit_layer_identity.py @@ -1,52 +1,189 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
+ +# Data +np.random.seed(201910244) +_num_samples = 83 +_sample_size = 47 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Identity(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Identity(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_identity(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_identity: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_identity_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_identity_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='identity', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_identity_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_identity(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_identity_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_identity(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) - - -def test_unit_layer_identity_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_identity(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_layer_identity.py -k 'test_unit_layer_identity_exe' --exe= -def test_unit_layer_identity_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_identity_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_identity(cluster, exes, dirname, 'exe', weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index 92fdd3c36c7..06c7a517371 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -1,51 +1,193 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data 
reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The L1 norm is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910245) +_num_samples = 23 +_sample_size = 11 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.L1Norm(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.linalg.norm(x, 1) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.L1Norm(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.linalg.norm(x, 1) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + 
layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_l1_norm(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_l1_norm: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_l1_norm_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_l1_norm_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='l1_norm', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_l1_norm_clang6(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_l1_norm(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_l1_norm_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_l1_norm(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) - - -def test_unit_layer_l1_norm_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_l1_norm(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l1_norm_exe' --exe= -def test_unit_layer_l1_norm_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_l1_norm_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_l1_norm(cluster, exes, dirname, 'exe', weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_l2_norm2.py b/bamboo/unit_tests/test_unit_layer_l2_norm2.py deleted file mode 100644 index 90901e881b1..00000000000 --- a/bamboo/unit_tests/test_unit_layer_l2_norm2.py +++ /dev/null @@ -1,51 +0,0 @@ -import sys -sys.path.insert(0, '../common_python') -import tools -import pytest -import os - - -def skeleton_layer_l2_norm2(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 
'skeleton_layer_l2_norm2: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_l2_norm2_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_l2_norm2_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='l2_norm2', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_l2_norm2_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_l2_norm2(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - -def test_unit_layer_l2_norm2_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_l2_norm2(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) - - -def test_unit_layer_l2_norm2_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_l2_norm2(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_l2_norm2_exe' --exe= -def test_unit_layer_l2_norm2_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_l2_norm2_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_l2_norm2(cluster, exes, dirname, 'exe', weekly, data_reader_percent) diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index 73e5f02d769..b848266d19e 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -1,54 +1,193 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: The leaky ReLU is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(201910246) +_num_samples = 23 +_sample_size = 11 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LeakyRelu(x, negative_slope=0.01, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x > 0, x, 0.01*x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.LeakyRelu(x, negative_slope=2, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.where(x > 0, x, 2*x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
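+    # The model in this test runs zero training epochs and checks its
+    # metrics in 'test' mode only, so just the 'test' reader below is
+    # actually consumed; the 'train' reader appears to exist purely to
+    # satisfy the current reader-setup logic (see the issue above).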
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_leaky_relu(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_leaky_relu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_leaky_relu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_leaky_relu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='leaky_relu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_leaky_relu_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_leaky_relu_gcc7(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'gcc7', - weekly, data_reader_percent) - - -def test_unit_layer_leaky_relu_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_leaky_relu(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_leaky_relu_exe' --exe= -def test_unit_layer_leaky_relu_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_leaky_relu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_leaky_relu(cluster, exes, dirname, 'exe', weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index a4265f52bb9..6e3c6717732 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -1,55 +1,191 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
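+# Roughly, the reader's contract looks like the following sketch
+# (hypothetical code, not the actual reader implementation):
+#   mod = importlib.import_module('test_unit_layer_log_sigmoid')
+#   for i in range(mod.num_samples()):
+#       assert mod.get_sample(i).shape == tuple(mod.sample_dims())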
+
+# Data
+# Note: Log-sigmoid is smooth, so the samples do not need to be kept
+# away from any non-differentiable points.
+np.random.seed(201910247)
+_num_samples = 23
+_sample_size = 7
+_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32)
+
+# Sample access functions
+def get_sample(index):
+    return _samples[index,:]
+def num_samples():
+    return _num_samples
+def sample_dims():
+    return (_sample_size,)
+
+# ==============================================
+# Setup LBANN experiment
+# ==============================================
+
+def setup_experiment(lbann):
+    """Construct LBANN experiment.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+    trainer = lbann.Trainer()
+    model = construct_model(lbann)
+    data_reader = construct_data_reader(lbann)
+    optimizer = lbann.NoOptimizer()
+    return trainer, model, data_reader, optimizer
+
+def construct_model(lbann):
+    """Construct LBANN model.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Input data
+    # Note: Sum with a weights layer so that gradient checking will
+    # verify that error signals are correct.
+    x_weights = lbann.Weights(optimizer=lbann.SGD(),
+                              initializer=lbann.ConstantInitializer(value=0.0))
+    x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size))
+    x1 = lbann.Identity(lbann.Input())
+    x = lbann.Sum([x0, x1])
+    x_lbann = x
+
+    # Objects for LBANN model
+    obj = []
+    metrics = []
+    callbacks = []
+
+    # ------------------------------------------
+    # Data-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    x = x_lbann
+    y = lbann.LogSigmoid(x, data_layout='data_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='data-parallel layout'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).astype(np.float64)
+        y = x - np.log1p(np.exp(x))
+        z = tools.numpy_l2norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Model-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    x = x_lbann
+    y = lbann.LogSigmoid(x, data_layout='model_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='model-parallel layout'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).astype(np.float64)
+        y = x - np.log1p(np.exp(x))
+        z = tools.numpy_l2norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Gradient checking
+    # ------------------------------------------
+
+    callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True))
+
+    # ------------------------------------------
+    # Construct model
+    # ------------------------------------------
+
+    mini_batch_size = num_samples() // 2
+    num_epochs = 0
+    return lbann.Model(mini_batch_size,
+                       num_epochs,
+                       layers=lbann.traverse_layer_graph(x_lbann),
+                       objective_function=obj,
+                       metrics=metrics,
+                       callbacks=callbacks)
+
+def construct_data_reader(lbann):
+    """Construct Protobuf message for Python data reader.
+ + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_log_sigmoid(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_log_sigmoid: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_log_sigmoid_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_log_sigmoid_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='log_sigmoid', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_log_sigmoid_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_log_sigmoid_gcc7(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'gcc7', - weekly, data_reader_percent) - - -def test_unit_layer_log_sigmoid_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_layer_log_sigmoid.py -k 'test_unit_layer_log_sigmoid_exe' --exe= -def test_unit_layer_log_sigmoid_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_log_sigmoid_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_log_sigmoid(cluster, exes, dirname, 'exe', - weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index 8250b199895..c135c316dc9 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -5,7 +5,7 @@ import sys import numpy as np -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) @@ -36,8 +36,13 @@ def sample_dims(): # ============================================== def numpy_log_softmax(x): - 
"""NumPy implementation of log-softmax.""" - x = x.astype(np.float64) + """Log-softmax, computed with NumPy + + The computation is performed with 64-bit floats. + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) x = x - np.max(x) return x - np.log(np.sum(np.exp(x))) @@ -66,15 +71,6 @@ def construct_model(lbann): """ - # Convenience function to convert list to a space-separated string - def str_list(it): - return ' '.join([str(i) for i in it]) - - # Convenience function to compute L2 norm squared with NumPy - def l2_norm2(x): - x = x.reshape(-1).astype(np.float64) - return np.inner(x, x) - # Input data # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. @@ -99,14 +95,14 @@ def l2_norm2(x): y = lbann.LogSoftmax(x, data_layout='data_parallel') z = lbann.L2Norm2(y) obj.append(z) - metrics.append(lbann.Metric(z, name='data-parallel output')) + metrics.append(lbann.Metric(z, name='data-parallel layout')) # NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i) + x = get_sample(i).astype(np.float64) y = numpy_log_softmax(x) - z = l2_norm2(y) + z = tools.numpy_l2norm2(y) vals.append(z) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps @@ -126,14 +122,14 @@ def l2_norm2(x): y = lbann.LogSoftmax(x, data_layout='model_parallel') z = lbann.L2Norm2(y) obj.append(z) - metrics.append(lbann.Metric(z, name='model-parallel output')) + metrics.append(lbann.Metric(z, name='model-parallel layout')) # NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i) + x = get_sample(i).astype(np.float64) y = numpy_log_softmax(x) - z = l2_norm2(y) + z = tools.numpy_l2norm2(y) vals.append(z) val = np.mean(vals) tol = 8 * val * np.finfo(np.float32).eps diff --git a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index b37f79e6ef0..e196ce1289a 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -1,56 +1,207 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: MAE is not differentiable when the two inputs match, so we +# make sure inputs have separated values. +np.random.seed(201910248) +_samples = np.random.uniform(-0.25, 0.25, size=(27,2,7)).astype(np.float32) +_samples[:,1,:] += np.random.choice([-1.0,1.0], size=(27,7)) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (2*_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. 
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+    trainer = lbann.Trainer()
+    model = construct_model(lbann)
+    data_reader = construct_data_reader(lbann)
+    optimizer = lbann.NoOptimizer()
+    return trainer, model, data_reader, optimizer
+
+def construct_model(lbann):
+    """Construct LBANN model.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Input data
+    # Note: Sum with weights layers so that gradient checking will
+    # verify that error signals are correct.
+    slice_size = _samples.shape[-1]
+    x0_weights = lbann.Weights(optimizer=lbann.SGD(),
+                               initializer=lbann.ConstantInitializer(value=0.0),
+                               name='input0_weights')
+    x1_weights = lbann.Weights(optimizer=lbann.SGD(),
+                               initializer=lbann.ConstantInitializer(value=0.0),
+                               name='input1_weights')
+    x_slice = lbann.Slice(lbann.Input(),
+                          slice_points=tools.str_list([0, slice_size, 2*slice_size]))
+    x0 = lbann.Sum([x_slice,
+                    lbann.WeightsLayer(weights=x0_weights,
+                                       dims=str(slice_size))])
+    x1 = lbann.Sum([x_slice,
+                    lbann.WeightsLayer(weights=x1_weights,
+                                       dims=str(slice_size))])
+    x0_lbann = x0
+    x1_lbann = x1
+
+    # Objects for LBANN model
+    obj = []
+    metrics = []
+    callbacks = []
+
+    # ------------------------------------------
+    # Data-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    x0 = x0_lbann
+    x1 = x1_lbann
+    y = lbann.MeanAbsoluteError([x0, x1], data_layout='data_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='data-parallel layout'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).astype(np.float64)
+        x0 = x[:slice_size]
+        x1 = x[slice_size:]
+        y = np.linalg.norm(x1-x0, 1) / slice_size
+        z = tools.numpy_l2norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Model-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    x0 = x0_lbann
+    x1 = x1_lbann
+    y = lbann.MeanAbsoluteError([x0, x1], data_layout='model_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='model-parallel layout'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).astype(np.float64)
+        x0 = x[:slice_size]
+        x1 = x[slice_size:]
+        y = np.linalg.norm(x1-x0, 1) / slice_size
+        z = tools.numpy_l2norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Gradient checking
+    # ------------------------------------------
+
+    callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True))
+
+    # ------------------------------------------
+    # Construct model
+    # ------------------------------------------
+
+    mini_batch_size = num_samples() // 2
+    num_epochs = 0
+    return lbann.Model(mini_batch_size,
+                       num_epochs,
+                       layers=lbann.traverse_layer_graph(x0_lbann),
+                       objective_function=obj,
+                       metrics=metrics,
+                       callbacks=callbacks)
+
+def construct_data_reader(lbann):
+    """Construct Protobuf message for Python data reader.
+ + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_mean_absolute_error(cluster, executables, dir_name, - compiler_name, weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_mean_absolute_error: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_mean_absolute_error_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_mean_absolute_error_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='mean_absolute_error', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_mean_absolute_error_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_mean_absolute_error_gcc7(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'gcc7', - weekly, data_reader_percent) - - -def test_unit_layer_mean_absolute_error_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_mean_absolute_error_exe' --exe= -def test_unit_layer_mean_absolute_error_exe(cluster, dirname, exe, - weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_mean_absolute_error_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_mean_absolute_error(cluster, exes, dirname, 'exe', - weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_mean_squared_error.py b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py index 6fe16a7a8f3..343658538c6 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_squared_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py @@ -5,7 +5,7 @@ import sys import numpy as np -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, 
os.path.join(os.path.dirname(current_dir), 'common_python'))
@@ -18,15 +18,8 @@
 # the functions below to ingest data.
 
 # Data
-# Note: The error bounds for gradient checking assume that the fourth
-# derivative of the objective function is ~1. However, given our loss
-# function:
-#   L = ( -xhat * log(x) )^2
-#   L'''' = O( xhat^2 * log(x) / x^4 )
-# We have x >= 0.25 to make sure the fourth derivative does not get
-# too big and mess up the error bounds.
-np.random.seed(201910144)
-_samples = np.random.normal(size=(13,2,9)).astype(np.float32)
+np.random.seed(201910249)
+_samples = np.random.normal(size=(27,2,13)).astype(np.float32)
 
 # Sample access functions
 def get_sample(index):
@@ -61,15 +54,6 @@ def construct_model(lbann):
 
     """
 
-    # Convenience function to convert list to a space-separated string
-    def str_list(it):
-        return ' '.join([str(i) for i in it])
-
-    # Convenience function to compute L2 norm squared with NumPy
-    def l2_norm2(x):
-        x = x.reshape(-1)
-        return np.inner(x, x)
-
     # Input data
     # Note: Sum with weights layers so that gradient checking will
     # verify that error signals are correct.
@@ -81,7 +65,7 @@ def l2_norm2(x):
                                initializer=lbann.ConstantInitializer(value=0.0),
                                name='input1_weights')
     x_slice = lbann.Slice(lbann.Input(),
-                          slice_points=str_list([0, slice_size, 2*slice_size]))
+                          slice_points=tools.str_list([0, slice_size, 2*slice_size]))
     x0 = lbann.Sum([x_slice,
                     lbann.WeightsLayer(weights=x0_weights,
                                        dims=str(slice_size))])
@@ -106,16 +90,16 @@ def l2_norm2(x):
     y = lbann.MeanSquaredError([x0, x1], data_layout='data_parallel')
     z = lbann.L2Norm2(y)
     obj.append(z)
-    metrics.append(lbann.Metric(z, name='data-parallel output'))
+    metrics.append(lbann.Metric(z, name='data-parallel layout'))
 
     # NumPy implementation
     vals = []
     for i in range(num_samples()):
-        x = get_sample(i)
+        x = get_sample(i).astype(np.float64)
         x0 = x[:slice_size]
         x1 = x[slice_size:]
-        y = l2_norm2(x0-x1) / slice_size
-        z = l2_norm2(y)
+        y = tools.numpy_l2norm2(x1-x0) / slice_size
+        z = tools.numpy_l2norm2(y)
         vals.append(z)
     val = np.mean(vals)
     tol = 8 * val * np.finfo(np.float32).eps
@@ -136,16 +120,16 @@ def l2_norm2(x):
     y = lbann.MeanSquaredError([x0, x1], data_layout='model_parallel')
     z = lbann.L2Norm2(y)
     obj.append(z)
-    metrics.append(lbann.Metric(z, name='model-parallel output'))
+    metrics.append(lbann.Metric(z, name='model-parallel layout'))
 
     # NumPy implementation
     vals = []
     for i in range(num_samples()):
-        x = get_sample(i)
+        x = get_sample(i).astype(np.float64)
         x0 = x[:slice_size]
         x1 = x[slice_size:]
-        y = l2_norm2(x0-x1) / slice_size
-        z = l2_norm2(y)
+        y = tools.numpy_l2norm2(x1-x0) / slice_size
+        z = tools.numpy_l2norm2(y)
         vals.append(z)
     val = np.mean(vals)
     tol = 8 * val * np.finfo(np.float32).eps
@@ -216,7 +200,5 @@ def construct_data_reader(lbann):
 # ==============================================
 
 # Create test functions that can interact with PyTest
-# Note: Create test name by removing ".py" from file name
-_test_name = os.path.splitext(os.path.basename(current_file))[0]
-for test in tools.create_tests(setup_experiment, _test_name):
+for test in tools.create_tests(setup_experiment, __file__):
     globals()[test.__name__] = test
diff --git a/bamboo/unit_tests/test_unit_layer_one_hot.py b/bamboo/unit_tests/test_unit_layer_one_hot.py
index a435dfc8e22..2fe1e459475 100644
--- a/bamboo/unit_tests/test_unit_layer_one_hot.py
+++ b/bamboo/unit_tests/test_unit_layer_one_hot.py
@@ -5,7 +5,7 @@ import sys
 import numpy as np
 
-# Local files
+# Bamboo utilities
 current_file = os.path.realpath(__file__)
current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index ca4cfa92b0d..897642d454f 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -1,49 +1,193 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: ReLU is not differentiable at 0, so we make sure values +# are away from 0. +np.random.seed(2019102410) +_num_samples = 23 +_sample_size = 41 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
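+    # Since the weights below are zero-initialized, the sum leaves the
+    # forward-prop values unchanged; its only role is to put a
+    # trainable tensor upstream of the layer under test so that
+    # backprop error signals cannot silently vanish.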
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Relu(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.maximum(x, 0.0) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Relu(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.maximum(x, 0.0) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
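+    # tools.create_python_data_reader presumably packages this file's
+    # module path and the names of the three sample-access functions
+    # into one Reader message per execution mode; see
+    # bamboo/common_python/tools.py for the exact fields it sets.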
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_relu(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_relu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_relu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_relu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='relu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_relu_clang6(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_relu(cluster, exes, dirname, 'clang6', weekly, data_reader_percent) - - -def test_unit_layer_relu_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_relu(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) - - -def test_unit_layer_relu_intel19(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_relu(cluster, exes, dirname, 'intel19', weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_layer_relu.py -k 'test_unit_layer_relu_exe' --exe= -def test_unit_layer_relu_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_relu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_relu(cluster, exes, dirname, 'exe', weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index 778628e5ee9..d46ce681223 100644 --- a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -1,49 +1,209 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +# Note: SELU is not differentiable at 0, so we make sure values +# are away from 0. 
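+# (With SELU(x) = scale*x for x > 0 and scale*alpha*(exp(x)-1)
+# otherwise, the one-sided derivatives at 0 are scale ~ 1.0507 and
+# scale*alpha ~ 1.7581, so finite differences near 0 would straddle
+# the kink and spoil the gradient check.)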
+np.random.seed(2019102411) +_num_samples = 20 +_sample_size = 5 +_samples = np.random.choice([-1.0, 1.0], size=(_num_samples,_sample_size)) +_samples += np.random.uniform(-0.5,0.5, size=_samples.shape) +_samples = _samples.astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# NumPy SELU +# ============================================== + +def numpy_selu(x): + """NumPy implementation of SELU activation. + + The computation is performed with 64-bit floats. + + """ + if x.dtype is not np.float64: + x = x.astype(np.float64) + alpha = 1.6732632423543772848170429916717 + scale = 1.0507009873554804934193349852946 + return scale * np.where(x < 0, alpha * np.expm1(x), x) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Selu(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_selu(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Selu(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = numpy_selu(x) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # 
------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_selu(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_selu: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_selu_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_selu_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='selu', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_selu_clang6(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_selu(cluster, exes, dirname, 'clang6', weekly, data_reader_percent) - - -def test_unit_layer_selu_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_selu(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) - - -def test_unit_layer_selu_intel19(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_selu(cluster, exes, dirname, 'intel19', weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_layer_selu.py -k 'test_unit_layer_selu_exe' --exe= -def test_unit_layer_selu_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_selu_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_selu(cluster, exes, dirname, 'exe', weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index f2cdbc3fafa..bb4a946a2f1 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -1,52 +1,195 @@ +import functools +import operator +import os +import os.path 
 import sys
-sys.path.insert(0, '../common_python')
+import numpy as np
+
+# Bamboo utilities
+current_file = os.path.realpath(__file__)
+current_dir = os.path.dirname(current_file)
+sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python'))
 import tools
-import pytest
-import os
 
+# ==============================================
+# Objects for Python data reader
+# ==============================================
+# Note: The Python data reader imports this file as a module and calls
+# the functions below to ingest data.
+
+# Data
+# Note: The sigmoid is smooth everywhere, so the samples do not need
+# to be kept away from any non-differentiable points.
+np.random.seed(2019102412)
+_num_samples = 23
+_sample_size = 17
+_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32)
+
+# Sample access functions
+def get_sample(index):
+    return _samples[index,:]
+def num_samples():
+    return _num_samples
+def sample_dims():
+    return (_sample_size,)
+
+# ==============================================
+# Setup LBANN experiment
+# ==============================================
+
+def setup_experiment(lbann):
+    """Construct LBANN experiment.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+    trainer = lbann.Trainer()
+    model = construct_model(lbann)
+    data_reader = construct_data_reader(lbann)
+    optimizer = lbann.NoOptimizer()
+    return trainer, model, data_reader, optimizer
+
+def construct_model(lbann):
+    """Construct LBANN model.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Input data
+    # Note: Sum with a weights layer so that gradient checking will
+    # verify that error signals are correct.
+    x_weights = lbann.Weights(optimizer=lbann.SGD(),
+                              initializer=lbann.ConstantInitializer(value=0.0))
+    x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size))
+    x1 = lbann.Identity(lbann.Input())
+    x = lbann.Sum([x0, x1])
+    x_lbann = x
+
+    # Objects for LBANN model
+    obj = []
+    metrics = []
+    callbacks = []
+
+    # ------------------------------------------
+    # Data-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    x = x_lbann
+    y = lbann.Sigmoid(x, data_layout='data_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='data-parallel layout'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).astype(np.float64)
+        y = np.where(x >= 0,
+                     1 / (1 + np.exp(-x)),
+                     np.exp(x) / (1 + np.exp(x)))
+        z = tools.numpy_l2norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Model-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    x = x_lbann
+    y = lbann.Sigmoid(x, data_layout='model_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='model-parallel layout'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).astype(np.float64)
+        y = np.where(x >= 0,
+                     1 / (1 + np.exp(-x)),
+                     np.exp(x) / (1 + np.exp(x)))
+        z = tools.numpy_l2norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+
execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_sigmoid(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_sigmoid: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_sigmoid_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_sigmoid_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='sigmoid', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_sigmoid_clang6(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_sigmoid(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_sigmoid_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_sigmoid(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) - - -def test_unit_layer_sigmoid_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_sigmoid(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_layer_sigmoid.py -k 'test_unit_layer_sigmoid_exe' --exe= -def test_unit_layer_sigmoid_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_sigmoid_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_sigmoid(cluster, exes, dirname, 'exe', weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git 
a/bamboo/unit_tests/test_unit_layer_slice.py b/bamboo/unit_tests/test_unit_layer_slice.py index 54e2fc95a4c..b21cbbdbd58 100644 --- a/bamboo/unit_tests/test_unit_layer_slice.py +++ b/bamboo/unit_tests/test_unit_layer_slice.py @@ -6,7 +6,7 @@ import numpy as np import pytest -# Local files +# Bamboo utilities current_file = os.path.realpath(__file__) current_dir = os.path.dirname(current_file) sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) @@ -58,15 +58,6 @@ def construct_model(lbann): """ - # Convenience function to convert list to a space-separated string - def str_list(it): - return ' '.join([str(i) for i in it]) - - # Convenience function to compute L2 norm squared with NumPy - def l2_norm2(x): - x = x.reshape(-1) - return np.inner(x, x) - # LBANN objects obj = [] metrics = [] @@ -80,8 +71,8 @@ def l2_norm2(x): w = lbann.Weights(optimizer=lbann.SGD(), initializer=lbann.ConstantInitializer(value=0.0)) x0 = lbann.WeightsLayer(weights=w, - dims=str_list(_sample_dims)) - x1 = lbann.Reshape(lbann.Input(), dims=str_list(_sample_dims)) + dims=tools.str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) x_lbann = lbann.Sum([x0, x1]) # -------------------------- @@ -91,7 +82,7 @@ def l2_norm2(x): # LBANN implementation slice_points = (2, 3, 6, 7) x = x_lbann - x_slice = lbann.Slice(x, axis=0, slice_points=str_list(slice_points)) + x_slice = lbann.Slice(x, axis=0, slice_points=tools.str_list(slice_points)) y = [] for _ in range(len(slice_points)-1): y.append(lbann.L2Norm2(x_slice)) @@ -102,11 +93,11 @@ def l2_norm2(x): # NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i).reshape(_sample_dims) + x = get_sample(i).reshape(_sample_dims).astype(np.float64) y = [] for j in range(len(slice_points)-1): x_slice = x[slice_points[j]:slice_points[j+1],:,:] - y.append(l2_norm2(x_slice)) + y.append(tools.numpy_l2norm2(x_slice)) z = sum(y) vals.append(z) val = np.mean(vals) @@ -125,7 +116,7 @@ def l2_norm2(x): # LBANN implementation slice_points = (0, 2, 3, 4) x = x_lbann - x_slice = lbann.Slice(x, axis=1, slice_points=str_list(slice_points)) + x_slice = lbann.Slice(x, axis=1, slice_points=tools.str_list(slice_points)) y = [] for _ in range(len(slice_points)-1): y.append(lbann.L2Norm2(x_slice)) @@ -136,11 +127,11 @@ def l2_norm2(x): # NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i).reshape(_sample_dims) + x = get_sample(i).reshape(_sample_dims).astype(np.float64) y = [] for j in range(len(slice_points)-1): x_slice = x[:,slice_points[j]:slice_points[j+1],:] - y.append(l2_norm2(x_slice)) + y.append(tools.numpy_l2norm2(x_slice)) z = sum(y) vals.append(z) val = np.mean(vals) @@ -159,7 +150,7 @@ def l2_norm2(x): # LBANN implementation slice_points = (0, 1, 2, 3) x = x_lbann - x_slice = lbann.Slice(x, axis=2, slice_points=str_list(slice_points)) + x_slice = lbann.Slice(x, axis=2, slice_points=tools.str_list(slice_points)) y = [] for _ in range(len(slice_points)-1): y.append(lbann.L2Norm2(x_slice)) @@ -170,11 +161,11 @@ def l2_norm2(x): # NumPy implementation vals = [] for i in range(num_samples()): - x = get_sample(i).reshape(_sample_dims) + x = get_sample(i).reshape(_sample_dims).astype(np.float64) y = [] for j in range(len(slice_points)-1): x_slice = x[:,:,slice_points[j]:slice_points[j+1]] - y.append(l2_norm2(x_slice)) + y.append(tools.numpy_l2norm2(x_slice)) z = sum(y) vals.append(z) val = np.mean(vals) diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py 
b/bamboo/unit_tests/test_unit_layer_softmax.py
index 0d494f42e62..2654cbba768 100644
--- a/bamboo/unit_tests/test_unit_layer_softmax.py
+++ b/bamboo/unit_tests/test_unit_layer_softmax.py
@@ -5,7 +5,7 @@ import sys
 import numpy as np
 
-# Local files
+# Bamboo utilities
 current_file = os.path.realpath(__file__)
 current_dir = os.path.dirname(current_file)
 sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python'))
@@ -38,11 +38,12 @@ def sample_dims():
 
 def numpy_softmax(x):
     """NumPy implementation of softmax.
 
-    There is also an implementation in SciPy 1.2.0
-    (scipy.special.softmax).
+    The computation is performed with 64-bit floats. There is also an
+    implementation of softmax in SciPy 1.2.0 (scipy.special.softmax).
 
     """
-    x = x.astype(np.float64)
+    if x.dtype != np.float64:
+        x = x.astype(np.float64)
     y = np.exp(x - np.max(x))
     return y / np.sum(y)
 
@@ -71,15 +72,6 @@ def construct_model(lbann):
 
     """
 
-    # Convenience function to convert list to a space-separated string
-    def str_list(it):
-        return ' '.join([str(i) for i in it])
-
-    # Convenience function to compute L2 norm squared with NumPy
-    def l2_norm2(x):
-        x = x.reshape(-1).astype(np.float64)
-        return np.inner(x, x)
-
     # Input data
     # Note: Sum with a weights layer so that gradient checking will
     # verify that error signals are correct.
@@ -104,14 +96,14 @@ def l2_norm2(x):
     y = lbann.Softmax(x, data_layout='data_parallel')
     z = lbann.L2Norm2(y)
     obj.append(z)
-    metrics.append(lbann.Metric(z, name='data-parallel output'))
+    metrics.append(lbann.Metric(z, name='data-parallel layout'))
 
     # NumPy implementation
     vals = []
     for i in range(num_samples()):
-        x = get_sample(i)
+        x = get_sample(i).astype(np.float64)
         y = numpy_softmax(x)
-        z = l2_norm2(y)
+        z = tools.numpy_l2norm2(y)
         vals.append(z)
     val = np.mean(vals)
     tol = 8 * val * np.finfo(np.float32).eps
@@ -131,14 +123,14 @@ def l2_norm2(x):
     y = lbann.Softmax(x, data_layout='model_parallel')
     z = lbann.L2Norm2(y)
     obj.append(z)
-    metrics.append(lbann.Metric(z, name='model-parallel output'))
+    metrics.append(lbann.Metric(z, name='model-parallel layout'))
 
     # NumPy implementation
     vals = []
     for i in range(num_samples()):
-        x = get_sample(i)
+        x = get_sample(i).astype(np.float64)
         y = numpy_softmax(x)
-        z = l2_norm2(y)
+        z = tools.numpy_l2norm2(y)
         vals.append(z)
     val = np.mean(vals)
     tol = 8 * val * np.finfo(np.float32).eps
diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py
index c9e1ef426ea..f0e52881db1 100644
--- a/bamboo/unit_tests/test_unit_layer_softplus.py
+++ b/bamboo/unit_tests/test_unit_layer_softplus.py
@@ -1,52 +1,191 @@
+import functools
+import operator
+import os
+import os.path
 import sys
-sys.path.insert(0, '../common_python')
+import numpy as np
+
+# Bamboo utilities
+current_file = os.path.realpath(__file__)
+current_dir = os.path.dirname(current_file)
+sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python'))
 import tools
-import pytest
-import os
+# ==============================================
+# Objects for Python data reader
+# ==============================================
+# Note: The Python data reader imports this file as a module and calls
+# the functions below to ingest data.
+
+# Data
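+# The softplus reference in construct_model computes np.log1p(np.exp(x)),
+# which is accurate for the standard-normal samples generated below but
+# overflows in np.exp for large positive inputs. An overflow-safe form,
+# sketched here for reference only (not used by this test):
+def _stable_softplus(x):
+    # softplus(x) = max(x, 0) + log1p(exp(-|x|))
+    return np.maximum(x, 0.0) + np.log1p(np.exp(-np.abs(x)))
+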
+np.random.seed(2019102413) +_num_samples = 11 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softplus(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.log1p(np.exp(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softplus(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.log1p(np.exp(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. 
+ + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_softplus(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_softplus: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_softplus_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_softplus_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='softplus', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_softplus_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_softplus(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_softplus_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_softplus(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) - - -def test_unit_layer_softplus_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_softplus(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_layer_softplus.py -k 'test_unit_layer_softplus_exe' --exe= -def test_unit_layer_softplus_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_softplus_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_softplus(cluster, exes, dirname, 'exe', weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py index e47b10ae649..2f20c51364d 100644 --- a/bamboo/unit_tests/test_unit_layer_softsign.py +++ b/bamboo/unit_tests/test_unit_layer_softsign.py @@ -1,57 +1,189 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python 
data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(2019102414) +_num_samples = 11 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softsign(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x / (1 + np.abs(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Softsign(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = x / (1 + np.abs(x)) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_softsign(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_softsign: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_softsign_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_softsign_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='softsign', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_softsign_clang6(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_softsign(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_softsign_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_softsign(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) - - -def test_unit_layer_softsign_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_softsign(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -def test_unit_layer_softsign_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_softsign(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_layer_softsign.py -k 'test_unit_layer_softsign_exe' --exe= -def test_unit_layer_softsign_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_softsign_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_softsign(cluster, exes, dirname, 'exe', weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py b/bamboo/unit_tests/test_unit_layer_squared_difference.py index 9a219f5463a..885cc9a8c8b 100644 --- a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -1,56 +1,204 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = 
os.path.realpath(__file__)
+current_dir = os.path.dirname(current_file)
+sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python'))
 import tools
-import pytest
-import os
+# ==============================================
+# Objects for Python data reader
+# ==============================================
+# Note: The Python data reader imports this file as a module and calls
+# the functions below to ingest data.
+
+# Data
+np.random.seed(2019102415)
+_samples = np.random.normal(size=(23,2,7)).astype(np.float32)
+
+# Sample access functions
+def get_sample(index):
+    return _samples[index].reshape(-1)
+def num_samples():
+    return _samples.shape[0]
+def sample_dims():
+    return (2*_samples.shape[-1],)
+
+# ==============================================
+# Setup LBANN experiment
+# ==============================================
+
+def setup_experiment(lbann):
+    """Construct LBANN experiment.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+    trainer = lbann.Trainer()
+    model = construct_model(lbann)
+    data_reader = construct_data_reader(lbann)
+    optimizer = lbann.NoOptimizer()
+    return trainer, model, data_reader, optimizer
+
+def construct_model(lbann):
+    """Construct LBANN model.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Input data
+    # Note: Sum with weights layers so that gradient checking will
+    # verify that error signals are correct.
+    slice_size = _samples.shape[-1]
+    x0_weights = lbann.Weights(optimizer=lbann.SGD(),
+                               initializer=lbann.ConstantInitializer(value=0.0),
+                               name='input0_weights')
+    x1_weights = lbann.Weights(optimizer=lbann.SGD(),
+                               initializer=lbann.ConstantInitializer(value=0.0),
+                               name='input1_weights')
+    x_slice = lbann.Slice(lbann.Input(),
+                          slice_points=tools.str_list([0, slice_size, 2*slice_size]))
+    x0 = lbann.Sum([x_slice,
+                    lbann.WeightsLayer(weights=x0_weights,
+                                       dims=str(slice_size))])
+    x1 = lbann.Sum([x_slice,
+                    lbann.WeightsLayer(weights=x1_weights,
+                                       dims=str(slice_size))])
+    x0_lbann = x0
+    x1_lbann = x1
+
+    # Objects for LBANN model
+    obj = []
+    metrics = []
+    callbacks = []
+
+    # ------------------------------------------
+    # Data-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    x0 = x0_lbann
+    x1 = x1_lbann
+    y = lbann.SquaredDifference([x0, x1], data_layout='data_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='data-parallel layout'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).astype(np.float64)
+        x0 = x[:slice_size]
+        x1 = x[slice_size:]
+        y = (x1-x0)**2
+        z = tools.numpy_l2norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Model-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    x0 = x0_lbann
+    x1 = x1_lbann
+    y = lbann.SquaredDifference([x0, x1], data_layout='model_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='model-parallel layout'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).astype(np.float64)
+        x0 = x[:slice_size]
+        x1 = x[slice_size:]
+        y = (x1-x0)**2
+        z = tools.numpy_l2norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * 
np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_squared_difference(cluster, executables, dir_name, - compiler_name, weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_squared_difference: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_squared_difference_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_squared_difference_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='squared_difference', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_squared_difference_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_squared_difference(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_squared_difference_gcc7(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_squared_difference(cluster, exes, dirname, 'gcc7', - weekly, data_reader_percent) - - -def test_unit_layer_squared_difference_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_squared_difference(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_layer_squared_difference.py -k 'test_unit_layer_squared_difference_exe' --exe= -def test_unit_layer_squared_difference_exe(cluster, dirname, exe, - weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_squared_difference_exe: Non-local testing' - 
print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_squared_difference(cluster, exes, dirname, 'exe', - weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index f53292a3baf..86e7b81cc5c 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -1,54 +1,198 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') -import tools +import numpy as np import pytest -import os +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(2019102416) +_num_samples = 29 +_sample_dims = (3,1,4) +_sample_size = functools.reduce(operator.mul, _sample_dims) +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. 
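+    # The zero-valued weights layer contributes nothing to the forward
+    # pass, but it gives lbann.CallbackCheckGradients trainable parameters
+    # whose error signals must propagate back through the layers under test.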
+ x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x0 = lbann.WeightsLayer(weights=x_weights, dims=tools.str_list(_sample_dims)) + x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout + # ------------------------------------------ + + # LBANN implementation + output_dims = (7,4,3) + x = x_lbann + y = lbann.Tessellate(x, + dims=tools.str_list(output_dims), + data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = np.tile(x, (3,4,1))[:7,:4,:3] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout + # ------------------------------------------ + + # LBANN implementation + output_dims = (2,1,9) + x = x_lbann + y = lbann.Tessellate(x, + dims=tools.str_list(output_dims), + data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).reshape(_sample_dims).astype(np.float64) + y = np.tile(x, (1,1,3))[:2,:1,:9] + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # -------------------------- + # Gradient checking + # -------------------------- + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # -------------------------- + # Construct model + # -------------------------- + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
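+    # Both readers point at the same get_sample/num_samples/sample_dims
+    # functions defined at module scope, so the 'train' and 'test' modes
+    # ingest identical data; with num_epochs=0 only 'test' is evaluated.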
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_tessellate(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_tessellate: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_tessellate_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_tessellate_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='tessellate', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_tessellate_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_tessellate(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_tessellate_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_tessellate(cluster, exes, dirname, 'gcc7', - weekly, data_reader_percent) - - -def test_unit_layer_tessellate_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_tessellate(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_layer_tessellate.py -k 'test_unit_layer_tessellate_exe' --exe= -def test_unit_layer_tessellate_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_tessellate_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_tessellate(cluster, exes, dirname, 'exe', - weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index 5968a39f585..aac27260a37 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -1,53 +1,243 @@ +import functools +import operator +import os +import os.path import sys -sys.path.insert(0, '../common_python') +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) import tools -import pytest -import os +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
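+
+# The NumPy references in construct_model use np.cov on 1-D samples, which
+# reduces to the scalar sample variance. A quick check of that equivalence
+# (a sketch; _v is a throwaway array, not used by the test below):
+_v = np.arange(5, dtype=np.float64)
+assert np.isclose(np.cov(_v, bias=False), _v.var(ddof=1))
+assert np.isclose(np.cov(_v, bias=True), _v.var(ddof=0))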
+ +# Data +np.random.seed(2019102417) +_num_samples = 11 +_sample_size = 7 +_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index,:] +def num_samples(): + return _num_samples +def sample_dims(): + return (_sample_size,) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0)) + x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) + x1 = lbann.Identity(lbann.Input()) + x = lbann.Sum([x0, x1]) + x_lbann = x + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # Data-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=False) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, unbiased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=False) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Data-parallel layout, biased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, biased=True, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='data-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=True) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + 
lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Model-parallel layout, biased + # ------------------------------------------ + + # LBANN implementation + x = x_lbann + y = lbann.Variance(x, biased=True, data_layout='model_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='model-parallel layout, biased')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + y = np.cov(x, bias=True) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. 
+ message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== -def skeleton_layer_variance(cluster, executables, dir_name, compiler_name, - weekly, data_reader_percent): - if compiler_name not in executables: - e = 'skeleton_layer_variance: default_exes[%s] does not exist' % compiler_name - print('Skip - ' + e) - pytest.skip(e) - output_file_name = '%s/bamboo/unit_tests/output/layer_variance_%s_output.txt' % (dir_name, compiler_name) - error_file_name = '%s/bamboo/unit_tests/error/layer_variance_%s_error.txt' % (dir_name, compiler_name) - command = tools.get_command( - cluster=cluster, executable=executables[compiler_name], - num_nodes=1, - time_limit=10, - num_processes=2, dir_name=dir_name, - data_reader_name='synthetic', - data_reader_percent=data_reader_percent, - model_folder='tests/layer_tests', model_name='variance', - optimizer_name='sgd', - output_file_name=output_file_name, error_file_name=error_file_name, weekly=weekly) - return_code = os.system(command) - tools.assert_success(return_code, error_file_name) - - -def test_unit_layer_variance_clang6(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_variance(cluster, exes, dirname, 'clang6', - weekly, data_reader_percent) - - -def test_unit_layer_variance_gcc7(cluster, exes, dirname, weekly, data_reader_percent): - skeleton_layer_variance(cluster, exes, dirname, 'gcc7', weekly, data_reader_percent) - - -def test_unit_layer_variance_intel19(cluster, exes, dirname, - weekly, data_reader_percent): - skeleton_layer_variance(cluster, exes, dirname, 'intel19', - weekly, data_reader_percent) - - -# Run with python3 -m pytest -s test_unit_ridge_regression.py -k 'test_unit_layer_variance_exe' --exe= -def test_unit_layer_variance_exe(cluster, dirname, exe, weekly, data_reader_percent): - if exe is None: - e = 'test_unit_layer_variance_exe: Non-local testing' - print('Skip - ' + e) - pytest.skip(e) - exes = {'exe': exe} - skeleton_layer_variance(cluster, exes, dirname, 'exe', weekly, data_reader_percent) +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/model_zoo/tests/layer_tests/model_channelwise_mean.prototext b/model_zoo/tests/layer_tests/model_channelwise_mean.prototext deleted file mode 100644 index fe8e1de1f1e..00000000000 --- a/model_zoo/tests/layer_tests/model_channelwise_mean.prototext +++ /dev/null @@ -1,93 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 2 - lower_bound: 1.999 - upper_bound: 2.001 - error_on_failure: true - 
execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "2 3 2" - } - data_layout: "data_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "1.2 1 0.8 3.3 -0.2 -0.1 -0.9 -1.1 -2 -1.3 0.3 -1" - } - } - } - - # Variations of channel-wise mean layer - layer { - parents: "x" - name: "channelwise_mean_data_parallel" - channelwise_mean {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "channelwise_mean_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_clamp.prototext b/model_zoo/tests/layer_tests/model_clamp.prototext deleted file mode 100644 index 5d6aaf40fb1..00000000000 --- a/model_zoo/tests/layer_tests/model_clamp.prototext +++ /dev/null @@ -1,123 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 25.25 - lower_bound: 25.24 - upper_bound: 25.26 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-2 -0.25 0.25 0.5 2" - } - } - } - - # Variations of clamp layer - layer { - parents: "x" - name: "clamp_0_1_data_parallel" - clamp { - min: 0 - max: 1 - } - data_layout: "data_parallel" - } - layer { - parents: "x" - name: "clamp_0_1_model_parallel" - clamp { - min: 0 - max: 1 - } - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "clamp_neg1_1_data_parallel" - clamp { - min: -1 - max: 1 - } - data_layout: "data_parallel" - } - layer { - parents: "x" - name: "clamp_neg1_1_model_parallel" - clamp { - min: -1 - max: 1 - } - data_layout: "model_parallel" - } - - # Combine into objective function - layer { - parents: "clamp_0_1_data_parallel clamp_0_1_model_parallel clamp_neg1_1_data_parallel clamp_neg1_1_model_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_covariance.prototext b/model_zoo/tests/layer_tests/model_covariance.prototext deleted file mode 100644 index 18076486724..00000000000 --- a/model_zoo/tests/layer_tests/model_covariance.prototext +++ /dev/null @@ -1,127 +0,0 @@ 
-trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 0.08365 - lower_bound: 0.08364 - upper_bound: 0.08366 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x0" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x0_vals" - } - weights { - name: "x0_vals" - initializer { - value_initializer { - values: "1 -0.5 0.25 -0.125 0.0675" - } - } - } - layer { - name: "x1" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x1_vals" - } - weights { - name: "x1_vals" - initializer { - value_initializer { - values: "0.1 0.2 0.4 0.8 1.6" - } - } - } - - # Variations of covariance layer - layer { - parents: "x0 x1" - name: "unbiased_covariance_model_parallel" - covariance { biased: false } - data_layout: "model_parallel" - } - layer { - parents: "x0 x1" - name: "biased_covariance_model_parallel" - covariance { biased: true } - data_layout: "model_parallel" - } - layer { - parents: "x0 x1" - name: "unbiased_covariance_data_parallel" - covariance { biased: false } - data_layout: "data_parallel" - } - layer { - parents: "x0 x1" - name: "biased_covariance_data_parallel" - covariance { biased: true } - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "unbiased_covariance_model_parallel biased_covariance_model_parallel unbiased_covariance_data_parallel biased_covariance_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_elu.prototext b/model_zoo/tests/layer_tests/model_elu.prototext deleted file mode 100644 index 88de9a6d908..00000000000 --- a/model_zoo/tests/layer_tests/model_elu.prototext +++ /dev/null @@ -1,115 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 28.17 - lower_bound: 28.16 - upper_bound: 28.18 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - 
################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-2 -0.25 0.25 0.5 1" - } - } - } - - # Variations of ELU layer - layer { - parents: "x" - name: "elu_alpha_default_data_parallel" - elu {} - data_layout: "data_parallel" - } - layer { - parents: "x" - name: "elu_alpha_default_model_parallel" - elu {} - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "elu_alpha_05_data_parallel" - elu { - alpha: 0.5 - } - data_layout: "data_parallel" - } - layer { - parents: "x" - name: "elu_alpha_05_model_parallel" - elu { - alpha: 0.5 - } - data_layout: "model_parallel" - } - - # Combine into objective function - layer { - parents: "elu_alpha_default_data_parallel elu_alpha_default_model_parallel elu_alpha_05_data_parallel elu_alpha_05_model_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_identity.prototext b/model_zoo/tests/layer_tests/model_identity.prototext deleted file mode 100644 index aa26c7c46ea..00000000000 --- a/model_zoo/tests/layer_tests/model_identity.prototext +++ /dev/null @@ -1,99 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 14.25 - lower_bound: 14.24 - upper_bound: 14.26 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-1.5 -0.25 0 0.5 1" - } - } - } - - # Variations of identity layer - layer { - parents: "x" - name: "identity_model_parallel" - identity {} - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "identity_data_parallel" - identity {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "identity_model_parallel identity_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_l1_norm.prototext b/model_zoo/tests/layer_tests/model_l1_norm.prototext deleted file mode 100644 index fd87729973d..00000000000 --- a/model_zoo/tests/layer_tests/model_l1_norm.prototext +++ /dev/null @@ -1,99 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - 
################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 16 - lower_bound: 15.99 - upper_bound: 16.01 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "1 -0.5 0.25 -0.125 0.125" - } - } - } - - # Variations of L1 norm layer - layer { - parents: "x" - name: "l1_norm_model_parallel" - l1_norm {} - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "l1_norm_data_parallel" - l1_norm {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "l1_norm_model_parallel l1_norm_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_l2_norm2.prototext b/model_zoo/tests/layer_tests/model_l2_norm2.prototext deleted file mode 100644 index e327e05846a..00000000000 --- a/model_zoo/tests/layer_tests/model_l2_norm2.prototext +++ /dev/null @@ -1,79 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 2.5 - lower_bound: 2.499 - upper_bound: 2.501 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "0 1 -0.5 0.5 -1" - } - } - } - - layer { - parents: "x" - name: "l2" - l2_norm2 {} - data_layout: "model_parallel" - } - -} diff --git a/model_zoo/tests/layer_tests/model_leaky_relu.prototext b/model_zoo/tests/layer_tests/model_leaky_relu.prototext deleted file mode 100644 index a5fccee5a42..00000000000 --- a/model_zoo/tests/layer_tests/model_leaky_relu.prototext +++ /dev/null @@ -1,115 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function 
and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 6.946 - lower_bound: 6.945 - upper_bound: 6.947 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-2 -1 -0.25 0.25 0.5" - } - } - } - - # Variations of L1 norm layer - layer { - parents: "x" - name: "leaky_relu_slope_default_data_parallel" - leaky_relu {} - data_layout: "data_parallel" - } - layer { - parents: "x" - name: "leaky_relu_slope_default_model_parallel" - leaky_relu {} - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "leaky_relu_slope_03_data_parallel" - leaky_relu { - negative_slope: 0.3 - } - data_layout: "data_parallel" - } - layer { - parents: "x" - name: "leaky_relu_slope_03_model_parallel" - leaky_relu { - negative_slope: 0.3 - } - data_layout: "model_parallel" - } - - # Combine into objective function - layer { - parents: "leaky_relu_slope_default_data_parallel leaky_relu_slope_default_model_parallel leaky_relu_slope_03_data_parallel leaky_relu_slope_03_model_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_log_sigmoid.prototext b/model_zoo/tests/layer_tests/model_log_sigmoid.prototext deleted file mode 100644 index af3b0526eb2..00000000000 --- a/model_zoo/tests/layer_tests/model_log_sigmoid.prototext +++ /dev/null @@ -1,99 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 12.51 - lower_bound: 12.50 - upper_bound: 12.52 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-1 -0.25 0 0.5 2" - } - } - } - - # Variations of log sigmoid layer - layer { - parents: "x" - 
name: "log_sigmoid_model_parallel" - log_sigmoid {} - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "log_sigmoid_data_parallel" - log_sigmoid {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "log_sigmoid_model_parallel log_sigmoid_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_log_softmax.prototext b/model_zoo/tests/layer_tests/model_log_softmax.prototext deleted file mode 100644 index 4645fe727d2..00000000000 --- a/model_zoo/tests/layer_tests/model_log_softmax.prototext +++ /dev/null @@ -1,99 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 275.4 - lower_bound: 275.3 - upper_bound: 275.5 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-4 -2 0 1 2" - } - } - } - - # Variations of log softmax layer - layer { - parents: "x" - name: "log_softmax_model_parallel" - log_softmax {} - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "log_softmax_data_parallel" - log_softmax {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "log_softmax_model_parallel log_softmax_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext b/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext deleted file mode 100644 index 26521501938..00000000000 --- a/model_zoo/tests/layer_tests/model_mean_absolute_error.prototext +++ /dev/null @@ -1,115 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 1 - lower_bound: 0.999 - upper_bound: 1.001 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - 
################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x0" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x0_vals" - } - weights { - name: "x0_vals" - initializer { - value_initializer { - values: "1 -0.5 0.25 -0.125 0.125" - } - } - } - layer { - name: "x1" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x1_vals" - } - weights { - name: "x1_vals" - initializer { - value_initializer { - values: "1.5 0 -1 -0.125 -0.125" - } - } - } - - # Variations of mean absolute error layer - layer { - parents: "x0 x1" - name: "mean_absolute_error_model_parallel" - mean_absolute_error {} - data_layout: "model_parallel" - } - layer { - parents: "x0 x1" - name: "mean_absolute_error_data_parallel" - mean_absolute_error {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "mean_absolute_error_model_parallel mean_absolute_error_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_relu.prototext b/model_zoo/tests/layer_tests/model_relu.prototext deleted file mode 100644 index edfb9ab5e89..00000000000 --- a/model_zoo/tests/layer_tests/model_relu.prototext +++ /dev/null @@ -1,99 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 5.25 - lower_bound: 5.249 - upper_bound: 5.251 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-1.5 -0.25 0.25 0.5 1" - } - } - } - - # Variations of ReLU layer - layer { - parents: "x" - name: "relu_model_parallel" - relu {} - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "relu_data_parallel" - relu {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "relu_model_parallel relu_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_selu.prototext b/model_zoo/tests/layer_tests/model_selu.prototext deleted file mode 100644 index 2b76d8f003b..00000000000 --- a/model_zoo/tests/layer_tests/model_selu.prototext +++ /dev/null @@ -1,99 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective 
function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 15.64 - lower_bound: 15.63 - upper_bound: 15.65 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-2 -0.25 0.25 0.5 1" - } - } - } - - # Variations of SELU layer - layer { - parents: "x" - name: "selu_model_parallel" - selu {} - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "selu_data_parallel" - selu {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "selu_model_parallel selu_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_sigmoid.prototext b/model_zoo/tests/layer_tests/model_sigmoid.prototext deleted file mode 100644 index 08f16f063ca..00000000000 --- a/model_zoo/tests/layer_tests/model_sigmoid.prototext +++ /dev/null @@ -1,99 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 7.317 - lower_bound: 7.316 - upper_bound: 7.318 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-200 -0.25 0 0.5 100" - } - } - } - - # Variations of sigmoid layer - layer { - parents: "x" - name: "sigmoid_model_parallel" - sigmoid {} - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "sigmoid_data_parallel" - sigmoid {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "sigmoid_model_parallel sigmoid_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_softplus.prototext 
b/model_zoo/tests/layer_tests/model_softplus.prototext deleted file mode 100644 index 7e5f31df652..00000000000 --- a/model_zoo/tests/layer_tests/model_softplus.prototext +++ /dev/null @@ -1,99 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 14.01 - lower_bound: 14.00 - upper_bound: 14.02 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-2 -0.25 0 0.5 1" - } - } - } - - # Variations of softplus layer - layer { - parents: "x" - name: "softplus_model_parallel" - softplus {} - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "softplus_data_parallel" - softplus {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "softplus_model_parallel softplus_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_softsign.prototext b/model_zoo/tests/layer_tests/model_softsign.prototext deleted file mode 100644 index 61979cb03a0..00000000000 --- a/model_zoo/tests/layer_tests/model_softsign.prototext +++ /dev/null @@ -1,99 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 8.486 - lower_bound: 8.485 - upper_bound: 8.487 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "-200 -0.25 0 0.5 100" - } - } - } - - # Variations of softsign layer - layer { - parents: "x" - name: "softsign_model_parallel" - softsign {} - data_layout: 
"model_parallel" - } - layer { - parents: "x" - name: "softsign_data_parallel" - softsign {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "softsign_model_parallel softsign_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_squared_difference.prototext b/model_zoo/tests/layer_tests/model_squared_difference.prototext deleted file mode 100644 index 73de72545d9..00000000000 --- a/model_zoo/tests/layer_tests/model_squared_difference.prototext +++ /dev/null @@ -1,114 +0,0 @@ -trainer { -} -model { - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 10.28 - lower_bound: 10.27 - upper_bound: 10.29 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x0" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x0_vals" - } - weights { - name: "x0_vals" - initializer { - value_initializer { - values: "1 -0.5 0.25 -0.125 0.125" - } - } - } - layer { - name: "x1" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x1_vals" - } - weights { - name: "x1_vals" - initializer { - value_initializer { - values: "1.5 0 -1 -0.125 -0.125" - } - } - } - - # Variations of mean absolute error layer - layer { - parents: "x0 x1" - name: "squared_difference_model_parallel" - squared_difference {} - data_layout: "model_parallel" - } - layer { - parents: "x0 x1" - name: "squared_difference_data_parallel" - squared_difference {} - data_layout: "data_parallel" - } - - # Combine into objective function - layer { - parents: "squared_difference_model_parallel squared_difference_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_tessellate.prototext b/model_zoo/tests/layer_tests/model_tessellate.prototext deleted file mode 100644 index e48fd2a5005..00000000000 --- a/model_zoo/tests/layer_tests/model_tessellate.prototext +++ /dev/null @@ -1,121 +0,0 @@ -trainer { -} -model { - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 131.5 - lower_bound: 131.4 - upper_bound: 131.6 - error_on_failure: true - execution_modes: 
"test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "1 3 1" - } - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "0.4 0.6 -0.5" - } - } - } - - # Variations of tessellate layer - layer { - parents: "x" - name: "tessellate_data_parallel" - tessellate { - dims: "2 4 3" - } - data_layout: "data_parallel" - } - layer { - parents: "x" - name: "tessellate_model_parallel" - tessellate { - dims: "2 4 3" - } - data_layout: "model_parallel" - } - - # Combine into objective function - layer { - parents: "tessellate_data_parallel tessellate_model_parallel" - name: "sum" - sum {} - } - layer { - name: "scales" - weights_layer {} - weights: "scales_vals" - hint_layer: "sum" - } - weights { - name: "scales_vals" - initializer { - value_initializer { - values: "1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5" - } - } - optimizer { no_optimizer {} } - } - layer { - parents: "sum scales" - name: "scaled_sum" - multiply {} - } - layer { - parents: "scaled_sum" - name: "l2" - l2_norm2 {} - } - -} diff --git a/model_zoo/tests/layer_tests/model_variance.prototext b/model_zoo/tests/layer_tests/model_variance.prototext deleted file mode 100644 index 71bbc8f948e..00000000000 --- a/model_zoo/tests/layer_tests/model_variance.prototext +++ /dev/null @@ -1,111 +0,0 @@ -trainer { -} -model { - data_layout: "data_parallel" - mini_batch_size: 11 - num_epochs: 0 - - ################################################### - # Objective function and metrics - ################################################### - - objective_function { - layer_term { layer: "l2" } - } - metric { - layer_metric { - layer: "l2" - name: "L2 norm" - } - } - - ################################################### - # Callbacks - ################################################### - - callback { print {} } - callback { timer {} } - callback { - check_metric { - metric: "L2 norm" # Expected value: 1.239 - lower_bound: 1.238 - upper_bound: 1.240 - error_on_failure: true - execution_modes: "test" - } - } - callback { - check_gradients { - execution_modes: "test" - verbose: false - error_on_failure: true - } - } - - ################################################### - # Layers - ################################################### - - layer { - name: "data" - data_layout: "data_parallel" - input {} - } - - # Input data - layer { - name: "x" - weights_layer { - dims: "5" - } - data_layout: "model_parallel" - weights: "x_vals" - } - weights { - name: "x_vals" - initializer { - value_initializer { - values: "1 -0.5 0.25 -0.125 0.0675" - } - } - } - - # Variations of variance layer - layer { - parents: "x" - name: "unbiased_variance_model_parallel" - variance { biased: false } - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "biased_variance_model_parallel" - variance { biased: true } - data_layout: "model_parallel" - } - layer { - parents: "x" - name: "unbiased_variance_data_parallel" - variance { biased: false } - data_layout: "data_parallel" - } - layer { - parents: "x" - name: "biased_variance_data_parallel" - variance { biased: true } - data_layout: "data_parallel" - } - - # Combine into objective function - layer 
{ - parents: "unbiased_variance_model_parallel biased_variance_model_parallel unbiased_variance_data_parallel biased_variance_data_parallel" - name: "sum" - sum {} - } - layer { - parents: "sum" - name: "l2" - l2_norm2 {} - } - -} From 925e1e55bcb109e0cb7360b889ba0e7d57174a5a Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Thu, 31 Oct 2019 11:47:38 -0700 Subject: [PATCH 376/634] Update CMake version to 3.14.5 --- scripts/build_lbann_lc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_lbann_lc.sh b/scripts/build_lbann_lc.sh index 1251513c04e..9a168ef765e 100755 --- a/scripts/build_lbann_lc.sh +++ b/scripts/build_lbann_lc.sh @@ -311,7 +311,7 @@ fi # Load packages if [ ${USE_MODULES} -ne 0 ]; then module load git - module load cmake/3.12.1 + module load cmake/3.14.5 else use git fi From aa24e8ffb71b972967d57df7ae20cdf33ceec857 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Thu, 31 Oct 2019 11:50:53 -0700 Subject: [PATCH 377/634] Update CUDA and related library versions CUDA 10.1, NCCL 2.4.8, and cuDNN 7.6.4 --- scripts/build_lbann_lc.sh | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/scripts/build_lbann_lc.sh b/scripts/build_lbann_lc.sh index 9a168ef765e..e18a18515aa 100755 --- a/scripts/build_lbann_lc.sh +++ b/scripts/build_lbann_lc.sh @@ -13,7 +13,6 @@ CORAL=$([[ $(hostname) =~ (sierra|lassen|ray) ]] && echo 1 || echo 0) COMPILER=gnu if [ "${CLUSTER}" == "surface" -o "${CLUSTER}" == "pascal" ]; then module load gcc/7.3.0 - module load opt cudatoolkit/9.2 elif [ "${CLUSTER}" == "sierra" -o "${CLUSTER}" == "lassen" ]; then module load gcc/7.3.1 fi @@ -566,11 +565,8 @@ if [ "${CLUSTER}" == "surface" -o "${CORAL}" -eq 1 -o "${CLUSTER}" == "pascal" ] WITH_ALUMINUM=${WITH_ALUMINUM:-ON} ALUMINUM_WITH_NCCL=${ALUMINUM_WITH_NCCL:-ON} if [[ ${CORAL} -eq 1 ]]; then - export NCCL_DIR=/usr/workspace/wsb/brain/nccl2/nccl_2.4.2-1+cuda9.2_ppc64le module del cuda - CUDA_TOOLKIT_MODULE=${CUDA_TOOLKIT_MODULE:-cuda/9.2.148} - else - export NCCL_DIR=/usr/workspace/wsb/brain/nccl2/nccl_2.4.2-1+cuda9.2_x86_64 + CUDA_TOOLKIT_MODULE=${CUDA_TOOLKIT_MODULE:-cuda/10.1.243} fi # Hack for surface @@ -580,8 +576,7 @@ if [ "${CLUSTER}" == "surface" -o "${CORAL}" -eq 1 -o "${CLUSTER}" == "pascal" ] CUDA_TOOLKIT_MODULE=cudatoolkit/9.2 ;; pascal) - module load opt - CUDA_TOOLKIT_MODULE=cudatoolkit/9.2 + CUDA_TOOLKIT_MODULE=${CUDA_TOOLKIT_MODULE:-cuda/10.1.168} ;; esac fi @@ -610,18 +605,26 @@ if [ "${WITH_CUDA}" == "ON" ]; then CUDA_TOOLKIT_VERSION=$(${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc --version | grep -oE "V[0-9]+\.[0-9]+" | sed 's/V//') # CUDNN - if [ -z "${CUDNN_DIR}" ]; then - if [ "${CUDA_TOOLKIT_VERSION}" == "9.2" ]; then - CUDNN_DIR=/usr/workspace/wsb/brain/cudnn/cudnn-7.5.1/cuda-${CUDA_TOOLKIT_VERSION}_${ARCH} - elif [ "${CUDA_TOOLKIT_VERSION}" == "9.1" ]; then - CUDNN_DIR=/usr/workspace/wsb/brain/cudnn/cudnn-7.1.3/cuda-${CUDA_TOOLKIT_VERSION}_${ARCH} - fi + if [[ -z $CUDNN_DIR ]]; then + CUDNN_VER=${CUDNN_VER:-7.6.4} + CUDNN_DIR=/usr/workspace/wsb/brain/cudnn/cudnn-${CUDNN_VER}/cuda-${CUDA_TOOLKIT_VERSION}_${ARCH} fi - if [ ! -d "${CUDNN_DIR}" ]; then + if [[ ! -d $CUDNN_DIR ]]; then echo "Could not find cuDNN at $CUDNN_DIR" exit 1 fi export CUDNN_DIR + + # NCCL + if [[ -z $NCCL_DIR ]]; then + NCCL_VER=${NCCL_VER:-2.4.8-1} + NCCL_DIR=/usr/workspace/wsb/brain/nccl2/nccl_${NCCL_VER}+cuda${CUDA_TOOLKIT_VERSION}_${ARCH} + fi + if [[ ! 
-d $NCCL_DIR ]]; then
+        echo "Could not find NCCL at $NCCL_DIR"
+        exit 1
+    fi
+    export NCCL_DIR
 else
     HAS_GPU=0
     WITH_CUDA=${WITH_CUDA:-OFF}

From ab7978d0fd568593e48b69f3f20df15b0ee8af77 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 31 Oct 2019 22:15:57 -0700
Subject: [PATCH 378/634] Change the default version of NCCL back to 2.4.2

---
 scripts/build_lbann_lc.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/build_lbann_lc.sh b/scripts/build_lbann_lc.sh
index e18a18515aa..21dc0147578 100755
--- a/scripts/build_lbann_lc.sh
+++ b/scripts/build_lbann_lc.sh
@@ -617,7 +617,9 @@ if [ "${WITH_CUDA}" == "ON" ]; then

     # NCCL
     if [[ -z $NCCL_DIR ]]; then
-        NCCL_VER=${NCCL_VER:-2.4.8-1}
+        # Subsequent 2.4.X versions are known to have a performance
+        # regression. See the release notes.
+        NCCL_VER=${NCCL_VER:-2.4.2-1}
         NCCL_DIR=/usr/workspace/wsb/brain/nccl2/nccl_${NCCL_VER}+cuda${CUDA_TOOLKIT_VERSION}_${ARCH}
     fi
     if [[ ! -d $NCCL_DIR ]]; then

From 545557e7526eed91ec3716fbddc47e0b8c9e488d Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Fri, 1 Nov 2019 14:22:23 -0700
Subject: [PATCH 379/634] Update docs to reflect the trainer refactor (#1334)

See PR #916.
---
 docs/running_lbann.rst | 56 ++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/docs/running_lbann.rst b/docs/running_lbann.rst
index cb6c7575327..b7e16d0e9a9 100644
--- a/docs/running_lbann.rst
+++ b/docs/running_lbann.rst
@@ -86,11 +86,6 @@ Comments:
 Model components
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. warning:: `A major refactor of core model infrastructure
-             `_ is
-             pending. This documentation will be updated once it is
-             merged and the interface stabilized.
-
 + Layer: A tensor operation, arranged within a directed acyclic graph.

   - During evaluation ("forward prop"), a layer receives input tensors
@@ -115,6 +110,10 @@ Model components
     is recommended to manually insert identity layers so that the
     parent/child relationships are absolutely unambiguous.

+  - See `lbann/src/proto/layers.proto
+    `_
+    for a full list of supported layers.
+
 + Weights: A tensor consisting of trainable parameters, typically
   associated with one or more layers. A weight tensor owns an
   initializer to initially populate its values and an optimizer to
@@ -197,16 +196,14 @@ reader's work is IO-bound or if the computation is largely on GPUs.
 identity layers as children of the input layer.

 Note that layers within a model treat the data for a mini-batch as a
-single tensor where the leading dimension is the mini-batch
-size. Thus, corresponding tensors in all data samples must have the
-same dimensions. The data dimensions must be known from the beginning
-of the experiment and can not change. However, real data is rarely so
-consistent and some preprocessing is typically required.
-
-.. warning:: `A major refactor of the preprocessing pipeline
-             `_ is
-             pending. This documentation will be updated once it is
-             merged and the interface stabilized.
+single tensor where the leading dimension is the mini-batch size.
+Thus, corresponding tensors in all data samples must have the same
+dimensions. The data dimensions must be known from the beginning of
+the experiment and can not change. However, real data is rarely so
+consistent and some preprocessing is typically required. See
+`lbann/src/proto/transforms.proto
+`_
+for a list of available preprocessing transforms.
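For a concrete picture of what this configuration looks like, the sketch below shows transform blocks of the kind a data reader prototext carries. The `horizontal_flip` and `scale` entries mirror the ImageNet and MNIST reader configurations introduced later in this patch series; the exact fields here should be read as illustrative rather than as a complete reader definition.

.. code-block:: protobuf

   # A reader entry may list several transforms; each sample is passed
   # through them in the order they appear.
   transforms {
     horizontal_flip {
       p: 0.5
     }
   }
   transforms {
     scale {
       scale: 0.003921568627  # 1/255
     }
   }

Any transform message defined in `transforms.proto` can be dropped into the same slot.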
------------------------------------------------
Python frontend
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Basic usage
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 A typical workflow involves the following steps:

-1. Configuring LBANN model components (like the graph of
+1. Configuring a :python:`Trainer`.
+
+2. Configuring LBANN model components (like the graph of
    :python:`Layer` s) and creating a :python:`Model`.

    + Classes for model components are automatically generated from the
-     LBANN Protobuf specification at `src/proto/lbann.proto
-     `_.
-     This file is currently the best source of documentation. Message
+     LBANN Protobuf specifications in `lbann/src/proto
+     `_. These
+     files are currently the best source of documentation. Message
      fields in the Protobuf specification are optional keyword
-     arguments for the corresponding Python class constructor.
+     arguments for the corresponding Python class constructor. If a
+     keyword argument is not provided, it is logically zero (e.g. false
+     for Boolean fields and empty for string fields)

-2. Configuring the default :python:`Optimizer` to be used by the
-   :python:`Weights` es.
+3. Configuring the default :python:`Optimizer` to be used by the
+   :python:`Weights` objects.

-3. Loading in a Protobuf text file describing the data reader.
+4. Loading in a Protobuf text file describing the data reader.

    + The Python frontend currently does not have good support for
      specifying data readers. If any data reader properties need to
      be set programmatically, the user must do it directly via the
      Protobuf Python API.

-4. Launching LBANN by calling :python:`run`.
+5. Launching LBANN by calling :python:`run`.

    + :python:`lbann.run` will detect whether the user is currently on a
      login node or a compute node. If on a login node, a batch job
@@ -345,6 +346,9 @@ A simple example
     # Setup experiment
     # ----------------------------------

+    # Setup trainer
+    trainer = lbann.Trainer()
+
     # Setup model
     mini_batch_size = 64
     num_epochs = 5
@@ -361,7 +365,7 @@ A simple example
     # Load data reader from prototext
     import google.protobuf.text_format as txtf
     data_reader_proto = lbann.lbann_pb2.LbannPB()
-    with open('path/to/lbann/model_zoo/data_readers/data_reader.prototext', 'r') as f:
+    with open('path/to/lbann/model_zoo/data_readers/data_reader_mnist.prototext', 'r') as f:
         txtf.Merge(f.read(), data_reader_proto)
     data_reader_proto = data_reader_proto.data_reader

@@ -369,7 +373,7 @@ A simple example
     # Run experiment
     # ----------------------------------

-    lbann.run(model, data_reader_proto, opt)
+    lbann.run(trainer, model, data_reader_proto, opt)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Useful submodules

From 99ca2679a93c5fcd5717553908519edffc594966 Mon Sep 17 00:00:00 2001
From: Tom Benson <30674819+benson31@users.noreply.github.com>
Date: Mon, 4 Nov 2019 08:54:47 -0800
Subject: [PATCH 380/634] Documentation updates (#1332)

Doxygen now installs with Sphinx for deployment on RTD.
LBANN vocabulary description is now included in the RTD.
Fixed a typo.
---
 docs/RSTDocsFlavorText.py       | 12 ++++++++----
 docs/SourceTreeDoxyfile         |  6 +++---
 docs/conf.py                    |  7 ++++++-
 docs/documentation_building.rst |  2 +-
 docs/index.rst                  |  3 +++
 5 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/docs/RSTDocsFlavorText.py b/docs/RSTDocsFlavorText.py
index 96555316b14..bfdc1caf249 100644
--- a/docs/RSTDocsFlavorText.py
+++ b/docs/RSTDocsFlavorText.py
@@ -37,10 +37,14 @@
 lbann_rst_flavor_text = {
     '.' : '''
-Welcome to the LBANN developers' documentation. The documentation is
-laid out following a similar structure to the source code to aid in
-navigation.
- ''', +The LBANN API documentation is almost entirely generated by `Doxygen +`_. We encourage developers to view the +`Doxygen-generated documentation +<../_static/doxygen/html/index.html>`_. The API documentation is largely +reproduced here (using `Breathe +`_) for those who prefer the +Sphinx/RTD style. It is laid out following a similar structure to the +source code to aid in navigation.''', 'callbacks' : ''' Callbacks give users information about their model as it is trained. diff --git a/docs/SourceTreeDoxyfile b/docs/SourceTreeDoxyfile index 030ca51b653..f2272e6beaf 100644 --- a/docs/SourceTreeDoxyfile +++ b/docs/SourceTreeDoxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = doxy_out +OUTPUT_DIRECTORY = _static/doxygen # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and @@ -1564,7 +1564,7 @@ EXTRA_SEARCH_MAPPINGS = # If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output. # The default value is: YES. -GENERATE_LATEX = YES +GENERATE_LATEX = NO # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -1841,7 +1841,7 @@ GENERATE_XML = YES # The default directory is: xml. # This tag requires that the tag GENERATE_XML is set to YES. -XML_OUTPUT = xml +XML_OUTPUT = ../../doxy_out/xml # If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program # listings (including syntax highlighting and cross-referencing information) to diff --git a/docs/conf.py b/docs/conf.py index d1763486df5..a921a29a037 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,9 +18,13 @@ import subprocess, os, runpy -rebuild_doxygen = not os.path.isdir("doxy_out/xml") +rebuild_doxygen = not os.path.isdir("doxy_out/xml") or not os.path.isdir("_static/doxygen/html") +if not os.path.isdir("_static"): + os.makedirs("_static") + if rebuild_doxygen: + os.makedirs("doxy_out/xml") subprocess.call('doxygen SourceTreeDoxyfile', shell=True) #exec(open("./BuildRSTDocs.py").read()) @@ -75,6 +79,7 @@ # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' +html_static_path = ['_static'] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/documentation_building.rst b/docs/documentation_building.rst index 98fe792e89c..7800f699d44 100644 --- a/docs/documentation_building.rst +++ b/docs/documentation_building.rst @@ -51,6 +51,6 @@ In order to make :bash:`make html` work, you may need to do a few steps: 4. Add Doxygen to your path with :bash:`PATH=":${PATH}"`. You may want to add this to your "~/.bash_profile" so your :bash:`PATH` is - always correct. Run :bash:`source ~.bash_profile` to run that code. + always correct. Run :bash:`source ~/.bash_profile` to run that code. 5. Try running :bash:`make html` again. diff --git a/docs/index.rst b/docs/index.rst index c665b6fab7f..e4603712d64 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,8 @@ recurrent neural networks via back propagation through time (BPTT) training, transfer learning, and multi-model and ensemble training methods. +Users are advised to view `the Doxygen API Documentation +<_static/doxygen/html/index.html>`_ for API information. .. toctree:: :maxdepth: 2 @@ -40,6 +42,7 @@ methods. 
:maxdepth: 2 :caption: Developer Documentation + lbann lbann/lbann style_guide continuous_integration From 279b4d4a5f0268519c9ce91f5940a59a355d1336 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 4 Nov 2019 15:16:59 -0800 Subject: [PATCH 381/634] Add hacked bugfixes for Lassen (#1337) * Add hacked bugfixes for Lassen in LC-specific Python launcher * Tweak docs for hacked Lassen bugfixes --- python/lbann/contrib/lc/launcher.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/python/lbann/contrib/lc/launcher.py b/python/lbann/contrib/lc/launcher.py index 34a3ca7c6a5..93d99895453 100644 --- a/python/lbann/contrib/lc/launcher.py +++ b/python/lbann/contrib/lc/launcher.py @@ -129,7 +129,7 @@ def make_batch_script(script_file=None, mask_str = ','.join([hex(mask) for mask in masks]) launcher_args.append('--cpu_bind=mask_cpu:{}'.format(mask_str)) - # Hacked bugfix for MPI_Init in MVAPICH2-2.3 + # Hacked bugfix for MPI_Init in MVAPICH2-2.3 (8/23/18) # Note: MPI_Init hangs when started with more than 35 # processes. This bug is not present in MVAPICH2-2.2 but is # present in MVAPICH2-2.3rc2. @@ -164,6 +164,17 @@ def make_batch_script(script_file=None, if 'IBV_FORK_SAFE' not in environment: environment['IBV_FORK_SAFE'] = 1 + # Hacked bugfix for hcoll (1/23/19) + # Note: Fixes hangs in MPI_Bcast. + if 'HCOLL_ENABLE_SHARP' not in environment: + environment['HCOLL_ENABLE_SHARP'] = 0 + if 'OMPI_MCA_coll_hcoll_enable' not in environment: + environment['OMPI_MCA_coll_hcoll_enable'] = 0 + + # Hacked bugfix for Spectrum MPI PAMI (9/17/19) + if 'PAMI_MAX_NUM_CACHED_PAGES' not in environment: + environment['PAMI_MAX_NUM_CACHED_PAGES'] = 0 + return lbann.launcher.make_batch_script(script_file=script_file, work_dir=work_dir, nodes=nodes, From 32e10ccd8215b25fc8d12364e677b8d3ecf76811 Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Tue, 5 Nov 2019 11:25:59 -0800 Subject: [PATCH 382/634] tidy the source for imcomm (#1340) --- include/lbann/callbacks/imcomm.hpp | 18 +++----- src/callbacks/imcomm.cpp | 69 ++++++++++++------------------ 2 files changed, 35 insertions(+), 52 deletions(-) diff --git a/include/lbann/callbacks/imcomm.hpp b/include/lbann/callbacks/imcomm.hpp index 1108b061882..66b4e68dcc3 100644 --- a/include/lbann/callbacks/imcomm.hpp +++ b/include/lbann/callbacks/imcomm.hpp @@ -46,7 +46,7 @@ class imcomm : public callback_base { using callback_base::on_backward_prop_end; enum comm_type { - NONE, /** Do no gradient updates. */ + NONE=0, /** Do no gradient updates. */ NORMAL, /** Simply sum gradient updates. */ }; @@ -54,7 +54,7 @@ class imcomm : public callback_base { * Initialize with ct being used for all weights. */ imcomm(comm_type ct = NORMAL, - const std::shared_ptr& summarizer = nullptr); + const std::shared_ptr& summarizer = nullptr); imcomm(const imcomm&) = default; imcomm& operator=(const imcomm&) = default; imcomm* copy() const override { @@ -65,7 +65,7 @@ class imcomm : public callback_base { * Implies no inter-model updates for other weights. */ imcomm(comm_type ct, std::unordered_set weights_list, - const std::shared_ptr& summarizer = nullptr); + const std::shared_ptr& summarizer = nullptr); /** Choose comm type ct for weights. */ void set_weights_comm(weights *w, comm_type ct); @@ -80,15 +80,12 @@ class imcomm : public callback_base { std::string name() const override { return "imcomm"; } private: - /** Parameters for a given set of weights. */ - struct imcomm_params { - /** Type of communication done. 
*/ - comm_type ct = NONE; - }; + /** Default communication type. */ comm_type m_default_ct; + /** Per-weights parameters. */ - std::unordered_map m_weights_params; + std::unordered_map m_weights_params; /** Summarize relevant statistics. */ void do_summary(model *m, weights *w, EvalType im_time); @@ -97,8 +94,7 @@ class imcomm : public callback_base { std::shared_ptr m_summarizer = nullptr; }; - -/** returns a string representation of the weight_initialization */ +/** returns a string representation of the weight_initialization. */ std::string get_comm_type_name(imcomm::comm_type m); // Builder function diff --git a/src/callbacks/imcomm.cpp b/src/callbacks/imcomm.cpp index a7726a24f8e..8efa1864f79 100644 --- a/src/callbacks/imcomm.cpp +++ b/src/callbacks/imcomm.cpp @@ -40,48 +40,36 @@ namespace lbann { namespace callback { imcomm::imcomm(imcomm::comm_type ct, - const std::shared_ptr& summarizer) : + const std::shared_ptr& summarizer) : m_default_ct(ct), m_summarizer(summarizer) {} imcomm::imcomm(imcomm::comm_type ct, - std::unordered_set weights_list, - const std::shared_ptr& summarizer) : + std::unordered_set weights_list, + const std::shared_ptr& summarizer) : imcomm(ct, summarizer) { for (weights *w : weights_list) { - m_weights_params[w] = {}; - m_weights_params[w].ct = ct; + m_weights_params[w] = ct; } } -void imcomm::set_weights_comm(weights *w, - comm_type ct) { - m_weights_params[w] = {}; - m_weights_params[w].ct = ct; +void imcomm::set_weights_comm(weights *w, comm_type ct) { + m_weights_params[w] = ct; } void imcomm::setup(model *m) { for (weights *w : m->get_weights()) { - // Add weights if not already in list if (m_weights_params.find(w) == m_weights_params.end()) { - m_weights_params[w] = {}; - m_weights_params[w].ct = (w->get_optimizer() != nullptr ? - m_default_ct : - NONE); + m_weights_params[w] = (w->get_optimizer() != nullptr ? 
+ m_default_ct : + NONE); } - // Setup imcomm parameters if needed - imcomm_params& params = m_weights_params[w]; - if (params.ct != NONE) { - optimizer *opt = w->get_optimizer(); - if (opt == nullptr) { - std::stringstream err; - err << "imcomm: trying to do inter-model gradient communication on " - << w->get_name() << ", which has no optimizer"; - LBANN_ERROR(err.str()); - } + if ((m_weights_params[w] != NONE) && ( w->get_optimizer() == nullptr)) { + LBANN_ERROR( + "imcomm: trying to do inter-model gradient communication on ", + w->get_name(),", which has no optimizer"); } - } } @@ -106,16 +94,16 @@ void imcomm::on_backward_prop_end(model *m) { } for (weights *w : m->get_weights()) { EvalType start_time = get_time(); - imcomm_params& params = m_weights_params[w]; - if (params.ct == NONE) { + auto const& ct = m_weights_params[w]; + if (ct == NONE) { continue; } optimizer *opt = w->get_optimizer(); auto gradient = std::unique_ptr{opt->get_gradient().Copy()}; - Mat* local_gradients = &(static_cast(gradient->Matrix())); - switch (params.ct) { + auto& local_gradients = gradient->Matrix(); + switch (ct) { case NORMAL: - comm->intertrainer_sum_matrix(*local_gradients); + comm->intertrainer_sum_matrix(local_gradients); break; default: LBANN_ERROR("imcomm: unknown comm type"); @@ -128,7 +116,7 @@ void imcomm::on_backward_prop_end(model *m) { } void imcomm::do_summary(model *m, weights *w, - EvalType im_time) { + EvalType im_time) { if (m_summarizer == nullptr) { return; } @@ -149,21 +137,22 @@ void imcomm::do_summary(model *m, weights *w, bytes_received, c.get_step()); } -static std::vector comm_type_names = { "none", "normal" }; - -/** returns a string representation of the weight_initialization */ +/* Returns a string representation of the weight_initialization */ std::string get_comm_type_name(imcomm::comm_type m) { - if ((int)m < 0 or (int)m >= (int)comm_type_names.size()) { - LBANN_ERROR(" Invalid comm_type"); + switch (m) { + case imcomm::NONE: return "none"; + case imcomm::NORMAL: return "normal"; + default: + LBANN_ERROR("Unknown value for comm_type"); } - return comm_type_names[(int)m]; } std::unique_ptr build_imcomm_callback_from_pbuf( const google::protobuf::Message& proto_msg, const std::shared_ptr& summarizer) { - const auto& params = dynamic_cast(proto_msg); + using param_msg_type = lbann_data::Callback::CallbackImComm; + const auto& params = dynamic_cast(proto_msg); const auto& type_str = params.intertrainer_comm_method(); imcomm::comm_type type = imcomm::comm_type::NONE; if (type_str == "none") { @@ -171,9 +160,7 @@ build_imcomm_callback_from_pbuf( } else if (type_str == "normal") { type = imcomm::comm_type::NORMAL; } else { - std::ostringstream err; - err << "invalid inter-model communication type (" << type_str << ")"; - LBANN_ERROR(err.str()); + LBANN_ERROR("invalid inter-model communication type (", type_str, ")"); } std::unordered_set selected_weights; /// @todo Initialize weights return make_unique(type, selected_weights, summarizer); From 4ae76465bae71e2ea4396e0ef5a6afa3a0d5dea5 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 5 Nov 2019 17:58:07 -0800 Subject: [PATCH 383/634] Move vision models to applications directory (#1335) * Create vision application and add LeNet * Add ImageNet models to vision application * Remove vision scripts in model zoo --- applications/vision/README.md | 21 +++ {model_zoo => applications}/vision/alexnet.py | 62 ++----- applications/vision/data/__init__.py | 0 applications/vision/data/imagenet/__init__.py | 42 +++++ 
.../data/imagenet/data_reader.prototext       |  60 +++++++
 applications/vision/data/mnist/.gitignore     |   5 +
 applications/vision/data/mnist/__init__.py    |  59 +++++++
 .../vision/data/mnist/data_reader.prototext   |  30 ++++
 .../vision/densenet.py                        | 152 +++---------------
 {model_zoo => applications}/vision/lenet.py   |  22 +--
 {model_zoo => applications}/vision/resnet.py  |  79 ++-------
 docs/running_lbann.rst                        |   9 +-
 model_zoo/vision/.gitignore                   |   1 -
 13 files changed, 277 insertions(+), 265 deletions(-)
 create mode 100644 applications/vision/README.md
 rename {model_zoo => applications}/vision/alexnet.py (52%)
 mode change 100755 => 100644
 create mode 100644 applications/vision/data/__init__.py
 create mode 100644 applications/vision/data/imagenet/__init__.py
 create mode 100644 applications/vision/data/imagenet/data_reader.prototext
 create mode 100644 applications/vision/data/mnist/.gitignore
 create mode 100644 applications/vision/data/mnist/__init__.py
 create mode 100644 applications/vision/data/mnist/data_reader.prototext
 rename {model_zoo => applications}/vision/densenet.py (72%)
 mode change 100755 => 100644
 rename {model_zoo => applications}/vision/lenet.py (79%)
 mode change 100755 => 100644
 rename {model_zoo => applications}/vision/resnet.py (65%)
 mode change 100755 => 100644
 delete mode 100644 model_zoo/vision/.gitignore

diff --git a/applications/vision/README.md b/applications/vision/README.md
new file mode 100644
index 00000000000..fc05737bf67
--- /dev/null
+++ b/applications/vision/README.md
@@ -0,0 +1,21 @@
+# Example models for computer vision
+
+This directory contains LBANN implementations of widely-used vision
+models. They are intended to validate and benchmark LBANN's vision
+functionality, and are also suitable as pedagogical tools for using
+LBANN.
+
+## LeNet
+
+`lenet.py` trains a LeNet model on MNIST data. It is a simple script
+intended to demonstrate LBANN's Python API. It calls helper functions
+in `data/mnist/__init__.py` to download MNIST data and construct MNIST
+data readers.
+
+## ImageNet models
+
+`alexnet.py`, `resnet.py`, and `densenet.py` are primarily used for
+performance benchmarks and scaling studies. They use LLNL-specific
+features, and the helper functions in `data/imagenet/__init__.py`
+assume that the user is on an LLNL LC system and belongs to the
+`brainusr` group.
diff --git a/model_zoo/vision/alexnet.py b/applications/vision/alexnet.py
old mode 100755
new mode 100644
similarity index 52%
rename from model_zoo/vision/alexnet.py
rename to applications/vision/alexnet.py
index eca35721714..51c0ff7c4d5
--- a/model_zoo/vision/alexnet.py
+++ b/applications/vision/alexnet.py
@@ -1,23 +1,18 @@
-#!/usr/bin/env python3
 import argparse
-from os.path import abspath, dirname, join
-import google.protobuf.text_format as txtf
 import lbann
 import lbann.models
-import lbann.proto
 import lbann.contrib.args
-
-# Default data reader
-model_zoo_dir = dirname(dirname(abspath(__file__)))
-data_reader_prototext = join(model_zoo_dir,
-                             'data_readers',
-                             'data_reader_imagenet.prototext')
+import lbann.contrib.lc.launcher
+import data.imagenet

 # Command-line arguments
 desc = ('Construct and run AlexNet on ImageNet-1K data. 
' 'Running the experiment is only supported on LC systems.') parser = argparse.ArgumentParser(description=desc) lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_alexnet', type=str, + help='scheduler job name (default: lbann_alexnet)') parser.add_argument( '--mini-batch-size', action='store', default=256, type=int, help='mini-batch size (default: 256)', metavar='NUM') @@ -25,17 +20,12 @@ '--num-epochs', action='store', default=100, type=int, help='number of epochs (default: 100)', metavar='NUM') parser.add_argument( - '--num-labels', action='store', default=1000, type=int, - help='number of data classes (default: 1000)', metavar='NUM') + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') lbann.contrib.args.add_optimizer_arguments(parser) parser.add_argument( - '--data-reader', action='store', - default=data_reader_prototext, type=str, - help='data reader prototext file (default: ' + data_reader_prototext + ')', - metavar='FILE') -parser.add_argument( - '--prototext', action='store', type=str, - help='exported prototext file', metavar='FILE') + '--setup_only', action='store_true', + help='setup LBANN experiment without running it') args = parser.parse_args() # Due to a data reader limitation, the actual model realization must be @@ -78,35 +68,15 @@ # Setup optimizer opt = lbann.contrib.args.create_optimizer(args) -# Load data reader from prototext -data_reader_proto = lbann.lbann_pb2.LbannPB() -with open(args.data_reader, 'r') as f: - txtf.Merge(f.read(), data_reader_proto) -data_reader_proto = data_reader_proto.data_reader +# Setup data reader +data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes) # Setup trainer trainer = lbann.Trainer() -# Save prototext -if args.prototext: - lbann.proto.save_prototext(args.prototext, - trainer=trainer, - model=model, optimizer=opt, - data_reader=data_reader_proto) - # Run experiment -if not args.prototext: - from lbann.contrib.lc.paths import imagenet_dir, imagenet_labels - import lbann.contrib.lc.launcher - kwargs = lbann.contrib.args.get_scheduler_kwargs(args) - classes = args.num_labels - kwargs['lbann_args'] = ( - '--data_filedir_train={} --data_filename_train={} ' - '--data_filedir_test={} --data_filename_test={}' - .format(imagenet_dir(data_set='train', num_classes=classes), - imagenet_labels(data_set='train', num_classes=classes), - imagenet_dir(data_set='val', num_classes=classes), - imagenet_labels(data_set='val', num_classes=classes))) - lbann.contrib.lc.launcher.run(trainer, model, data_reader_proto, opt, - job_name = 'lbann_alexnet', - **kwargs) +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.lc.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + setup_only=args.setup_only, + **kwargs) diff --git a/applications/vision/data/__init__.py b/applications/vision/data/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/applications/vision/data/imagenet/__init__.py b/applications/vision/data/imagenet/__init__.py new file mode 100644 index 00000000000..cf8790cd413 --- /dev/null +++ b/applications/vision/data/imagenet/__init__.py @@ -0,0 +1,42 @@ +import os +import os.path + +import google.protobuf.text_format +import lbann +import lbann.contrib.lc.paths + +def make_data_reader(num_classes=1000): + + # Load Protobuf message from file + current_dir = os.path.dirname(os.path.realpath(__file__)) + protobuf_file = 
os.path.join(current_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Check if data paths are accessible + train_data_dir = lbann.contrib.lc.paths.imagenet_dir(data_set='train', + num_classes=num_classes) + train_label_file = lbann.contrib.lc.paths.imagenet_labels(data_set='train', + num_classes=num_classes) + test_data_dir = lbann.contrib.lc.paths.imagenet_dir(data_set='val', + num_classes=num_classes) + test_label_file = lbann.contrib.lc.paths.imagenet_labels(data_set='val', + num_classes=num_classes) + if not os.path.isdir(train_data_dir): + raise FileNotFoundError('could not access {}'.format(train_data_dir)) + if not os.path.isfile(train_label_file): + raise FileNotFoundError('could not access {}'.format(train_label_file)) + if not os.path.isdir(test_data_dir): + raise FileNotFoundError('could not access {}'.format(test_data_dir)) + if not os.path.isfile(test_label_file): + raise FileNotFoundError('could not access {}'.format(test_label_file)) + + # Set paths + message.reader[0].data_filedir = train_data_dir + message.reader[0].data_filename = train_label_file + message.reader[1].data_filedir = test_data_dir + message.reader[1].data_filename = test_label_file + + return message diff --git a/applications/vision/data/imagenet/data_reader.prototext b/applications/vision/data/imagenet/data_reader.prototext new file mode 100644 index 00000000000..3f4e0270f3f --- /dev/null +++ b/applications/vision/data/imagenet/data_reader.prototext @@ -0,0 +1,60 @@ +data_reader { + reader { + name: "imagenet" + role: "train" + shuffle: true + data_filedir: "path/to/ILSVRC2012/train" + data_filename: "path/to/ILSVRC2012/labels/train.txt" + validation_percent: 0.0 + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + random_resized_crop { + height: 224 + width: 224 + } + } + transforms { + horizontal_flip { + p: 0.5 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } + + reader { + name: "imagenet" + role: "validate" + data_filedir: "path/to/ILSVRC2012/val" + data_filename: "path/to/ILSVRC2012/labels/val.txt" + percent_of_data_to_use: 1.0 + num_labels: 1000 + + transforms { + resized_center_crop { + height: 256 + width: 256 + crop_height: 224 + crop_width: 224 + } + } + transforms { + colorize {} + } + transforms { + normalize_to_lbann_layout { + means: "0.406 0.456 0.485" + stddevs: "0.225 0.224 0.229" + } + } + } +} diff --git a/applications/vision/data/mnist/.gitignore b/applications/vision/data/mnist/.gitignore new file mode 100644 index 00000000000..10c191aa77f --- /dev/null +++ b/applications/vision/data/mnist/.gitignore @@ -0,0 +1,5 @@ +*.gz +train-images-idx3-ubyte +train-labels-idx1-ubyte +t10k-images-idx3-ubyte +t10k-labels-idx1-ubyte diff --git a/applications/vision/data/mnist/__init__.py b/applications/vision/data/mnist/__init__.py new file mode 100644 index 00000000000..271ccf0f61e --- /dev/null +++ b/applications/vision/data/mnist/__init__.py @@ -0,0 +1,59 @@ +import gzip +import os +import os.path +import urllib.request + +import google.protobuf.text_format +import lbann + +# Paths +data_dir = os.path.dirname(os.path.realpath(__file__)) + +def download_data(): + """Download MNIST data files, if needed. + + Data files are downloaded from http://yann.lecun.com/exdb/mnist/ + and uncompressed. 
Does nothing if the files already exist. + + """ + + # MNIST data files and associated URLs + urls = { + 'train-images-idx3-ubyte': 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', + 'train-labels-idx1-ubyte': 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', + 't10k-images-idx3-ubyte': 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', + 't10k-labels-idx1-ubyte': 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', + } + + # Download and uncompress MNIST data files, if needed + for data_file, url in urls.items(): + data_file = os.path.join(data_dir, data_file) + compressed_file = data_file + '.gz' + if not os.path.isfile(data_file): + urllib.request.urlretrieve(url, filename=compressed_file) + with gzip.open(compressed_file, 'rb') as in_file: + with open(data_file, 'wb') as out_file: + out_file.write(in_file.read()) + +def make_data_reader(): + """Make Protobuf message for MNIST data reader. + + MNIST data is downloaded if needed. + + """ + + # Download MNIST data files + download_data() + + # Load Protobuf message from file + protobuf_file = os.path.join(data_dir, 'data_reader.prototext') + message = lbann.lbann_pb2.LbannPB() + with open(protobuf_file, 'r') as f: + google.protobuf.text_format.Merge(f.read(), message) + message = message.data_reader + + # Set paths + for reader in message.reader: + reader.data_filedir = data_dir + + return message diff --git a/applications/vision/data/mnist/data_reader.prototext b/applications/vision/data/mnist/data_reader.prototext new file mode 100644 index 00000000000..61c3b32cf42 --- /dev/null +++ b/applications/vision/data/mnist/data_reader.prototext @@ -0,0 +1,30 @@ +data_reader { + reader { + name: "mnist" + role: "train" + shuffle: true + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "train-images-idx3-ubyte" + label_filename: "train-labels-idx1-ubyte" + validation_percent: 0.1 + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } + reader { + name: "mnist" + role: "test" + data_filedir: "lbann/applications/vision/data/mnist" + data_filename: "t10k-images-idx3-ubyte" + label_filename: "t10k-labels-idx1-ubyte" + percent_of_data_to_use: 1.0 + transforms { + scale { + scale: 0.003921568627 # 1/255 + } + } + } +} diff --git a/model_zoo/vision/densenet.py b/applications/vision/densenet.py old mode 100755 new mode 100644 similarity index 72% rename from model_zoo/vision/densenet.py rename to applications/vision/densenet.py index b1b19c6b44a..5b1fc7f9e19 --- a/model_zoo/vision/densenet.py +++ b/applications/vision/densenet.py @@ -1,13 +1,8 @@ -#!/usr/bin/env python3 import argparse -import os.path -import subprocess -import google.protobuf.text_format as txtf import lbann import lbann.contrib.args import lbann.contrib.lc.launcher - -# TODO: Add trainer argument after PR #916 merges +import data.imagenet LOG = True @@ -16,50 +11,6 @@ def log(string): if LOG: print(string) -# Commands to run ############################################################## - -# Allocate notes on Pascal from ssh: -# salloc --nodes=16 --partition=pbatch --time=180 - -# From lbann/model_zoo/vision: -# ./densenet.py -# --disable-run (if experiment shouldn't be run) -# --mini-batch-size 128 (if mini-batch-size should be something other than 256) -# --nodes 16 (if more than one node is to be used; 16 is optimal) -# --procs-per-node 2 - -# To run the full 90 epochs from ssh: -# ./densenet.py --nodes 16 --procs-per-node 2 > 
/usr/workspace/wsb//lbann/model_zoo/vision/output.txt -# mini-batch-size default => 256, num-epochs => 90 - -# To run 10 epoch test from ssh: -# ./densenet.py --nodes 16 --procs-per-node 2 --mini-batch-size 256 --num-epochs 10 > /usr/workspace/wsb//lbann/model_zoo/vision/output.txt - -# To avoid needing to stay logged into ssh, create a script -# densenet_batch_job.cmd such as: -# #!/bin/bash -# #SBATCH --nodes 8 -# #SBATCH --partition pbatch -# #SBATCH --time 840 -# -# module load gcc/7.1.0 -# ../../scripts/build_lbann_lc.sh --compiler gnu --reconfigure -# -# module load python/3.6.4 -# ./densenet.py --nodes 8 --procs-per-node 2 --mini-batch-size 256 --num-epochs 10 &> /usr/workspace/wsb//lbann/model_zoo/vision/output.txt - -# and from lbann/model_zoo/vision run: -# sbatch densenet_batch_job.cmd - -# To generate visualization, from lbann run: -# scripts/viz.py model_zoo/models/densenet/generated_densenet.prototext - -# Copy the output file, experiment directory, and visualization -# from LC to your computer by running the following commands from your computer: -# scp @pascal.llnl.gov:/usr/workspace/wsb//lbann/model_zoo/vision/output.txt . -# scp -r @pascal.llnl.gov:/usr/workspace/wsb//lbann/model_zoo/vision/_lbann_densenet/ . -# scp @pascal.llnl.gov:/usr/workspace/wsb//lbann/graph.pdf . - # DenseNet ##################################################################### # See src/proto/lbann.proto for possible functions to call. @@ -373,6 +324,9 @@ def get_args(): 'Running the experiment is only supported on LC systems.') parser = argparse.ArgumentParser(description=desc) lbann.contrib.args.add_scheduler_arguments(parser) + parser.add_argument( + '--job-name', action='store', default='lbann_densenet', type=str, + help='scheduler job name (default: lbann_densenet)') parser.add_argument( '--mini-batch-size', action='store', default=256, type=int, help='mini-batch size (default: 256)', metavar='NUM') @@ -380,41 +334,15 @@ def get_args(): '--num-epochs', action='store', default=90, type=int, help='number of epochs (default: 90)', metavar='NUM') parser.add_argument( - '--num-labels', action='store', default=1000, type=int, - help='number of data classes (default: 1000)', metavar='NUM') + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') lbann.contrib.args.add_optimizer_arguments( parser, default_optimizer='sgd', default_learning_rate=0.1 ) - lbann_dir = subprocess.check_output( - 'git rev-parse --show-toplevel'.split()).strip() - # https://stackoverflow.com/questions/606191/convert-bytes-to-a-string - lbann_dir = lbann_dir.decode("utf-8") - data_reader_prototext = os.path.join(lbann_dir, - 'model_zoo', - 'data_readers', - 'data_reader_imagenet.prototext') parser.add_argument( - '--data-reader', action='store', - default=data_reader_prototext, type=str, - help='data reader prototext file (default: ' + data_reader_prototext + ')', - metavar='FILE') - parser.add_argument( - '--imagenet-classes', action='store', type=int, - help='number of ImageNet-1K classes (availability of subsampled datasets may vary by system)', - metavar='NUM') - generated_prototext = os.path.join(lbann_dir, - 'model_zoo', - 'models', - 'densenet', - 'generated_densenet.prototext') - parser.add_argument( - '--prototext', action='store', - default=generated_prototext, type=str, - help='exported prototext file', metavar='FILE') - parser.add_argument( - '--disable-run', action='store_true', + '--setup_only', action='store_true', help='do not run experiment 
(e.g. if only the prototext is desired)') args = parser.parse_args() return args @@ -473,11 +401,8 @@ def set_up_experiment(args, metrics=metrics, callbacks=callbacks) - # Load data reader from prototext - data_reader_proto = lbann.lbann_pb2.LbannPB() - with open(args.data_reader, 'r') as f: - txtf.Merge(f.read(), data_reader_proto) - data_reader_proto = data_reader_proto.data_reader + # Set up data reader + data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes) # Set up optimizer if args.optimizer == 'sgd': @@ -493,53 +418,19 @@ def set_up_experiment(args, # Setup trainer trainer = lbann.Trainer() - # Save prototext to args.prototext - if args.prototext: - lbann.proto.save_prototext(args.prototext, - trainer=trainer, - model=model, - optimizer=optimizer, - data_reader=data_reader_proto) - - return trainer, model, data_reader_proto, optimizer + return trainer, model, data_reader, optimizer def run_experiment(args, trainer, model, - data_reader_proto, + data_reader, optimizer): - # Run experiment - if not args.disable_run: - from lbann.contrib.lc.paths import imagenet_dir, imagenet_labels - import lbann.contrib.lc.launcher - kwargs = {} - if args.nodes: - kwargs['nodes'] = args.nodes - if args.procs_per_node: - kwargs['procs_per_node'] = args.procs_per_node - if args.partition: - kwargs['partition'] = args.partition - if args.account: - kwargs['account'] = args.account - if args.time_limit: - kwargs['time_limit'] = args.time_limit - if args.imagenet_classes: - classes = args.imagenet_classes - kwargs['lbann_args'] = ( - '--data_filedir_train={} --data_filename_train={} ' - '--data_filedir_test={} --data_filename_test={}' - .format(imagenet_dir(data_set='train', num_classes=classes), - imagenet_labels(data_set='train', - num_classes=classes), - imagenet_dir(data_set='val', num_classes=classes), - imagenet_labels(data_set='val', - num_classes=classes))) - lbann.contrib.lc.launcher.run(trainer, model, - data_reader_proto, - optimizer, - job_name='lbann_densenet', - **kwargs) + # Note: Use `lbann.run` instead for non-LC systems. + kwargs = lbann.contrib.args.get_scheduler_kwargs(args) + lbann.contrib.lc.launcher.run(trainer, model, data_reader, optimizer, + job_name=args.job_name, + **kwargs) # Main function ################################################################ @@ -547,23 +438,19 @@ def main(): # ---------------------------------- # Command-line arguments # ---------------------------------- - args = get_args() - # Match this with number of GPUs per node - # On Lassen, this will be 4. - # On Pascal, this will be 2. - # If there are no GPUs, then match the number of processes per node. - statistics_group_size = 2 + args = get_args() # ---------------------------------- # Construct layer graph # ---------------------------------- + input_node = lbann.Input() # Start counting cumulative layers at 1. cumulative_layer_num = 1 log('Input. cumulative_layer_num={n}'.format(n=cumulative_layer_num)) (probs, labels) = construct_layer_graph( - statistics_group_size, + args.procs_per_node, 121, cumulative_layer_num, input_node) # ---------------------------------- @@ -576,7 +463,6 @@ def main(): # ---------------------------------- # Run experiment # ---------------------------------- - # Note: Use `lbann.run` instead for non-LC systems. 
run_experiment(args, trainer, model, data_reader_proto, optimizer) diff --git a/model_zoo/vision/lenet.py b/applications/vision/lenet.py old mode 100755 new mode 100644 similarity index 79% rename from model_zoo/vision/lenet.py rename to applications/vision/lenet.py index ecab1daf54b..d7d7ff9b7dd --- a/model_zoo/vision/lenet.py +++ b/applications/vision/lenet.py @@ -1,15 +1,12 @@ -#!/usr/bin/env python3 import argparse -import os.path -import google.protobuf.text_format as txtf import lbann +import data.mnist # ---------------------------------- # Command-line arguments # ---------------------------------- -desc = ('Construct and run LeNet on MNIST data. ' - 'Running the experiment is only supported on LC systems.') +desc = ('Train LeNet on MNIST data using LBANN.') parser = argparse.ArgumentParser(description=desc) parser.add_argument( '--partition', action='store', type=str, @@ -87,15 +84,8 @@ # Setup optimizer opt = lbann.SGD(learn_rate=0.01, momentum=0.9) -# Load data reader from prototext -model_zoo_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -data_reader_file = os.path.join(model_zoo_dir, - 'data_readers', - 'data_reader_mnist.prototext') -data_reader_proto = lbann.lbann_pb2.LbannPB() -with open(data_reader_file, 'r') as f: - txtf.Merge(f.read(), data_reader_proto) -data_reader_proto = data_reader_proto.data_reader +# Setup data reader +data_reader = data.mnist.make_data_reader() # Setup trainer trainer = lbann.Trainer() @@ -109,6 +99,4 @@ kwargs = {} if args.partition: kwargs['partition'] = args.partition if args.account: kwargs['account'] = args.account -lbann.run(trainer, model, data_reader_proto, opt, - job_name='lbann_lenet', - **kwargs) +lbann.run(trainer, model, data_reader, opt, **kwargs) diff --git a/model_zoo/vision/resnet.py b/applications/vision/resnet.py old mode 100755 new mode 100644 similarity index 65% rename from model_zoo/vision/resnet.py rename to applications/vision/resnet.py index 3d4d0029a64..34f203ccc10 --- a/model_zoo/vision/resnet.py +++ b/applications/vision/resnet.py @@ -1,25 +1,20 @@ -#!/usr/bin/env python3 import argparse -from os.path import abspath, dirname, join -import google.protobuf.text_format as txtf import lbann import lbann.models import lbann.models.resnet -import lbann.proto import lbann.contrib.args import lbann.contrib.models.wide_resnet - -# Default data reader -model_zoo_dir = dirname(dirname(abspath(__file__))) -data_reader_prototext = join(model_zoo_dir, - 'data_readers', - 'data_reader_imagenet.prototext') +import lbann.contrib.lc.launcher +import data.imagenet # Command-line arguments desc = ('Construct and run ResNet on ImageNet-1K data. 
' 'Running the experiment is only supported on LC systems.') parser = argparse.ArgumentParser(description=desc) lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_resnet', type=str, + help='scheduler job name (default: lbann_resnet)') parser.add_argument( '--resnet', action='store', default=50, type=int, choices=(18, 34, 50, 101, 152), @@ -37,10 +32,6 @@ parser.add_argument( '--block-channels', action='store', default=None, type=str, help='Internal channels in each ResNet block (comma-separated list)') -parser.add_argument( - '--bn-stats-aggregation', action='store', type=str, - help=('aggregation mode for batch normalization statistics ' - '(default: "local") (DEPRECATED)')) parser.add_argument( '--bn-statistics-group-size', action='store', default=1, type=int, help=('Group size for aggregating batch normalization statistics ' @@ -54,39 +45,21 @@ '--num-epochs', action='store', default=90, type=int, help='number of epochs (default: 90)', metavar='NUM') parser.add_argument( - '--num-labels', action='store', default=1000, type=int, - help='number of data classes (default: 1000)', metavar='NUM') + '--num-classes', action='store', default=1000, type=int, + help='number of ImageNet classes (default: 1000)', metavar='NUM') parser.add_argument( '--random-seed', action='store', default=0, type=int, help='random seed for LBANN RNGs', metavar='NUM') lbann.contrib.args.add_optimizer_arguments(parser, default_learning_rate=0.1) parser.add_argument( - '--data-reader', action='store', - default=data_reader_prototext, type=str, - help='data reader prototext file (default: ' + data_reader_prototext + ')', - metavar='FILE') -parser.add_argument( - '--prototext', action='store', type=str, - help='exported prototext file (do not run experiment)', metavar='FILE') + '--setup_only', action='store_true', + help='setup LBANN experiment without running it') args = parser.parse_args() # Due to a data reader limitation, the actual model realization must be # hardcoded to 1000 labels for ImageNet. imagenet_labels = 1000 -# Handle old-style batchnorm aggregation. 
-if args.bn_stats_aggregation is not None: - print('--bn-stats-aggregation is deprected, use --bn-statistics-group-size') - if args.bn_stats_aggregation == 'local': - args.bn_statistics_group_size = 1 - elif args.bn_stats_aggregation == 'node_local': - raise RuntimeError('Cannot translate node_local stats aggregation') - elif args.bn_stats_aggregation == 'global': - args.bn_statistics_group_size = 0 - else: - raise RuntimeError('Unknown stats aggregation ' - + args.bn_stats_aggregation) - # Choose ResNet variant resnet_variant_dict = {18: lbann.models.ResNet18, 34: lbann.models.ResNet34, @@ -171,35 +144,15 @@ # Setup optimizer opt = lbann.contrib.args.create_optimizer(args) -# Load data reader from prototext -data_reader_proto = lbann.lbann_pb2.LbannPB() -with open(args.data_reader, 'r') as f: - txtf.Merge(f.read(), data_reader_proto) -data_reader_proto = data_reader_proto.data_reader +# Setup data reader +data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes) # Setup trainer trainer = lbann.Trainer() -# Save prototext -if args.prototext: - lbann.proto.save_prototext(args.prototext, - trainer=trainer, - model=model, optimizer=opt, - data_reader=data_reader_proto) - # Run experiment -if not args.prototext: - from lbann.contrib.lc.paths import imagenet_dir, imagenet_labels - import lbann.contrib.lc.launcher - kwargs = lbann.contrib.args.get_scheduler_kwargs(args) - classes = args.num_labels - kwargs['lbann_args'] = ( - '--data_filedir_train={} --data_filename_train={} ' - '--data_filedir_test={} --data_filename_test={}' - .format(imagenet_dir(data_set='train', num_classes=classes), - imagenet_labels(data_set='train', num_classes=classes), - imagenet_dir(data_set='val', num_classes=classes), - imagenet_labels(data_set='val', num_classes=classes))) - lbann.contrib.lc.launcher.run(trainer, model, data_reader_proto, opt, - job_name='lbann_resnet', - **kwargs) +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.lc.launcher.run(trainer, model, data_reader, opt, + job_name=args.job_name, + setup_only=args.setup_only, + **kwargs) diff --git a/docs/running_lbann.rst b/docs/running_lbann.rst index b7e16d0e9a9..f6e66b2e6f6 100644 --- a/docs/running_lbann.rst +++ b/docs/running_lbann.rst @@ -210,9 +210,8 @@ Python frontend ------------------------------------------------ LBANN provides a Python frontend with syntax reminiscent of `PyTorch -`_. See the `model zoo implementation of LeNet -`_ -for a simple example. +`_. See `a simple implementation of LeNet +`_. 
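+
+As a rough sketch (assuming the ``data.mnist`` helper module added under
+``applications/vision`` in this PR is importable, and with
+``construct_model`` standing in for any model built with the Python
+layer API), a complete MNIST script now reduces to::
+
+    import lbann
+    import data.mnist
+
+    # placeholder for a model built with the Python layer API
+    model = construct_model()
+
+    # downloads MNIST if needed and patches the prototext data paths
+    data_reader = data.mnist.make_data_reader()
+
+    trainer = lbann.Trainer()
+    opt = lbann.SGD(learn_rate=0.01, momentum=0.9)
+    lbann.run(trainer, model, data_reader, opt)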
Comments: @@ -363,10 +362,10 @@ A simple example opt = lbann.SGD(learn_rate=0.01, momentum=0.9) # Load data reader from prototext - import google.protobuf.text_format as txtf + import google.protobuf.text_format data_reader_proto = lbann.lbann_pb2.LbannPB() with open('path/to/lbann/model_zoo/data_readers/data_reader_mnist.prototext', 'r') as f: - txtf.Merge(f.read(), data_reader_proto) + google.protobuf.text_format.Merge(f.read(), data_reader_proto) data_reader_proto = data_reader_proto.data_reader # ---------------------------------- diff --git a/model_zoo/vision/.gitignore b/model_zoo/vision/.gitignore deleted file mode 100644 index 8a9d92cf4c7..00000000000 --- a/model_zoo/vision/.gitignore +++ /dev/null @@ -1 +0,0 @@ -model.prototext From 201acf307d234a19e91c9507a484604459577f3e Mon Sep 17 00:00:00 2001 From: davidHysom Date: Wed, 6 Nov 2019 11:01:55 -0800 Subject: [PATCH 384/634] Identify and fix bugs in data_reader_jag_conduit (#1339) * Added preload_helper method; deleted read_partial_node (read_partial_node() was the primary bug) * Not checking data is now the default; the former flag --no_check_data has been replaced by the flag --check_data * Added a second mutex; the same mutex was previously locked twice without an intervening unlock, which caused a deadlock * Modified preload_helper() to avoid a conduit::Node copy. * Deleted a "has_path()" check in preload_helper(), since, if the path doesn't exist, conduit will throw an exception; lbann doing the check and throwing an exception is redundant. * Passes bamboo --- .../data_readers/data_reader_jag_conduit.hpp | 3 +- .../lbann/data_store/data_store_conduit.hpp | 1 + src/data_readers/data_reader_jag_conduit.cpp | 89 +++++++------------ src/data_store/data_store_conduit.cpp | 26 +++--- 4 files changed, 51 insertions(+), 68 deletions(-) diff --git a/include/lbann/data_readers/data_reader_jag_conduit.hpp b/include/lbann/data_readers/data_reader_jag_conduit.hpp index ecb59c58c3c..bf47bde8657 100644 --- a/include/lbann/data_readers/data_reader_jag_conduit.hpp +++ b/include/lbann/data_readers/data_reader_jag_conduit.hpp @@ -345,7 +345,6 @@ class data_reader_jag_conduit : public generic_data_reader { bool has_path(const file_handle_t& h, const std::string& path) const; void read_node(const file_handle_t& h, const std::string& path, conduit::Node& n) const; - void read_partial_node(const file_handle_t& h, const std::string& path, conduit::Node& n) const; /// Allow const access to the conduit data structure static const conduit::Node& get_conduit_node(const conduit::Node& n_base, const std::string key); @@ -464,6 +463,8 @@ class data_reader_jag_conduit : public generic_data_reader { sample_list_t m_sample_list; bool m_list_per_trainer; bool m_list_per_model; + + void preload_helper(const hid_t& h, const std::string &sample_name, const std::string &field_name, int data_id, conduit::Node &node); }; /** diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 6c5f5c8b57a..6e6635dd879 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -216,6 +216,7 @@ private : /// used in set_conduit_node(...)
std::mutex m_mutex; + std::mutex m_mutex_2; /// for use in local cache mode char *m_mem_seg = 0; diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index 8cc08991b03..4af68d7c814 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -289,7 +289,6 @@ const conduit::Node& data_reader_jag_conduit::get_conduit_node(const conduit::No } bool data_reader_jag_conduit::load_conduit_node(const size_t i, const std::string& key, conduit::Node& node) const { - if (m_io_thread_pool != nullptr && m_using_random_node.count(m_io_thread_pool->get_local_thread_id())) { LBANN_ERROR("previously retrieved a random conduit node from data_store, so shouldn't be here"); } @@ -346,55 +345,10 @@ bool data_reader_jag_conduit::load_conduit_node(const size_t i, const std::strin } } - if (options::get()->get_bool("old_method") || ! options::get()->get_bool("preload_data_store")) { - read_node(h, path, node); - } else { - read_partial_node(h, path, node); - } - + read_node(h, path, node); return true; } -#ifdef _USE_IO_HANDLE_ -void data_reader_jag_conduit::read_partial_node(const data_reader_jag_conduit::file_handle_t& h, const std::string& path, conduit::Node& n) const { - LBANN_ERROR("Not implemented; please contact Dave Hysom"); -} -#else - -void data_reader_jag_conduit::read_partial_node(const data_reader_jag_conduit::file_handle_t& h, const std::string& path, conduit::Node& n) const { - conduit::Node work; - if (!has_path(h, path)) { - LBANN_ERROR("has_path failed for: ", path, ": num nodes successfully loaded by this rank: ", m_data_store->get_data_size()); - } - const std::string key = path + "/inputs"; - const std::string key2 = path + "/outputs/scalars"; - - if (! has_path(h, key)) { - LBANN_ERROR("has_path failed for: ", key, ": num nodes successfully loaded by this rank: ", m_data_store->get_data_size()); - } - conduit::relay::io::hdf5_read(h, key, work); - n["inputs"] = work; - //n[key2] = work; - - if (! has_path(h, key2)) { - LBANN_ERROR("has_path failed for: ", key2, ": num nodes successfully loaded by this rank: ", m_data_store->get_data_size()); - } - conduit::relay::io::hdf5_read(h, key2, work); - n["/outputs/scalars"] = work; - //n[key] = work; - - for (auto &&t : m_emi_image_keys) { - const std::string key3 = "/" + path + "/outputs/images/" + t; - if (! 
has_path(h, key3)) { - LBANN_ERROR("has_path failed for: ", key3, ": num nodes successfully loaded by this rank: ", m_data_store->get_data_size()); - } - conduit::relay::io::hdf5_read(h, key3, work); - //n[key3] = work; - n["/outputs/images/" + t] = work; - } -} -#endif - bool data_reader_jag_conduit::has_conduit_path(const size_t i, const std::string& key) const { const sample_t& s = m_sample_list[i]; sample_file_id_t id = s.first; @@ -462,6 +416,10 @@ data_reader_jag_conduit::get_dependent_variable_type() const { return m_dependent; } +/** + * Note: this method is called by init_image_data_reader in + * src/proto/init_image_data_readers.cpp + */ void data_reader_jag_conduit::set_image_dims(const int width, const int height, const int ch) { if ((width > 0) && (height > 0) && (ch > 0)) { // set and valid m_image_width = width; @@ -473,6 +431,10 @@ void data_reader_jag_conduit::set_image_dims(const int width, const int height, set_linearized_image_size(); } +/** + * Note: this method is called by init_image_data_reader in + * src/proto/init_image_data_readers.cpp + */ void data_reader_jag_conduit::set_image_choices(const std::vector image_keys) { m_emi_image_keys = image_keys; // For example, in the data reader prototext file, have a line similar to the one below @@ -523,6 +485,10 @@ bool data_reader_jag_conduit::filter(const std::set& key_filter, return false; } +/** + * Note: this method is called by init_image_data_reader in + * src/proto/init_image_data_readers.cpp + */ void data_reader_jag_conduit::set_scalar_choices(const std::vector& keys) { m_scalar_keys = keys; check_scalar_keys(); @@ -552,6 +518,8 @@ const std::vector& data_reader_jag_conduit::get_scalar_choices() co /** * To use no key, set 'Undefined' to the corresponding variable type, * or call this with an empty vector argument after loading data. + * Note: this method is called by init_image_data_reader in + * src/proto/init_image_data_readers.cpp */ void data_reader_jag_conduit::set_input_choices(const std::vector& keys) { m_input_keys = keys; @@ -819,7 +787,7 @@ void data_reader_jag_conduit::load() { const std::string sample_list_file = data_dir + get_data_index_list(); options *opts = options::get(); - bool check_data = ! 
opts->get_bool("no_check_data"); + bool check_data = opts->get_bool("check_data"); /// The use of these flags need to be updated to properly separate /// how index lists are used between trainers and models @@ -827,7 +795,7 @@ void data_reader_jag_conduit::load() { double tm2 = get_time(); load_list_of_samples(sample_list_file, m_comm->get_procs_per_trainer(), m_comm->get_rank_in_trainer()); if(is_master()) { - std::cout << "Finished loadingsample list; time: " << get_time() - tm2 << std::endl; + std::cout << "Finished loading sample list; time: " << get_time() - tm2 << std::endl; if (!check_data) { std::cout << "Skipping check data" << std::endl; } @@ -894,6 +862,11 @@ void data_reader_jag_conduit::load() { select_subset_of_data(); } +void data_reader_jag_conduit::preload_helper(const hid_t& h, const std::string &sample_name, const std::string &field_name, int data_id, conduit::Node &node) { + const std::string path = sample_name + field_name; + const std::string key2 = '/' + LBANN_DATA_ID_STR(data_id) + field_name; + read_node(h, path, node[key2]); +} void data_reader_jag_conduit::do_preload_data_store() { conduit::Node work; @@ -906,7 +879,7 @@ void data_reader_jag_conduit::do_preload_data_store() { double tm1 = get_time(); if (get_comm()->am_world_master() || (opts->get_bool("ltfb_verbose") && get_comm()->am_trainer_master())) { - LBANN_WARNING("starting preload for role: ", get_role(), "; --old_method=", opts->get_bool("old_method")); + LBANN_WARNING("starting preload for role: ", get_role()); } for (size_t idx=0; idx < m_shuffled_indices.size(); idx++) { @@ -915,12 +888,19 @@ void data_reader_jag_conduit::do_preload_data_store() { continue; } try { - work.reset(); + const sample_t& s = m_sample_list[index]; + const std::string& sample_name = s.second; + sample_file_id_t id = s.first; m_sample_list.open_samples_file_handle(index, true); - load_conduit_node(index, key, work); + auto h = m_sample_list.get_samples_file_handle(id); conduit::Node & node = m_data_store->get_empty_node(index); - const std::string padded_idx = '/' + LBANN_DATA_ID_STR(index); - node[padded_idx] = work; + + preload_helper(h, sample_name, m_output_scalar_prefix, index, node); + preload_helper(h, sample_name, m_input_prefix, index, node); + for (auto t : m_emi_image_keys) { + const std::string field_name = m_output_image_prefix + t; + preload_helper(h, sample_name, field_name, index, node); + } m_data_store->set_preloaded_conduit_node(index, node); } catch (conduit::Error const& e) { LBANN_ERROR(" :: trying to load the node " + std::to_string(index) + " with key " + key + " and got " + e.what()); @@ -935,7 +915,6 @@ void data_reader_jag_conduit::do_preload_data_store() { m_sample_list.close_if_done_samples_file_handle(index); } - if (get_comm()->am_world_master() || (opts->get_bool("ltfb_verbose") && get_comm()->am_trainer_master())) { std::stringstream msg; diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index ed4246d99a4..2312eebeaf4 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -266,7 +266,6 @@ void data_store_conduit::spill_preloaded_conduit_node(int data_id, const conduit void data_store_conduit::set_preloaded_conduit_node(int data_id, const conduit::Node &node) { // note: at this point m_data[data_id] = node - { std::lock_guard lock(m_mutex); ++m_my_num_indices; @@ -294,15 +293,17 @@ void data_store_conduit::error_check_compacted_node(const conduit::Node &nd, int if (m_node_sizes_vary) { return; } - 
std::lock_guard lock(m_mutex); - if (m_compacted_sample_size == 0) { - m_compacted_sample_size = nd.total_bytes_compact(); - PROFILE("num bytes for nodes to be transmitted: ", nd.total_bytes_compact(), " per node"); - } else if (m_compacted_sample_size != nd.total_bytes_compact() && !m_node_sizes_vary) { - LBANN_ERROR("Conduit node being added data_id: ", data_id, - " is not the same size as existing nodes in the data_store ", - m_compacted_sample_size, " != ", nd.total_bytes_compact(), - " role: ", m_reader->get_role()); + { + std::lock_guard lock(m_mutex_2); + if (m_compacted_sample_size == 0) { + m_compacted_sample_size = nd.total_bytes_compact(); + PROFILE("num bytes for nodes to be transmitted: ", nd.total_bytes_compact(), " per node"); + } else if (m_compacted_sample_size != nd.total_bytes_compact() && !m_node_sizes_vary) { + LBANN_ERROR("Conduit node being added data_id: ", data_id, + " is not the same size as existing nodes in the data_store ", + m_compacted_sample_size, " != ", nd.total_bytes_compact(), + " role: ", m_reader->get_role()); + } } if (!nd.is_contiguous()) { LBANN_ERROR("m_data[", data_id, "] does not have a contiguous layout"); @@ -320,8 +321,9 @@ void data_store_conduit::error_check_compacted_node(const conduit::Node &nd, int // since the threading from the data_reader will cause you grief void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool already_have) { std::lock_guard lock(m_mutex); - // TODO: test whether having multiple mutexes below is better than - // locking this entire call with a single mutex + // TODO: test whether having multiple mutexes below is better (faster) than + // locking this entire call with a single mutex. For now I'm + // playing it safe and locking the whole dang thing. ++m_my_num_indices; if (m_is_local_cache && m_preload) { From f916eff82ede7a3e6758a38f4d696138a4531a41 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 6 Nov 2019 11:03:41 -0800 Subject: [PATCH 385/634] Fix bug in model zoo CMake (#1342) The vision subdirectory was removed in PR #1135 --- model_zoo/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model_zoo/CMakeLists.txt b/model_zoo/CMakeLists.txt index 1249ef6392b..ef0bfcac210 100644 --- a/model_zoo/CMakeLists.txt +++ b/model_zoo/CMakeLists.txt @@ -41,5 +41,5 @@ install( # Install the relevant prototext install(FILES README.md DESTINATION ${CMAKE_INSTALL_DATADIR}/model_zoo) -install(DIRECTORY data_readers models optimizers tests vision +install(DIRECTORY data_readers models optimizers tests DESTINATION ${CMAKE_INSTALL_DATADIR}/model_zoo) From eea2e309055fd871edb9566e473d3c1a05a5fb1f Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Wed, 6 Nov 2019 11:30:00 -0800 Subject: [PATCH 386/634] Update train_atom_char_rnn.py With variable names for embedding layer. 
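
The new keyword names follow PyTorch's nn.Embedding. A minimal
before/after sketch (the layer input and sizes here are illustrative,
not taken from the ATOM model):

    # before this commit
    emb = lbann.Embedding(x, dictionary_size=30, embedding_size=5)

    # after this commit
    emb = lbann.Embedding(x, num_embeddings=30, embedding_dim=5)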
--- applications/ATOM/train_atom_char_rnn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/ATOM/train_atom_char_rnn.py b/applications/ATOM/train_atom_char_rnn.py index 14b1f820fe5..c1df599fdde 100644 --- a/applications/ATOM/train_atom_char_rnn.py +++ b/applications/ATOM/train_atom_char_rnn.py @@ -84,8 +84,8 @@ def construct_model(): for i in range(sequence_length-1): emb_l = lbann.Embedding( idl[i], - dictionary_size=dictionary_size, - embedding_size=embedding_size, + num_embeddings=dictionary_size, + embedding_dim=embedding_size, name='emb_'+str(i), device='CPU', weights=emb_weights From 0b3d99a6ad8a1560e40464b07d62d30eb748dc9b Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 6 Nov 2019 11:32:49 -0800 Subject: [PATCH 387/634] Initial implementation of node2vec (#1333) * Add function to download graphs from Stanford SNAP website * Add SNAP repo as submodule * Implement node2vec on CPU without negative sampling * Tweak documentation for SNAP utilities * Use tabs in .gitmodules * Add tests for data ingestion in node2vec app --- .gitmodules | 4 + applications/graph/README.md | 17 ++++ applications/graph/data/.gitignore | 2 + applications/graph/dataset.py | 84 +++++++++++++++++ applications/graph/main.py | 111 +++++++++++++++++++++++ applications/graph/snap | 1 + applications/graph/test/test_dataset.py | 38 ++++++++ applications/graph/utils/__init__.py | 17 ++++ applications/graph/utils/snap.py | 116 ++++++++++++++++++++++++ 9 files changed, 390 insertions(+) create mode 100644 .gitmodules create mode 100644 applications/graph/README.md create mode 100644 applications/graph/data/.gitignore create mode 100644 applications/graph/dataset.py create mode 100644 applications/graph/main.py create mode 160000 applications/graph/snap create mode 100644 applications/graph/test/test_dataset.py create mode 100644 applications/graph/utils/__init__.py create mode 100644 applications/graph/utils/snap.py diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000000..090b654b8e9 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "applications/graph/snap"] + path = applications/graph/snap + url = https://github.com/snap-stanford/snap + ignore = dirty diff --git a/applications/graph/README.md b/applications/graph/README.md new file mode 100644 index 00000000000..70ed7bccb16 --- /dev/null +++ b/applications/graph/README.md @@ -0,0 +1,17 @@ +# Experiments with graph data + +This work is focused on scaling graph embedding algorithms on +distributed systems, both to achieve strong scaling and to handle very +large graphs. + +## Dependencies + +- SNAP: C++ package that includes baseline implementation of node2vec + algorithm. Install with: + +```bash +cd /path/to/lbann +git submodule update --init applications/graph/snap +cd applications/graph/snap +make +``` diff --git a/applications/graph/data/.gitignore b/applications/graph/data/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/applications/graph/data/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/applications/graph/dataset.py b/applications/graph/dataset.py new file mode 100644 index 00000000000..092413c56b2 --- /dev/null +++ b/applications/graph/dataset.py @@ -0,0 +1,84 @@ +"""Random walk dataset. + +This is intended to be imported by the Python data reader and used to +obtain data samples. 
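+
+Each walk of length walk_length yields
+(walk_length - walk_context_length + 1) context windows, so with the
+defaults below every 80-step walk contributes 71 samples; get_sample()
+maps a flat index to a (walk, offset) pair with divmod. A quick sanity
+check (hypothetical session; values assume the defaults):
+
+    >>> import dataset
+    >>> dataset.sample_dims()
+    (10,)
+    >>> dataset.num_samples() % 71
+    0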
+ +""" +import os.path + +import numpy as np +import utils.snap + +# Options +graph_name = 'ego-Facebook' +walk_length = 80 +walk_context_length = 10 +walks_per_node = 4 +return_param = 1.0 +inout_param = 1.0 +directed = False +weighted = False + +# Download graph and perform random walk, if needed +root_dir = os.path.dirname(os.path.realpath(__file__)) +data_dir = os.path.join(root_dir, 'data', graph_name) +graph_file = os.path.join(data_dir, 'graph.txt') +walk_file = os.path.join(data_dir, 'walk.txt') +if not os.path.isfile(graph_file): + utils.snap.download_graph(graph_name, graph_file) +if not os.path.isfile(walk_file): + utils.snap.node2vec_walk( + graph_file, + walk_file, + walk_length, + walks_per_node, + return_param, + inout_param, + directed, + weighted) + +# Load random walks from file +walks = np.loadtxt(walk_file, dtype=int) +assert walks.shape[1] == walk_length, \ + ('Random walks in {} have length {}, but expected a walk length of {}' + .format(walk_file, walks.shape[1], walk_length)) + +# Sample access functions +def get_sample(index): + contexts_per_walk = walk_length - walk_context_length + 1 + walk_index, context_index = divmod(index, contexts_per_walk) + return walks[walk_index, + context_index:context_index+walk_context_length] +def num_samples(): + num_walks = walks.shape[0] + contexts_per_walk = walk_length - walk_context_length + 1 + return num_walks * contexts_per_walk +def sample_dims(): + return (walk_context_length,) + +def max_graph_node_id(graph_file=graph_file): + """Largest node ID in graph. + + Nodes should be numbered consecutively from 0 to + (num_graph_nodes-1). If there are any gaps in the IDs, then + unnecessary memory will be allocated. If any IDs are negative, + there may be mysterious errors. + + Args: + graph_file (str): Uncompressed edge list file. + + Returns: + int: Largest node ID in graph. 
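+
+    A hypothetical check against the default ego-Facebook graph, whose
+    SNAP edge list has 4,039 nodes with consecutive 0-based IDs (the
+    value is always recomputed from the edge list, so this is just an
+    example):
+
+        >>> max_graph_node_id()
+        4038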
+ + """ + max_id = -1 + with open(graph_file) as f: + for line in f: + line = line.split('#')[0] + line = line.split() + if len(line) >= 2: + max_id = max(max_id, int(line[0])) + max_id = max(max_id, int(line[1])) + if max_id < 0: + raise RuntimeError('Graph has no non-negative node IDs') + return max_id diff --git a/applications/graph/main.py b/applications/graph/main.py new file mode 100644 index 00000000000..c6fdcc95518 --- /dev/null +++ b/applications/graph/main.py @@ -0,0 +1,111 @@ +import argparse +import os.path + +import lbann +import lbann.contrib.lc.launcher +import lbann.contrib.args + +import dataset +from utils import make_iterable, str_list +import utils.snap + +# ---------------------------------- +# Options +# ---------------------------------- + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_node2vec', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=20, type=int, + help='number of epochs (default: 20)', metavar='NUM') +parser.add_argument( + '--latent-dim', action='store', default=128, type=int, + help='latent space dimensions (default: 128)', metavar='NUM') +args = parser.parse_args() + +# ---------------------------------- +# Embedding weights +# ---------------------------------- + +embeddings = lbann.Weights(initializer=lbann.NormalInitializer(mean=0, + standard_deviation=1), + name='embeddings') + +# ---------------------------------- +# Construct layer graph +# ---------------------------------- + +# Properties of graph and random walk +num_graph_nodes = dataset.max_graph_node_id() + 1 +walk_length = dataset.sample_dims()[0] + +# Input is a sequence of graph node IDs +input_ = lbann.Identity(lbann.Input(), device='cpu') +input_slice = lbann.Slice(input_, + slice_points=str_list(range(walk_length+1)), + device='cpu') +walk = [] +for _ in range(walk_length): + walk.append(lbann.Identity(input_slice, device='cpu')) + +# Skip-gram architecture +latent = lbann.Embedding(walk[0], + weights=embeddings, + num_embeddings=num_graph_nodes, + embedding_dim=args.latent_dim, + device='cpu') +pred = lbann.FullyConnected(latent, + weights=embeddings, + num_neurons=num_graph_nodes, + has_bias=False, + transpose=True, + device='cpu') +pred = lbann.Softmax(pred, device='cpu') + +# Objective function +ground_truth = lbann.Sum([lbann.OneHot(node, size=num_graph_nodes, device='cpu') + for node in walk[1:]], + device='cpu') +obj = lbann.CrossEntropy([pred, ground_truth], device='cpu') + +# ---------------------------------- +# Create data reader +# ---------------------------------- + +reader = lbann.reader_pb2.DataReader() +_reader = reader.reader.add() +_reader.name = 'python' +_reader.role = 'train' +_reader.percent_of_data_to_use = 1.0 +_reader.python.module = 'dataset' +_reader.python.module_dir = os.path.dirname(os.path.realpath(__file__)) +_reader.python.sample_function = 'get_sample' +_reader.python.num_samples_function = 'num_samples' +_reader.python.sample_dims_function = 'sample_dims' + +# ---------------------------------- +# Run LBANN +# ---------------------------------- + +# Create LBANN objects +trainer = lbann.Trainer() +model = lbann.Model(args.mini_batch_size, + args.num_epochs, + layers=lbann.traverse_layer_graph(input_), + 
objective_function=obj, + callbacks=[lbann.CallbackPrint(), + lbann.CallbackTimer()]) +opt = lbann.SGD(learn_rate=0.01, momentum=0.9) + +# Run LBANN +kwargs = lbann.contrib.args.get_scheduler_kwargs(args) +lbann.contrib.lc.launcher.run(trainer, model, reader, opt, + job_name=args.job_name, + **kwargs) diff --git a/applications/graph/snap b/applications/graph/snap new file mode 160000 index 00000000000..907c34aac6b --- /dev/null +++ b/applications/graph/snap @@ -0,0 +1 @@ +Subproject commit 907c34aac6bcddc7c2f8efb64be76e87dd7e4ea5 diff --git a/applications/graph/test/test_dataset.py b/applications/graph/test/test_dataset.py new file mode 100644 index 00000000000..85525ae0c47 --- /dev/null +++ b/applications/graph/test/test_dataset.py @@ -0,0 +1,38 @@ +import os.path +import random +import sys + +# Local paths +root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(root_dir) + +def test_dataset(): + import dataset + + # Check max node ID + max_graph_node_id = dataset.max_graph_node_id() + assert max_graph_node_id >= 0, 'Negative graph node ID' + assert max_graph_node_id != 0, \ + 'Max graph node ID is zero, ' \ + 'which implies graph has only one node or node IDs are negative' + + # Check sample dimensions + sample_dims = dataset.sample_dims() + assert len(sample_dims) == 1, 'Unexpected dimensions for data sample' + assert sample_dims[0] > 0, 'Invalid dimensions for data sample' + + # Check number of samples + num_samples = dataset.num_samples() + assert num_samples >= 0, 'Invalid number of data samples' + assert num_samples != 0, 'Dataset has no data samples' + + # Check samples + indices = [random.randint(0, num_samples-1) for _ in range(20)] + indices.append(0) + indices.append(num_samples-1) + for index in indices: + sample = dataset.get_sample(index) + assert sample.shape == sample_dims, 'Unexpected dimensions for data sample' + for node in sample: + assert 0 <= node <= max_graph_node_id, \ + 'Invalid graph node ID in data sample' diff --git a/applications/graph/utils/__init__.py b/applications/graph/utils/__init__.py new file mode 100644 index 00000000000..370660d532f --- /dev/null +++ b/applications/graph/utils/__init__.py @@ -0,0 +1,17 @@ +"""Utilities for LBANN graph models""" +import collections.abc + +def make_iterable(obj): + """Convert to an iterable object. + + Simply returns `obj` if it is already iterable. Otherwise returns a + 1-tuple containing `obj`. + """ + if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + return obj + else: + return (obj,) + +def str_list(it): + """Convert an iterable object to a space-separated string.""" + return ' '.join([str(i) for i in make_iterable(it)]) diff --git a/applications/graph/utils/snap.py b/applications/graph/utils/snap.py new file mode 100644 index 00000000000..ca43ba767e9 --- /dev/null +++ b/applications/graph/utils/snap.py @@ -0,0 +1,116 @@ +"""Utilities to interact with SNAP. + +SNAP is the Stanford Network Analysis Platform. See +https://snap.stanford.edu. + +""" +import os +import os.path +import urllib.request +import gzip +import subprocess + +# Root directory for LBANN graph application +_root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + + +def download_graph(name='ego-Facebook', + graph_file=None): + """Download graph edgelist file from SNAP website. + + Args: + name (str): Name of graph. + graph_file (str, optional): File where uncompressed edge list + will be saved (default: in 'data' directory).
+ + Returns: + str: Uncompressed edge list file. + + """ + + # Graphs from SNAP + download_urls = { + 'ego-Facebook': 'http://snap.stanford.edu/data/facebook_combined.txt.gz', + } + + # Paths + if not graph_file: + graph_file = os.path.join(_root_dir, 'data', name, 'graph.txt') + data_dir = os.path.dirname(graph_file) + if not os.path.isdir(data_dir): + os.makedirs(data_dir) + data_dir = os.path.realpath(data_dir) + graph_file = os.path.realpath(graph_file) + compressed_file = graph_file + '.gz' + + # Download and uncompress graph file + urllib.request.urlretrieve(download_urls[name], + filename=compressed_file) + with gzip.open(compressed_file, 'rb') as in_file: + with open(graph_file, 'wb') as out_file: + out_file.write(in_file.read()) + + return graph_file + + +def node2vec_walk(graph_file, + walk_file, + walk_length, + walks_per_node, + return_param=1.0, + inout_param=1.0, + directed=False, + weighted=False, + verbose=False): + """Perform random walk on graph for node2vec. + + See https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf + + Args: + graph_file (str): Uncompressed edge list file. + walk_file (str): File where random walks will be saved. + walk_length (int): Walk length. + walks_per_node (int): Number of walks per graph vertex. + return_param (float, optional): p-parameter for random walk + (default: 1.0). + inout_param (float, optional): q-parameter for random walk + (default: 1.0). + directed (bool, optional): Graph is directed (default: False). + weighted (bool, optional): Graph is weighted (default: False). + verbose (bool, optional): Verbose output (default: False). + + """ + + # Check executable + node2vec_exe = os.path.join(_root_dir, 'snap', 'examples', + 'node2vec', 'node2vec') + if not os.path.isfile(node2vec_exe): + raise FileNotFoundError( + 'Could not find node2vec executable at {}. ' + 'Has SNAP been built?' 
+ .format(node2vec_exe) + ) + + # Construct invocation + command = [ + node2vec_exe, + '-i:{}'.format(graph_file), + '-o:{}'.format(walk_file), + '-d:-1', + '-l:{}'.format(walk_length), + '-r:{}'.format(walks_per_node), + '-k:-1', + '-e:-1', + '-p:{}'.format(return_param), + '-q:{}'.format(inout_param), + '-ow', + ] + if verbose: + command.append('-v') + if directed: + command.append('-dr') + if weighted: + command.append('-w') + + # Run executable + return subprocess.call(command) From 3357fb4b6fbc9111c8c36fffa8b90d95d97e9027 Mon Sep 17 00:00:00 2001 From: Jae-Seung Yeom Date: Wed, 6 Nov 2019 11:37:50 -0800 Subject: [PATCH 388/634] Initialize RNGs for trainers differently by default (#1336) * initialize RNGs for trainers differently by default (using RNG mix function) * update prototext examples * change random_init_trainers_identically from model option to trainer option --- model_zoo/models/jag/gan/vanilla/gan.prototext | 2 +- model_zoo/models/jag/wae.prototext | 2 +- model_zoo/models/jag/wae_cycle_gan/wae.prototext | 2 +- model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext | 2 +- src/proto/model.proto | 2 -- src/proto/trainer.proto | 4 +++- src/utils/lbann_library.cpp | 8 ++++---- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/model_zoo/models/jag/gan/vanilla/gan.prototext b/model_zoo/models/jag/gan/vanilla/gan.prototext index ef720e0b0d2..7f26ee4466a 100644 --- a/model_zoo/models/jag/gan/vanilla/gan.prototext +++ b/model_zoo/models/jag/gan/vanilla/gan.prototext @@ -1,7 +1,7 @@ trainer { + random_init_trainers_identically: false } model { - random_init_models_differently: true objective_function { l2_weight_regularization { scale_factor: 0.0001 diff --git a/model_zoo/models/jag/wae.prototext b/model_zoo/models/jag/wae.prototext index d14a2b56b8b..ee55855dc95 100644 --- a/model_zoo/models/jag/wae.prototext +++ b/model_zoo/models/jag/wae.prototext @@ -1,7 +1,7 @@ trainer { + random_init_trainers_identically: false } model { - random_init_models_differently: true serialize_io: true objective_function { l2_weight_regularization { diff --git a/model_zoo/models/jag/wae_cycle_gan/wae.prototext b/model_zoo/models/jag/wae_cycle_gan/wae.prototext index c36f9460080..9be59b9029a 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae.prototext @@ -1,7 +1,7 @@ trainer { + random_init_trainers_identically: false } model { - random_init_models_differently: true serialize_io: true name: "wae_model" objective_function { diff --git a/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext b/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext index 71ad7e0b3a3..23713efdc48 100644 --- a/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext +++ b/model_zoo/models/jag/wae_cycle_gan/wae_nobn.prototext @@ -1,7 +1,7 @@ trainer { + random_init_trainers_identically: false } model { - random_init_models_differently: true name: "wae_model" serialize_io: true objective_function { diff --git a/src/proto/model.proto b/src/proto/model.proto index 29d1c4d5753..e2c8f07f636 100644 --- a/src/proto/model.proto +++ b/src/proto/model.proto @@ -64,8 +64,6 @@ message Model { repeated Callback callback = 20; int64 random_seed = 30; - // If true, models will have their model rank mixed into their random seed. 
- bool random_init_models_differently = 31; Summarizer summarizer = 32; } diff --git a/src/proto/trainer.proto b/src/proto/trainer.proto index b16a0acc615..b02cdbf0335 100644 --- a/src/proto/trainer.proto +++ b/src/proto/trainer.proto @@ -53,7 +53,9 @@ message Trainer { // Advanced options // ------------------------------- + // If false, trainers will have their trainer rank mixed into their random seed. + bool random_init_trainers_identically = 4; + // Algorithmic block size for Hydrogen int64 hydrogen_block_size = 100; - } diff --git a/src/utils/lbann_library.cpp b/src/utils/lbann_library.cpp index 8388117226c..8bdfdfad335 100644 --- a/src/utils/lbann_library.cpp +++ b/src/utils/lbann_library.cpp @@ -182,16 +182,16 @@ std::unique_ptr build_model_from_prototext( // Initialize models differently if needed. #ifndef LBANN_DETERMINISTIC - if (pb_model->random_init_models_differently()) { - random_seed = random_seed + comm->get_trainer_rank(); + if (!pb_trainer->random_init_trainers_identically()) { + hash_combine(random_seed, comm->get_trainer_rank()); // Reseed here so that setup is done with this new seed. init_random(random_seed); init_data_seq_random(random_seed); } #else - if (pb_model->random_init_models_differently()) { + if (!pb_trainer->random_init_trainers_identically()) { if (master) { - std::cout << "WARNING: Ignoring random_init_models_differently " << + std::cout << "WARNING: forcing 'random_init_trainers_identically' " << "due to sequential consistency" << std::endl; } } From dc107e7ead0cf39755f480eb92c339d8ef98056f Mon Sep 17 00:00:00 2001 From: "Thomas R. Benson" Date: Wed, 6 Nov 2019 22:59:06 -0800 Subject: [PATCH 389/634] tweak doxygen format to improve navigation --- docs/SourceTreeDoxyfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/SourceTreeDoxyfile b/docs/SourceTreeDoxyfile index f2272e6beaf..44929c99dfe 100644 --- a/docs/SourceTreeDoxyfile +++ b/docs/SourceTreeDoxyfile @@ -1359,7 +1359,7 @@ ECLIPSE_DOC_ID = org.doxygen.Project # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -DISABLE_INDEX = YES +DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag From 4bc80820f42e9b35f45df1192bb0773a02b7b54b Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Thu, 7 Nov 2019 14:42:15 -0800 Subject: [PATCH 390/634] Clara-based argument parser (#1324) Initial addition of Clara-based argument parser. This just adds the capability; it does not deploy it. 
--- CMakeLists.txt | 4 + bamboo/compiler_tests/build_script.sh | 1 + cmake/modules/FindClara.cmake | 34 + include/lbann/utils/CMakeLists.txt | 2 + include/lbann/utils/argument_parser.hpp | 653 ++++++++++++++++++ include/lbann/utils/environment_variable.hpp | 145 ++++ include/lbann/utils/from_string.hpp | 132 ++++ scripts/build_lbann_lc.sh | 1 + src/utils/CMakeLists.txt | 2 + src/utils/argument_parser.cpp | 129 ++++ src/utils/environment_variable.cpp | 48 ++ src/utils/unit_test/CMakeLists.txt | 6 + src/utils/unit_test/argument_parser_test.cpp | 555 +++++++++++++++ .../unit_test/environment_variable_test.cpp | 169 +++++ src/utils/unit_test/from_string_test.cpp | 141 ++++ .../unit_test/stubs/preset_env_accessor.cpp | 11 + .../unit_test/stubs/preset_env_accessor.hpp | 75 ++ superbuild/CMakeLists.txt | 7 + superbuild/clara/CMakeLists.txt | 74 ++ 19 files changed, 2189 insertions(+) create mode 100644 cmake/modules/FindClara.cmake create mode 100644 include/lbann/utils/argument_parser.hpp create mode 100644 include/lbann/utils/environment_variable.hpp create mode 100644 include/lbann/utils/from_string.hpp create mode 100644 src/utils/argument_parser.cpp create mode 100644 src/utils/environment_variable.cpp create mode 100644 src/utils/unit_test/argument_parser_test.cpp create mode 100644 src/utils/unit_test/environment_variable_test.cpp create mode 100644 src/utils/unit_test/from_string_test.cpp create mode 100644 src/utils/unit_test/stubs/preset_env_accessor.cpp create mode 100644 src/utils/unit_test/stubs/preset_env_accessor.hpp create mode 100644 superbuild/clara/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index cfa4120c6ee..ba6e4762b56 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,9 @@ include(SetupCXX) # Required dependencies find_package(Threads REQUIRED) +# Argument parsing backend +find_package(Clara REQUIRED) + find_package(CEREAL NO_MODULE HINTS ${CEREAL_DIR} $ENV{CEREAL_DIR} PATH_SUFFIXES share/cmake/cereal @@ -497,6 +500,7 @@ target_include_directories(lbann PUBLIC # Use the IMPORTED targets when possible. 
target_link_libraries(lbann PUBLIC LbannProto) target_link_libraries(lbann PUBLIC Threads::Threads) +target_link_libraries(lbann PUBLIC clara::clara) target_link_libraries(lbann PUBLIC cereal) target_link_libraries(lbann PUBLIC OpenMP::OpenMP_CXX) target_link_libraries(lbann PUBLIC MPI::MPI_CXX) diff --git a/bamboo/compiler_tests/build_script.sh b/bamboo/compiler_tests/build_script.sh index 5cd13988337..14f0ca478d7 100755 --- a/bamboo/compiler_tests/build_script.sh +++ b/bamboo/compiler_tests/build_script.sh @@ -57,6 +57,7 @@ then export CUDNN_DIR=$(find ${BRAIN_DIR}/cudnn -maxdepth 2 -type d | grep "cuda-10.*_${ARCH}" | tail -n1) # Unit testing framework + export CLARA_DIR=${WORKSPACE_DIR}/stable_dependencies/clara export CATCH2_DIR=${WORKSPACE_DIR}/stable_dependencies/catch2 # Add Ninja support diff --git a/cmake/modules/FindClara.cmake b/cmake/modules/FindClara.cmake new file mode 100644 index 00000000000..ff2f02cafd3 --- /dev/null +++ b/cmake/modules/FindClara.cmake @@ -0,0 +1,34 @@ +# Output variables +# +# Clara_FOUND +# Clara_LIBRARIES +# Clara_INCLUDE_PATH +# +# Also creates an imported target clara::clara + +# Find the header +find_path(CLARA_INCLUDE_PATH clara.hpp + HINTS ${CLARA_DIR} $ENV{CLARA_DIR} ${Clara_DIR} $ENV{Clara_DIR} + PATH_SUFFIXES include + NO_DEFAULT_PATH) +find_path(CLARA_INCLUDE_PATH clara.hpp) + +# Handle the find_package arguments +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + Clara DEFAULT_MSG CLARA_INCLUDE_PATH) + +# Build the imported target +if (NOT TARGET clara::clara) + add_library(clara::clara INTERFACE IMPORTED) +endif() + +set_property(TARGET clara::clara + PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${CLARA_INCLUDE_PATH}) + +# Set the last of the output variables +set(CLARA_LIBRARIES clara::clara) + +# Cleanup +mark_as_advanced(FORCE CLARA_INCLUDE_PATH) diff --git a/include/lbann/utils/CMakeLists.txt b/include/lbann/utils/CMakeLists.txt index c31a52fdc75..1eb60010a52 100644 --- a/include/lbann/utils/CMakeLists.txt +++ b/include/lbann/utils/CMakeLists.txt @@ -1,6 +1,7 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS any.hpp + argument_parser.hpp compiler_control.hpp cublas.hpp cuda.hpp @@ -9,6 +10,7 @@ set_full_path(THIS_DIR_HEADERS description.hpp entrywise_operator.hpp enum_iterator.hpp + environment_variable.hpp eti_macros.hpp exception.hpp factory.hpp diff --git a/include/lbann/utils/argument_parser.hpp b/include/lbann/utils/argument_parser.hpp new file mode 100644 index 00000000000..00be73e1f3f --- /dev/null +++ b/include/lbann/utils/argument_parser.hpp @@ -0,0 +1,653 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_UTILS_ARGUMENT_PARSER_HPP_INCLUDED
+#define LBANN_UTILS_ARGUMENT_PARSER_HPP_INCLUDED
+
+#include "lbann/utils/any.hpp"
+#include "lbann/utils/environment_variable.hpp"
+
+#include <clara.hpp>
+
+#include <initializer_list>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+namespace lbann
+{
+namespace utils
+{
+
+/** @class argument_parser
+ *  @brief Basic argument parsing with automatic help messages.
+ *
+ *  @section arg_parser_params Supported parameter types
+ *
+ *  The argument parser supports 3 types of command line parameters:
+ *  flags, options, and arguments.
+ *
+ *  @subsection arg_parser_flags Flags
+ *
+ *  Flags default to "false" and toggle to "true" when they are given
+ *  on the command line. It is an error to provide a value to a flag
+ *  on the command line (e.g., "-flag 0"). If a flag called "-v" is
+ *  tied to a variable called `verbose`, `verbose` will have default
+ *  value `false`. Passing "-v" on the command line, `a.out -v`, will
+ *  result in `verbose` having post-parse value `true`.
+ *
+ *  @subsection arg_parser_options Options
+ *
+ *  Options represent key-value pairs. They must take only a single
+ *  value (e.g. `a.out -key value`). It is an error to omit a value
+ *  for a parameter of option type (e.g., `a.out -key`). Options are
+ *  strongly typed to match their default values. The string passed on
+ *  the command line must be convertible to the type of the default
+ *  value provided by the developer programmatically.
+ *
+ *  @subsection arg_parser_arguments Arguments
+ *
+ *  Arguments (or "positional arguments") do not name a key on the
+ *  command line and are implicitly keyed by their index in the
+ *  argument list. A corollary to this is that required arguments must
+ *  appear before optional arguments. Arguments within each category
+ *  ("required" and "optional") are keyed in the order in which they
+ *  are added.
+ *
+ *  On the command line, "optional" arguments are ordered after the
+ *  "required" arguments, in the order in which they are added. For
+ *  example, adding an (optional) argument called "A", then adding
+ *  a required argument called "B", then adding an (optional)
+ *  argument called "C" will require that these arguments be passed
+ *  as `a.out B A C`. Since "A" and "C" are optional, it is also
+ *  valid to pass `a.out B` or `a.out B A`. It is undefined
+ *  behavior to pass `a.out B C`.
+ *
+ *  Erroneously passing `a.out B C` might be accepted by the parser
+ *  if "A" and "C" have the same (or sufficiently compatible)
+ *  types, but the output will not be as expected (the variable
+ *  bound to "A" will have the value expected in "C", and the
+ *  variable bound to "C" will have its default value). If "A" and
+ *  "C" are not compatible types, an exception will be thrown.
+ *  In the first case, the parser cannot read your mind to know if you
+ *  passed things in the right order; it is the application
+ *  developer's responsibility to ensure that all arguments have
+ *  been added before the help message is printed, and it is the
+ *  user's responsibility to consult the help message for the
+ *  runtime ordering of arguments.
+ *
+ *  @section arg_parser_finalize Finalization
+ *
+ *  To accommodate the presence of required arguments with the
+ *  maintenance-intensive practice of adding arguments willy-nilly
+ *  (because I don't believe a PR without said terrifying
+ *  capability would ever make it through), parsing of the
+ *  arguments can be done two ways: with or without finalization.
+ *
+ *  If there are no required arguments registered in the parser,
+ *  these should be equivalent. If there are required arguments,
+ *  they must all have been registered with the parser and seen in
+ *  the arguments given to the parse functions before
+ *  finalization. Semantically, the parser must be finalized before
+ *  attempting to use any of the required arguments.
+ */
+class argument_parser
+{
+public:
+
+  /** @name Public types */
+  ///@{
+
+  /** @brief A proxy class representing the current value associated
+   *  with an option.
+   *
+   *  This class is best manipulated generically, through `auto`
+   *  variables.
+   *
+   *  @tparam T The type of the held object.
+   */
+  template <typename T>
+  class readonly_reference
+  {
+  public:
+    readonly_reference(T& val) noexcept : ref_(val) {}
+    T const& get() const noexcept { return ref_; }
+    operator T const& () const noexcept { return this->get(); }
+
+    template <typename S>
+    bool operator==(S const& y) const noexcept
+    { return this->get() == y; }
+
+  private:
+    T& ref_;
+  };// class readonly_reference
+
+  /** @class parse_error
+   *  @brief std::exception subclass that is thrown if the parser
+   *         cannot parse the arguments.
+   */
+  struct parse_error : std::runtime_error
+  {
+    /** @brief Construct the exception with the string to be
+     *         returned by what()
+     */
+    template <typename T>
+    parse_error(T&& what_arg)
+      : std::runtime_error{std::forward<T>(what_arg)} {}
+  };
+
+  /** @class missing_required_arguments
+   *  @brief std::exception subclass that is thrown if a required
+   *         argument is not found.
+   */
+  struct missing_required_arguments : std::runtime_error
+  {
+    /** @brief Construct the exception with a list of the missing
+     *         argument names.
+     *
+     *  @param[in] missing_args A container that holds the names
+     *             of the missing arguments.
+     */
+    template <typename Container>
+    missing_required_arguments(Container const& missing_args)
+      : std::runtime_error{build_what_string_(missing_args)}
+    {}
+
+  private:
+    template <typename Container>
+    std::string build_what_string_(Container const& missing_args)
+    {
+      std::ostringstream oss;
+      oss << "The following required arguments are missing: {";
+      for (auto const& x : missing_args)
+        oss << " \"" << x << "\"";
+      oss << " }";
+      return oss.str();
+    }
+  };
+
+  ///@}
+
+public:
+
+  /** @name Constructors */
+  ///@{
+
+  /** @brief Create the parser */
+  argument_parser();
+
+  ///@}
+  /** @name Adding options and arguments */
+  ///@{
+
+  /** @brief Add a flag (i.e. a boolean parameter that is "true" if
+   *         given and "false" if not given).
+   *
+   *  The value of a flag defaults to `false`. If, for some strange
+   *  reason, users should be forced to type the boolean value on
+   *  the command line, e.g., "my_exe -b 1", use add_option()
+   *  instead. If a flag with default value `true` is desired,
+   *  invert the logic and use this instead.
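+   *
+   *  A minimal usage sketch (mirroring the unit test below; `parser`,
+   *  `argc`, and `argv` are assumed to be in scope):
+   *  @code
+   *  auto verbose = parser.add_flag(
+   *    "verbose", {"-v", "--verbose"}, "print verbosely");
+   *  parser.parse(argc, argv);
+   *  if (verbose)
+   *    std::cout << "verbose mode enabled\n";
+   *  @endcode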
+   *
+   *  @param[in] name The name to be used to refer to the argument.
+   *  @param[in] cli_flags The valid command line flags to trigger
+   *             this flag to `true`. At least one must be given.
+   *  @param[in] description A brief description of the argument,
+   *             used for the help message.
+   *
+   *  @return A read-only reference to the value pointed to by this
+   *          flag.
+   */
+  readonly_reference<bool>
+  add_flag(std::string const& name,
+           std::initializer_list<std::string> cli_flags,
+           std::string const& description);
+
+  /** @brief Add a flag with environment variable override.
+   *
+   *  The value of a flag defaults to `false`. The flag may be set to
+   *  `true` by passing the flag on the command line. Alternatively,
+   *  it may be set to `true` if the environment variable `env` is
+   *  defined and has a value that converts to `true`.
+   *
+   *  @tparam AccessPolicy The access method for the environment
+   *          variable. (Deduced.)
+   *
+   *  @param[in] name The name to be used to refer to the argument.
+   *  @param[in] cli_flags The valid command line flags to trigger
+   *             this flag to `true`. At least one must be given.
+   *  @param[in] env The environment variable to prefer over the
+   *             default parameter value.
+   *  @param[in] description A brief description of the argument,
+   *             used for the help message.
+   *
+   *  @return A read-only reference to the value pointed to by this
+   *          flag.
+   */
+  template <typename AccessPolicy>
+  readonly_reference<bool>
+  add_flag(std::string const& name,
+           std::initializer_list<std::string> cli_flags,
+           EnvVariable<AccessPolicy> env,
+           std::string const& description)
+  {
+    if (env.exists() && env.template value<bool>())
+      return add_flag_impl_(name, std::move(cli_flags), description, true);
+    else
+      return add_flag(name, std::move(cli_flags), description);
+  }
+
+  /** @brief Add an additional named option.
+   *
+   *  Currently, named options are all optional. This could be
+   *  expanded if needed.
+   *
+   *  @tparam T The type associated with the option. Deduced if a
+   *          default value is given. If the default value is not
+   *          given, the template parameter must be named explicitly
+   *          and the default value will be default-constructed.
+   *
+   *  @param[in] name The name to be used to refer to the argument.
+   *  @param[in] cli_flags The valid command line flags to identify
+   *             this option and its value. At least one must be
+   *             given.
+   *  @param[in] description A brief description of the argument,
+   *             used for the help message.
+   *  @param[in] default_value The default value to be returned if
+   *             the option is not passed to the command line.
+   *
+   *  @return A read-only reference to the value pointed to by this
+   *          option.
+   */
+  template <typename T>
+  readonly_reference<T>
+  add_option(std::string const& name,
+             std::initializer_list<std::string> cli_flags,
+             std::string const& description,
+             T default_value = T());
+
+  /** @brief Add an additional named option.
+   *
+   *  Currently, named options are all optional. This could be
+   *  expanded if needed.
+   *
+   *  @tparam T The type associated with the option. Deduced if a
+   *          default value is given. If the default value is not
+   *          given, the template parameter must be named explicitly
+   *          and the default value will be default-constructed.
+   *  @tparam AccessPolicy The access method for the environment
+   *          variable. (Deduced.)
+   *
+   *  @param[in] name The name to be used to refer to the argument.
+   *  @param[in] cli_flags The valid command line flags to identify
+   *             this option and its value. At least one must be
+   *             given.
+   *  @param[in] env The environment variable to prefer over the
+   *             default parameter value.
+   *  @param[in] description A brief description of the argument,
+   *             used for the help message.
+   *  @param[in] default_value The default value to be returned if
+   *             the option is not passed to the command line.
+   *
+   *  @return A read-only reference to the value pointed to by this
+   *          option.
+   */
+  template <typename T, typename AccessPolicy>
+  readonly_reference<T>
+  add_option(std::string const& name,
+             std::initializer_list<std::string> cli_flags,
+             EnvVariable<AccessPolicy> env,
+             std::string const& description,
+             T default_value = T())
+  {
+    if (env.exists())
+      return add_option(name, std::move(cli_flags), description,
+                        env.template value<T>());
+    else
+      return add_option(name, std::move(cli_flags), description,
+                        std::move(default_value));
+  }
+
+  /** @brief Add an additional named option; overloaded for "char
+   *         const*" parameters.
+   *
+   *  The value will be stored as an `std::string`. Its value must
+   *  be extracted using `get<std::string>(name)`.
+   *
+   *  @param[in] name The name to be used to refer to the argument.
+   *  @param[in] cli_flags The valid command line flags to identify
+   *             this option and its value. At least one must be given.
+   *  @param[in] description A brief description of the argument,
+   *             used for the help message.
+   *  @param[in] default_value The default value to be returned if
+   *             the option is not passed to the command line.
+   *
+   *  @return A read-only reference to the value pointed to by this
+   *          option.
+   */
+  readonly_reference<std::string>
+  add_option(std::string const& name,
+             std::initializer_list<std::string> cli_flags,
+             std::string const& description,
+             char const* default_value)
+  {
+    return add_option(name, std::move(cli_flags), description,
+                      std::string(default_value));
+  }
+
+  /** @brief Add an additional named option; overloaded for "char
+   *         const*" parameters.
+   *
+   *  The value will be stored as an `std::string`. Its value must
+   *  be extracted using `get<std::string>(name)`.
+   *
+   *  @param[in] name The name to be used to refer to the argument.
+   *  @param[in] cli_flags The valid command line flags to identify
+   *             this option and its value. At least one must be given.
+   *  @param[in] env The environment variable to prefer over the
+   *             default parameter value.
+   *  @param[in] description A brief description of the argument,
+   *             used for the help message.
+   *  @param[in] default_value The default value to be returned if
+   *             the option is not passed to the command line.
+   *
+   *  @return A read-only reference to the value pointed to by this
+   *          option.
+   */
+  template <typename AccessPolicy>
+  readonly_reference<std::string>
+  add_option(std::string const& name,
+             std::initializer_list<std::string> cli_flags,
+             EnvVariable<AccessPolicy> env,
+             std::string const& description,
+             char const* default_value)
+  {
+    return add_option(name, cli_flags, std::move(env),
+                      description, std::string(default_value));
+  }
+
+  /** @brief Add an optional positional argument.
+   *
+   *  These are essentially defaulted positional arguments. They must
+   *  be given on the command line in the order in which they are
+   *  added to the parser. If the arguments have all been added by the
+   *  time the help message is produced, the help message will display
+   *  the correct ordering.
+   *
+   *  @tparam T The type to which the argument maps.
+   *
+   *  @param[in] name The name to be used to refer to the argument.
+   *  @param[in] description A brief description of the argument,
+   *             used for the help message.
+   *  @param[in] default_value The value to use for this argument if
+   *             not detected in the formal argument list.
+   *
+   *  @return A read-only reference to the value pointed to by this
+   *          argument.
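+   *
+   *  An illustrative sketch of the ordering rules described above (the
+   *  names "A" and "B" are hypothetical):
+   *  @code
+   *  auto a = parser.add_argument("A", "an optional argument", 0);
+   *  auto b = parser.add_required_argument<int>("B", "a required argument");
+   *  // On the command line, "B" must precede "A":  a.out 7 42
+   *  @endcode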
+   */
+  template <typename T>
+  readonly_reference<T> add_argument(
+    std::string const& name,
+    std::string const& description,
+    T default_value = T());
+
+  /** @brief Add a positional argument; char const* overload
+   *
+   *  The data is stored in an std::string object internally and
+   *  must be accessed using `get<std::string>(name)`.
+   *
+   *  @param[in] name The name to be used to refer to the argument.
+   *  @param[in] description A brief description of the argument,
+   *             used for the help message.
+   *  @param[in] default_value The value to use for this argument if
+   *             not detected in the formal argument list.
+   *
+   *  @return A read-only reference to the value pointed to by this
+   *          argument.
+   */
+  readonly_reference<std::string> add_argument(
+    std::string const& name,
+    std::string const& description,
+    char const* default_value)
+  {
+    return add_argument(
+      name, description, std::string(default_value));
+  }
+
+  /** @brief Add a "required" positional argument.
+   *
+   *  @tparam T The type to which the argument maps.
+   *
+   *  @param[in] name The name to be used to refer to the argument.
+   *  @param[in] description A brief description of the argument,
+   *             used for the help message.
+   *
+   *  @return A read-only reference to the value pointed to by this
+   *          argument.
+   */
+  template <typename T>
+  readonly_reference<T> add_required_argument(
+    std::string const& name,
+    std::string const& description);
+
+  ///@}
+  /** @name Command-line-like parsing */
+  ///@{
+
+  /** @brief Parse the command line arguments and finalize the
+   *         arguments.
+   *
+   *  This is equivalent to calling parse_no_finalize() followed
+   *  immediately by finalize().
+   *
+   *  @param[in] argc The number of arguments
+   *  @param[in] argv The list of arguments
+   *
+   *  @throws parse_error if an internal parsing error is detected.
+   */
+  void parse(int argc, char const* const argv[]);
+
+  /** @brief Parse the command line arguments but do not finalize
+   *         the parser.
+   *
+   *  This parses command-line-like arguments but does no checks for
+   *  required arguments. Users should call finalize() before
+   *  attempting to use the values associated with any required
+   *  arguments.
+   *
+   *  @param[in] argc The number of arguments
+   *  @param[in] argv The list of arguments
+   *
+   *  @throws parse_error if an internal parsing error is detected.
+   */
+  void parse_no_finalize(int argc, char const* const argv[]);
+
+  /** @brief Assert that all required components are set properly.
+   *
+   *  This should be called sometime after parse_no_finalize() and
+   *  before using the values. This is implicitly called by parse().
+   *
+   *  @throws missing_required_arguments If a missing argument is
+   *          detected.
+   */
+  void finalize() const;
+
+  ///@}
+  /** @name Queries */
+  ///@{
+
+  /** @brief Get the executable name.
+   *
+   *  This is only meaningful after calling either parse() or
+   *  parse_no_finalize().
+   *
+   *  @return The name of the executable.
+   */
+  std::string const& get_exe_name() const noexcept;
+
+  /** @brief Test if an option exists in the parser.
+   *
+   *  This only tests whether the argument or option is known to the
+   *  parser, not whether it has been set or modified by the parser.
+   *
+   *  @param[in] option_name The name of the option/argument.
+   */
+  bool option_is_defined(std::string const& option_name) const;
+
+  /** @brief Test if help has been requested. */
+  bool help_requested() const;
+
+  /** @brief Get the requested value from the argument list.
+   *  @tparam T The type of the requested parameter.
+   *  @param option_name The name given to the option or argument.
+   *  @return A const-reference to the held value.
+   */
+  template <typename T>
+  T const& get(std::string const& option_name) const;
+
+  ///@}
+  /** @name Output */
+  ///@{
+
+  /** @brief Print a help string to a stream.
+   *  @param[in] stream The ostream to print the help message to.
+   */
+  void print_help(std::ostream& stream) const;
+
+  ///@}
+
+private:
+
+  /** @brief Implementation of add_flag */
+  readonly_reference<bool>
+  add_flag_impl_(std::string const& name,
+                 std::initializer_list<std::string> cli_flags,
+                 std::string const& description,
+                 bool default_value);
+
+private:
+  /** @brief Dictionary of arguments to their values */
+  std::unordered_map<std::string, utils::any> params_;
+  /** @brief Patch around in-progress clara limitation */
+  std::unordered_set<std::string> required_;
+  /** @brief The underlying clara object */
+  clara::Parser parser_;
+  /** @brief The name of the executable. */
+  std::string exe_name_ = "";
+
+};
+
+inline bool
+argument_parser::option_is_defined(std::string const& option_name) const
+{
+  return params_.count(option_name);
+}
+
+template <typename T>
+inline T const& argument_parser::get(std::string const& option_name) const
+{
+  return utils::any_cast<T const&>(params_.at(option_name));
+}
+
+template <typename T>
+inline auto argument_parser::add_option(
+  std::string const& name,
+  std::initializer_list<std::string> cli_flags,
+  std::string const& description,
+  T default_value)
+  -> readonly_reference<T>
+{
+  params_[name] = std::move(default_value);
+  auto& param_ref = any_cast<T&>(params_[name]);
+  clara::Opt option(param_ref, name);
+  for (auto const& f : cli_flags)
+    option[f];
+  parser_ |= option(description).optional();
+  return param_ref;
+}
+
+template <typename T>
+inline auto argument_parser::add_argument(
+  std::string const& name,
+  std::string const& description,
+  T default_value)
+  -> readonly_reference<T>
+{
+  params_[name] = std::move(default_value);
+  auto& param_ref = utils::any_cast<T&>(params_[name]);
+  parser_ |= clara::Arg
+    (param_ref, name)
+    (description).optional();
+  return param_ref;
+}
+
+template <typename T>
+inline auto argument_parser::add_required_argument(
+  std::string const& name,
+  std::string const& description)
+  -> readonly_reference<T>
+{
+  // Add the reference to bind to
+  params_[name] = T{};
+  auto& param_any = params_[name];
+  auto& param_ref = any_cast<T&>(param_any);
+
+  required_.insert(name);
+
+  // Make sure the required arguments are all grouped together.
+  auto iter = parser_.m_args.cbegin(), invalid = parser_.m_args.cend();
+  while (iter != invalid && !iter->isOptional())
+    ++iter;
+
+  // Create the argument
+  auto ret = parser_.m_args.emplace(
+    iter,
+    [name,&param_ref,this](std::string const& value)
+    {
+      auto result = clara::detail::convertInto(value, param_ref);
+      if (result)
+        required_.erase(name);
+      return result;
+    },
+    name);
+  ret->operator() (description).required();
+  return param_ref;
+}
+}// namespace utils
+
+utils::argument_parser& global_argument_parser();
+
+}// namespace lbann
+
+/** @brief Write the parser's help string to the given @c ostream */
+std::ostream& operator<<(std::ostream&, lbann::utils::argument_parser const&);
+
+#endif /* LBANN_UTILS_ARGUMENT_PARSER_HPP_INCLUDED */
diff --git a/include/lbann/utils/environment_variable.hpp b/include/lbann/utils/environment_variable.hpp
new file mode 100644
index 00000000000..95dd09de2f6
--- /dev/null
+++ b/include/lbann/utils/environment_variable.hpp
@@ -0,0 +1,145 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_UTILS_ENVIRONMENT_VARIABLE_HPP_INCLUDED
+#define LBANN_UTILS_ENVIRONMENT_VARIABLE_HPP_INCLUDED
+
+#include "lbann/utils/from_string.hpp"
+
+#include <string>
+
+namespace lbann
+{
+namespace utils
+{
+
+/** @brief Access environment variables using getenv. */
+class GetEnvAccessor
+{
+public:
+  std::string get(std::string const& var_name) const;
+};
+
+/** @brief An environment variable
+ *
+ *  Values are acquired lazily. The only maintained state is the name.
+ */
+template <typename AccessPolicy = GetEnvAccessor>
+class EnvVariable
+{
+public:
+
+  /** @name Constructors */
+  ///@{
+
+  /** @brief Construct from a string. */
+  EnvVariable(std::string const& var_name);
+
+  /** @brief Construct from a temporary string. */
+  EnvVariable(std::string&& var_name);
+
+  ///@}
+  /** @name Queries */
+  ///@{
+
+  /** @brief Test if the variable exists in the environment.
+   *
+   *  Existence means set to a nonempty string.
+   */
+  bool exists() const;
+
+  ///@}
+  /** @name Accessors */
+  ///@{
+
+  /** @brief Get the name of the environment variable. */
+  std::string const& name() const noexcept;
+
+  /** @brief Get the string value of the environment variable. */
+  std::string raw_value() const;
+
+  /** @brief Get the value of the environment variable as a certain type. */
+  template <typename T>
+  T value() const;
+
+  ///@}
+
+private:
+  /** @brief The name of the variable. */
+  std::string name_;
+};
+
+/** @brief Convenience typedef */
+using ENV = EnvVariable<>;
+
+// Implementation
+
+template <typename AccessPolicy>
+inline
+EnvVariable<AccessPolicy>::
+EnvVariable(std::string const& var_name)
+  : name_{var_name}
+{}
+
+template <typename AccessPolicy>
+inline
+EnvVariable<AccessPolicy>::
+EnvVariable(std::string&& var_name)
+  : name_{std::move(var_name)}
+{}
+
+template <typename AccessPolicy>
+inline bool
+EnvVariable<AccessPolicy>::exists() const
+{
+  return raw_value().size() > 0;
+}
+
+template <typename AccessPolicy>
+inline std::string const&
+EnvVariable<AccessPolicy>::name() const noexcept
+{
+  return name_;
+}
+
+template <typename AccessPolicy>
+inline std::string
+EnvVariable<AccessPolicy>::raw_value() const
+{
+  AccessPolicy access;
+  return access.get(name_);
+}
+
+template <typename AccessPolicy>
+template <typename T>
+T EnvVariable<AccessPolicy>::value() const
+{
+  return from_string<T>(raw_value());
+}
+
+}// namespace utils
+}// namespace lbann
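+
+// A hedged usage sketch (illustrative only; the variable name is
+// hypothetical, not one LBANN defines):
+//
+//   utils::ENV max_io("MAX_IO_THREADS");
+//   if (max_io.exists()) {
+//     int const n = max_io.value<int>(); // read and converted lazily, here
+//     /* ... */
+//   }
+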
+#endif /* LBANN_UTILS_ENVIRONMENT_VARIABLE_HPP_INCLUDED */
diff --git a/include/lbann/utils/from_string.hpp b/include/lbann/utils/from_string.hpp
new file mode 100644
index 00000000000..9a7c21e0d52
--- /dev/null
+++ b/include/lbann/utils/from_string.hpp
@@ -0,0 +1,132 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_UTILS_FROM_STRING_INCLUDED
+#define LBANN_UTILS_FROM_STRING_INCLUDED
+
+#include <algorithm>
+#include <cctype>
+#include <string>
+
+namespace lbann {
+namespace utils {
+
+/** @brief An exceedingly simple implementation of, e.g.,
+ *         boost::lexical_cast.
+ *
+ *  Generally, these implementations prefer `sto*` function calls to
+ *  the stream method because stream operators do not provide
+ *  straight-forward error feedback.
+ *
+ *  @tparam T The type to cast to.
+ *
+ *  @param str The input string.
+ *
+ *  @return The value of the input string as a T.
+ *
+ *  @todo chars, shorts, unsigned. Bool needs some work.
+ */
+template <typename T>
+T from_string(std::string const& str);
+
+inline std::string from_string(std::string&& str)
+{
+  return std::move(str);
+}
+
+template <>
+inline std::string from_string<std::string>(std::string const& str)
+{
+  return str;
+}
+
+template <>
+inline int from_string<int>(std::string const& str)
+{
+  return std::stoi(str);
+}
+
+template <>
+inline long from_string<long>(std::string const& str)
+{
+  return std::stol(str);
+}
+
+template <>
+inline long long from_string<long long>(std::string const& str)
+{
+  return std::stoll(str);
+}
+
+template <>
+inline unsigned long from_string<unsigned long>(std::string const& str)
+{
+  return std::stoul(str);
+}
+
+template <>
+inline unsigned long long from_string<unsigned long long>(std::string const& str)
+{
+  return std::stoull(str);
+}
+
+template <>
+inline float from_string<float>(std::string const& str)
+{
+  return std::stof(str);
+}
+
+template <>
+inline double from_string<double>(std::string const& str)
+{
+  return std::stod(str);
+}
+
+template <>
+inline long double from_string<long double>(std::string const& str)
+{
+  return std::stold(str);
+}
+
+template <>
+inline bool from_string<bool>(std::string const& str)
+{
+  auto upcase = [](std::string s) {
+    std::transform(s.begin(), s.end(), s.begin(),
+                   [](unsigned char c)
+                   { return std::toupper(c); });
+    return s;
+  };
+  auto upper = upcase(str);
+  if (upper == "TRUE")
+    return true;
+  else if (upper == "FALSE")
+    return false;
+  else
+    return from_string<int>(str);
+}
+
+}// namespace utils
+}// namespace lbann
+#endif // LBANN_UTILS_FROM_STRING_INCLUDED
diff --git a/scripts/build_lbann_lc.sh b/scripts/build_lbann_lc.sh
index 21dc0147578..4eccaf045d4 100755
--- a/scripts/build_lbann_lc.sh
+++ b/scripts/build_lbann_lc.sh
@@ -765,6 +765,7 @@ cmake \
 -D CMAKE_INSTALL_MESSAGE=${CMAKE_INSTALL_MESSAGE} \
 -D CMAKE_INSTALL_PREFIX=${INSTALL_DIR} \
 -D LBANN_SB_BUILD_CEREAL=ON \
+-D LBANN_SB_BUILD_CLARA=ON \
 -D LBANN_SB_BUILD_CNPY=ON \
 -D LBANN_SB_BUILD_HYDROGEN=ON \
 -D LBANN_SB_FWD_HYDROGEN_Hydrogen_ENABLE_CUDA=${WITH_CUDA} \
diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt
index 7531ba58a38..38e7a9f9b61 100644
--- a/src/utils/CMakeLists.txt
+++ b/src/utils/CMakeLists.txt
@@ -1,9 +1,11 @@
 # Add the source files for this directory
 set_full_path(THIS_DIR_SOURCES
+  argument_parser.cpp
   cnpy_utils.cpp
   cublas.cpp
   cudnn.cpp
   description.cpp
+  environment_variable.cpp
   exception.cpp
   file_utils.cpp
   graph.cpp
diff --git a/src/utils/argument_parser.cpp b/src/utils/argument_parser.cpp
new file mode 100644
index 00000000000..3436abd0511
--- /dev/null
+++ b/src/utils/argument_parser.cpp
@@ -0,0 +1,129 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/utils/argument_parser.hpp"
+#include "lbann/utils/any.hpp"
+#include "lbann/utils/exception.hpp"
+
+#include <clara.hpp>
+
+#include <iostream>
+#include <string>
+
+namespace lbann
+{
+namespace utils
+{
+
+argument_parser::argument_parser()
+{
+  params_["print help"] = false;
+  parser_ |= clara::ExeName(exe_name_);
+  parser_ |= clara::Help(utils::any_cast<bool&>(params_["print help"]));
+
+  // Work around a bug in Clara logic
+  parser_.m_exeName.set(exe_name_);
+}
+
+void argument_parser::parse(int argc, char const* const argv[])
+{
+  parse_no_finalize(argc, argv);
+  finalize();
+}
+
+void argument_parser::parse_no_finalize(int argc, char const* const argv[])
+{
+  auto parse_result = parser_.parse(clara::Args(argc, argv));
+  if (!parse_result)
+    throw parse_error(
+      lbann::build_string(
+        "Arguments could not be parsed.\n\nMessage: ",
+        parse_result.errorMessage()));
+}
+
+void argument_parser::finalize() const
+{
+  if (!help_requested() && required_.size())
+    throw missing_required_arguments(required_);
+}
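+
+// A hedged sketch of the two-phase flow (illustrative only; not part of
+// this file):
+//
+//   auto& parser = lbann::global_argument_parser();
+//   parser.parse_no_finalize(argc, argv);
+//   /* ... register any remaining required arguments ... */
+//   parser.finalize(); // throws missing_required_arguments if any are unmet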
+
+auto argument_parser::add_flag(
+  std::string const& name,
+  std::initializer_list<std::string> cli_flags,
+  std::string const& description)
+  -> readonly_reference<bool>
+{
+  return add_flag_impl_(name, std::move(cli_flags), description, false);
+}
+
+std::string const& argument_parser::get_exe_name() const noexcept
+{
+  return exe_name_;
+}
+
+bool argument_parser::help_requested() const
+{
+  return utils::any_cast<bool>(params_.at("print help"));
+}
+
+void argument_parser::print_help(std::ostream& out) const
+{
+  out << parser_ << std::endl;
+}
+
+auto argument_parser::add_flag_impl_(
+  std::string const& name,
+  std::initializer_list<std::string> cli_flags,
+  std::string const& description,
+  bool default_value)
+  -> readonly_reference<bool>
+{
+  params_[name] = default_value;
+  auto& param_ref = any_cast<bool&>(params_[name]);
+  clara::Opt option(param_ref);
+  for (auto const& f : cli_flags)
+    option[f];
+  parser_ |= option(description).optional();
+  return param_ref;
+}
+
+}// namespace utils
+
+utils::argument_parser& global_argument_parser()
+{
+  static utils::argument_parser args;
+  return args;
+}
+
+}// namespace lbann
+
+std::ostream& operator<<(std::ostream& os,
+                         lbann::utils::argument_parser const& parser)
+{
+  parser.print_help(os);
+  return os;
+}
diff --git a/src/utils/environment_variable.cpp b/src/utils/environment_variable.cpp
new file mode 100644
index 00000000000..d58853ff335
--- /dev/null
+++ b/src/utils/environment_variable.cpp
@@ -0,0 +1,48 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+#include "lbann/utils/environment_variable.hpp"
+
+#include <cstdlib>
+
+namespace lbann
+{
+namespace utils
+{
+
+std::string GetEnvAccessor::get(std::string const& var_name) const
+{
+#if _GNU_SOURCE
+  // If GNU, secure_getenv might be better??
+  char const* env = secure_getenv(var_name.c_str());
+#else
+  char const* env = std::getenv(var_name.c_str());
+#endif // _GNU_SOURCE
+  return std::string(env ? env : "");
+}
+
+}// namespace utils
+}// namespace lbann
diff --git a/src/utils/unit_test/CMakeLists.txt b/src/utils/unit_test/CMakeLists.txt
index 78043ef5ca1..813b1cbdfd8 100644
--- a/src/utils/unit_test/CMakeLists.txt
+++ b/src/utils/unit_test/CMakeLists.txt
@@ -1,12 +1,18 @@
 set_full_path(_DIR_LBANN_CATCH2_TEST_FILES
   any_test.cpp
+  argument_parser_test.cpp
   beta_distribution_test.cpp
+  environment_variable_test.cpp
   factory_test.cpp
+  from_string_test.cpp
   hash_test.cpp
   image_test.cpp
   python_test.cpp
   random_test.cpp
   type_erased_matrix_test.cpp
+
+  stubs/preset_env_accessor.hpp
+  stubs/preset_env_accessor.cpp
   )
 
 set(LBANN_CATCH2_TEST_FILES
diff --git a/src/utils/unit_test/argument_parser_test.cpp b/src/utils/unit_test/argument_parser_test.cpp
new file mode 100644
index 00000000000..f12ddddbaa9
--- /dev/null
+++ b/src/utils/unit_test/argument_parser_test.cpp
@@ -0,0 +1,555 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+#include <catch2/catch.hpp>
+
+#include "lbann/utils/argument_parser.hpp"
+
+#include "lbann/utils/environment_variable.hpp"
+
+#include "stubs/preset_env_accessor.hpp"
+
+SCENARIO ("Testing the argument parser", "[parser][utilities]")
+{
+  GIVEN ("An argument parser")
+  {
+    lbann::utils::argument_parser parser;
+    WHEN ("The default arguments are passed")
+    {
+      int const argc = 1;
+      char const* argv[] = { "argument_parser_test.exe" };
+      THEN ("The parser recognizes the executable name")
+      {
+        REQUIRE_NOTHROW(parser.parse(argc, argv));
+        REQUIRE(
+          parser.get_exe_name() == "argument_parser_test.exe");
+      }
+    }
+    WHEN ("The short help flag is passed")
+    {
+      int const argc = 2;
+      char const* argv[] = {"argument_parser_test.exe", "-h"};
+      THEN ("The parser notes that help has been requested.")
+      {
+        REQUIRE_FALSE(parser.help_requested());
+        REQUIRE_NOTHROW(parser.parse(argc, argv));
+        REQUIRE(parser.help_requested());
+      }
+    }
+    WHEN ("The long help flag is passed")
+    {
+      int const argc = 2;
+      char const* argv[argc] = {"argument_parser_test.exe", "--help"};
+      THEN ("The parser notes that help has been requested.")
+      {
+        REQUIRE_NOTHROW(parser.parse(argc, argv));
+        REQUIRE(parser.help_requested());
+      }
+    }
+    WHEN ("A boolean flag is added")
+    {
+      auto verbose =
+        parser.add_flag(
+          "verbose", {"-v", "--verbose"}, "print verbosely");
+      THEN ("The flag's option name is known")
+      {
+        REQUIRE(parser.option_is_defined("verbose"));
+        REQUIRE_FALSE(verbose);
+      }
+      AND_WHEN("The flag is passed")
+      {
+        int const argc = 2;
+        char const* argv[]
+          = {"argument_parser_test.exe", "--verbose"};
+        REQUIRE_FALSE(parser.get<bool>("verbose"));
+        THEN ("The verbose flag is registered")
+        {
+          REQUIRE_NOTHROW(parser.parse(argc, argv));
+          REQUIRE(parser.get<bool>("verbose"));
+          REQUIRE(verbose);
+        }
+      }
+    }
+    WHEN ("An option is added")
+    {
+      auto num_threads =
+        parser.add_option("number of threads", {"-t", "--num_threads"},
+                          "The number of threads to use in this test.", 1);
+      THEN ("The option is registered with the parser.")
+      {
+        REQUIRE(parser.option_is_defined("number of threads"));
+        REQUIRE(parser.template get<int>("number of threads") == 1);
+        REQUIRE(num_threads == 1);
+      }
+      AND_WHEN ("The short option is passed on the command line")
+      {
+        int const argc = 3;
+        char const* argv[] = {"argument_parser_test.exe", "-t", "9"};
+        THEN ("The new value is registered.")
+        {
+          REQUIRE_NOTHROW(parser.parse(argc, argv));
+          REQUIRE(
+            parser.template get<int>("number of threads") == 9);
+          REQUIRE(num_threads == 9);
+        }
+      }
+      AND_WHEN ("The long option is passed on the command line")
+      {
+        int const argc = 3;
+        char const* argv[]
+          = {"argument_parser_test.exe", "--num_threads", "13"};
+        THEN ("The new value is registered.")
+        {
+          REQUIRE_NOTHROW(parser.parse(argc, argv));
+          REQUIRE(
+            parser.template get<int>("number of threads") == 13);
+          REQUIRE(num_threads == 13);
+        }
+      }
+    }
+    WHEN ("A string-valued option is added")
+    {
+      auto name =
+        parser.add_option("my name", {"-n", "--name", "--my_name"},
+                          "The name to use in this test.",
+                          "");
+      THEN ("The option is registered with the parser.")
+      {
+        REQUIRE(parser.option_is_defined("my name"));
+        REQUIRE(parser.template get<std::string>("my name")
+                == "");
+      }
+      AND_WHEN ("The short option is passed on the command line")
+      {
+        int const argc = 3;
+        char const* argv[]
+          = {"argument_parser_test.exe", "-n", "Banana Joe"};
+        THEN ("The new value is registered.")
+        {
+          REQUIRE_NOTHROW(parser.parse(argc, argv));
+          REQUIRE(
+            parser.template get<std::string>("my name")
+            == "Banana Joe");
+          REQUIRE(name == "Banana Joe");
+        }
+      }
+      AND_WHEN ("The first long option is passed on the command line")
+      {
+        int const argc = 3;
+        char const* argv[]
+          = {"argument_parser_test.exe", "--name", "Plantain Pete"};
+        THEN ("The new value is registered.")
+        {
+          REQUIRE_NOTHROW(parser.parse(argc, argv));
+          REQUIRE(
+            parser.template get<std::string>("my name")
+            == "Plantain Pete");
+          REQUIRE(name == "Plantain Pete");
+        }
+      }
+      AND_WHEN ("The second long option is passed on the command line")
+      {
+        int const argc = 3;
+        char const* argv[]
+          = {"argument_parser_test.exe", "--my_name",
+             "Jackfruit Jill"};
+        THEN ("The new value is registered.")
+        {
+          REQUIRE_NOTHROW(parser.parse(argc, argv));
+          REQUIRE(
+            parser.template get<std::string>("my name")
+            == "Jackfruit Jill");
+          REQUIRE(name == "Jackfruit Jill");
+        }
+      }
+    }
+
+    WHEN ("A required argument is added")
+    {
+      auto required_int =
+        parser.add_required_argument<int>(
+          "required", "This argument is required.");
+      THEN ("The option is recognized")
+      {
+        REQUIRE(parser.option_is_defined("required"));
+      }
+      AND_WHEN("The option is not passed in the arguments")
+      {
+        int const argc = 1;
+        char const* argv[argc] = {"argument_parser_test.exe"};
+
+        THEN ("Finalization fails.")
+        {
+          parser.parse_no_finalize(argc,argv);
+          REQUIRE_THROWS_AS(
+            parser.finalize(),
+            lbann::utils::argument_parser::missing_required_arguments);
+        }
+      }
+      AND_WHEN("The option is passed in the arguments")
+      {
+        int const argc = 2;
+        char const* argv[argc] = {"argument_parser_test.exe","13"};
+
+        THEN ("Parsing is successful and the value is updated.")
+        {
+          REQUIRE_NOTHROW(parser.parse(argc, argv));
+          REQUIRE(required_int == 13);
+        }
+      }
+      AND_WHEN("Another option is added and passed in the arguments")
+      {
+        auto required_string =
+          parser.add_required_argument<std::string>(
+            "required string", "This argument is also required.");
+
+        int const argc = 3;
+        char const* argv[argc] = {"argument_parser_test.exe","13","bananas"};
+
+        THEN ("Parsing is successful and the values are updated.")
+        {
+          REQUIRE_NOTHROW(parser.parse(argc, argv));
+          REQUIRE(required_int == 13);
+          REQUIRE(required_string == "bananas");
+        }
+      }
+    }
+
+    WHEN ("An optional argument is added")
+    {
+      auto optional_int =
+        parser.add_argument(
+          "optional", "This argument is optional.", -1);
+      THEN ("The option is recognized")
+      {
+        REQUIRE(parser.option_is_defined("optional"));
+        REQUIRE(parser.template get<int>("optional") == -1);
+        REQUIRE(optional_int == -1);
+      }
+      AND_WHEN("The option is not passed in the arguments")
+      {
+        int const argc = 1;
+        char const* argv[argc] = {"argument_parser_test.exe"};
+
+        THEN ("Parsing succeeds with no update to the value.")
+        {
+          REQUIRE_NOTHROW(parser.parse(argc,argv));
get("optional") == -1); + REQUIRE(optional_int == -1); + } + } + AND_WHEN("The option is passed in the arguments") + { + int const argc = 2; + char const* argv[argc] = {"argument_parser_test.exe","13"}; + + THEN ("Parsing is successful and the value is updated.") + { + REQUIRE_NOTHROW(parser.parse(argc,argv)); + REQUIRE(parser.template get("optional") == 13); + REQUIRE(optional_int == 13); + } + } + AND_WHEN("Another argument is added and passed in the arguments") + { + auto optional_string = + parser.add_argument( + "optional string", "This argument is also optional.", + "pickles"); + + int const argc = 3; + char const* argv[argc] = {"argument_parser_test.exe","42","bananas"}; + + THEN ("Parsing is successful and the values are updated.") + { + REQUIRE(optional_int == -1); + REQUIRE(optional_string == "pickles"); + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(optional_int == 42); + REQUIRE(optional_string == "bananas"); + } + } + AND_WHEN("A required argument is added and passed in the arguments") + { + auto required_string = + parser.add_required_argument( + "required string", "This argument is required."); + + AND_WHEN("The arguments are passed in the add order") + { + int const argc = 3; + char const* argv[argc] = { + "argument_parser_test.exe","42","bananas"}; + THEN ("Parsing fails because required must come first") + { + REQUIRE_THROWS(parser.parse(argc,argv)); + REQUIRE(required_string == "42"); + } + } + AND_WHEN("The arguments are passed in the right order") + { + int const argc = 3; + char const* argv[argc] = { + "argument_parser_test.exe","bananas","42"}; + THEN ("The arguments must be reversed") + { + REQUIRE(optional_int == -1); + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(optional_int == 42); + REQUIRE(required_string == "bananas"); + } + } + } + } + + WHEN ("A flag with env variable override is added") + { + using namespace lbann::utils::stubs; + using TestENV = lbann::utils::EnvVariable; + + auto verbose = + parser.add_flag("verbose", {"-v"}, + TestENV("VALUE_IS_TRUE"), ""); + + THEN("The flag registers as true.") + { + REQUIRE(parser.option_is_defined("verbose")); + REQUIRE(verbose); + } + + AND_WHEN ("The flag is passed on the command line") + { + int const argc = 2; + char const* argv[] + = {"argument_parser_test.exe", "-v"}; + + THEN ("The verbose flag is registered") + { + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(parser.get("verbose")); + REQUIRE(verbose); + } + } + } + + WHEN ("A flag with false-valued env variable override is added") + { + using namespace lbann::utils::stubs; + using TestENV = lbann::utils::EnvVariable; + + auto verbose = + parser.add_flag("verbose", {"-v"}, + TestENV("VALUE_IS_FALSE"), ""); + + THEN("The flag registers as false.") + { + REQUIRE(parser.option_is_defined("verbose")); + REQUIRE_FALSE(verbose); + } + + AND_WHEN ("The flag is passed on the command line") + { + int const argc = 2; + char const* argv[] + = {"argument_parser_test.exe", "-v"}; + + THEN ("The verbose flag is registered") + { + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(parser.get("verbose")); + REQUIRE(verbose); + } + } + } + + WHEN ("A flag with false-valued env variable override is added") + { + using namespace lbann::utils::stubs; + using TestENV = lbann::utils::EnvVariable; + + auto verbose = + parser.add_flag("verbose", {"-v"}, + TestENV("VALUE_IS_UNDEFINED"), ""); + + THEN("The flag registers as false.") + { + REQUIRE(parser.option_is_defined("verbose")); + REQUIRE_FALSE(verbose); + } + + AND_WHEN ("The flag is passed on the 
command line") + { + int const argc = 2; + char const* argv[] + = {"argument_parser_test.exe", "-v"}; + + THEN ("The verbose flag is registered") + { + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(parser.get("verbose")); + REQUIRE(verbose); + } + } + } + + WHEN ("A defined environment varible is added") + { + using namespace lbann::utils::stubs; + using TestENV = lbann::utils::EnvVariable; + + parser.add_option( + "apple", {"-a"}, TestENV("APPLE"), + "Apple pie tastes good.", 1.23); + + REQUIRE(parser.option_is_defined("apple")); + + AND_WHEN("The option is not passed in the arguments") + { + int const argc = 1; + char const* argv[argc] = {"argument_parser_test.exe"}; + + THEN("The option has the value defined in the environment") + { + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(parser.template get("apple") == 3.14); + } + } + + AND_WHEN("The option is passed in the arguments") + { + int const argc = 3; + char const* argv[argc] = {"argument_parser_test.exe", "-a", "5.0"}; + THEN("The option has the value defined in command line args") + { + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(parser.template get("apple") == 5.0); + } + } + } + + WHEN ("An undefined environment varible is added") + { + using namespace lbann::utils::stubs; + using TestENV = lbann::utils::EnvVariable; + + parser.add_option( + "platypus", {"-p"}, TestENV("DOESNT_EXIST"), + "This variable won't exist.", 1.23); + + REQUIRE(parser.option_is_defined("platypus")); + + AND_WHEN("The option is not passed in the arguments") + { + int const argc = 1; + char const* argv[argc] = {"argument_parser_test.exe"}; + + THEN("The option has the default value") + { + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(parser.template get("platypus") == 1.23); + } + } + AND_WHEN("The option is passed in the arguments") + { + int const argc = 3; + char const* argv[argc] = {"argument_parser_test.exe", "-p", "2.0"}; + THEN("The option has the value defined in the command line args") + { + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(parser.template get("platypus") == 2.0); + } + } + } + + WHEN ("A defined string environment varible is added") + { + using namespace lbann::utils::stubs; + using TestENV = lbann::utils::EnvVariable; + + parser.add_option( + "pizza", {"-p"}, TestENV("PIZZA"), + "Mmmm pizza.", "mushroom"); + + REQUIRE(parser.option_is_defined("pizza")); + + AND_WHEN("The option is not passed in the arguments") + { + int const argc = 1; + char const* argv[argc] = {"argument_parser_test.exe"}; + + THEN("The option has the value defined in the environment") + { + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(parser.template get("pizza") == "pepperoni"); + } + } + + AND_WHEN("The option is passed in the arguments") + { + int const argc = 3; + char const* argv[argc] = {"argument_parser_test.exe", "-p", "hawaiian"}; + THEN("The option has the value defined in the command line args") + { + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(parser.template get("pizza") == "hawaiian"); + } + } + } + + WHEN ("An undefined environment varible is added to a string option") + { + using namespace lbann::utils::stubs; + using TestENV = lbann::utils::EnvVariable; + + parser.add_option( + "platypus", {"-p"}, TestENV("DOESNT_EXIST"), + "This variable won't exist.", "so cute"); + + REQUIRE(parser.option_is_defined("platypus")); + + AND_WHEN("The option is not passed in the arguments") + { + int const argc = 1; + char const* argv[argc] = {"argument_parser_test.exe"}; + + THEN("The option has the default 
value") + { + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(parser.template get("platypus") == "so cute"); + } + } + AND_WHEN("The option is passed in the arguments") + { + int const argc = 3; + char const* argv[argc] = {"argument_parser_test.exe", "-p", "llama"}; + THEN("The option has the value defined in the command line args") + { + REQUIRE_NOTHROW(parser.parse(argc, argv)); + REQUIRE(parser.template get("platypus") == "llama"); + } + } + } + } +} diff --git a/src/utils/unit_test/environment_variable_test.cpp b/src/utils/unit_test/environment_variable_test.cpp new file mode 100644 index 00000000000..d38ae0df529 --- /dev/null +++ b/src/utils/unit_test/environment_variable_test.cpp @@ -0,0 +1,169 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include + +#include "lbann/utils/environment_variable.hpp" + +#include "stubs/preset_env_accessor.hpp" + +using namespace lbann::utils; + +TEST_CASE("Environment variable wrapper", "[utilities][parser]") +{ + using TestENV = EnvVariable; + + SECTION("A floating point variable") + { + TestENV apple("APPLE"); + + CHECK(apple.exists()); + CHECK(apple.name() == "APPLE"); + CHECK(apple.raw_value() == "3.14"); + + // This class is (purposefully) not as rigorously typed as, say, + // the type-erased "any". Since conversion is done on-the-fly from + // a string, there's less need for strong typing. 
+    CHECK(apple.value<float>() == 3.14f);
+    CHECK(apple.value<double>() == 3.14);
+    CHECK(apple.value<int>() == 3);
+
+    // Environment variables should always be convertible to strings
+    CHECK(apple.value<std::string>() == apple.raw_value());
+  }
+
+  SECTION("An integer variable")
+  {
+    TestENV scoops("ICE_CREAM_SCOOPS");
+
+    CHECK(scoops.exists());
+    CHECK(scoops.name() == "ICE_CREAM_SCOOPS");
+    CHECK(scoops.raw_value() == "3");
+
+    CHECK(scoops.value<float>() == 3.f);
+    CHECK(scoops.value<double>() == 3.);
+    CHECK(scoops.value<int>() == 3);
+    CHECK(scoops.value<std::string>() == scoops.raw_value());
+  }
+
+  SECTION("A string variable")
+  {
+    TestENV pizza("PIZZA");
+    CHECK(pizza.exists());
+    CHECK(pizza.name() == "PIZZA");
+    CHECK(pizza.raw_value() == "pepperoni");
+    CHECK(pizza.value<std::string>() == pizza.raw_value());
+
+    CHECK_THROWS_AS(pizza.value<float>() == 123.f, std::invalid_argument);
+    CHECK_THROWS_AS(pizza.value<double>() == 321., std::invalid_argument);
+    CHECK_THROWS_AS(pizza.value<int>() == 42, std::invalid_argument);
+  }
+
+  SECTION("Boolean variables")
+  {
+    SECTION("Variable stored as the string \"true\"")
+    {
+      TestENV true_str_var("VALUE_IS_TRUE");
+
+      CHECK(true_str_var.exists());
+      CHECK(true_str_var.name() == "VALUE_IS_TRUE");
+      CHECK(true_str_var.raw_value() == "true");
+      CHECK(true_str_var.value<std::string>() == true_str_var.raw_value());
+
+      CHECK(true_str_var.value<bool>());
+
+      CHECK_THROWS_AS(true_str_var.value<float>() == 123.f,
+                      std::invalid_argument);
+      CHECK_THROWS_AS(true_str_var.value<double>() == 321.,
+                      std::invalid_argument);
+      CHECK_THROWS_AS(true_str_var.value<int>() == 42, std::invalid_argument);
+    }
+
+    SECTION("Variable stored as a \"1\"")
+    {
+      TestENV true_int_var("VALUE_IS_ONE");
+
+      CHECK(true_int_var.exists());
+      CHECK(true_int_var.name() == "VALUE_IS_ONE");
+      CHECK(true_int_var.raw_value() == "1");
+      CHECK(true_int_var.value<std::string>() == true_int_var.raw_value());
+      CHECK(true_int_var.value<bool>());
+    }
+
+    SECTION("Variable stored as the string \"false\"")
+    {
+      TestENV false_str_var("VALUE_IS_FALSE");
+
+      CHECK(false_str_var.exists());
+      CHECK(false_str_var.name() == "VALUE_IS_FALSE");
+      CHECK(false_str_var.raw_value() == "false");
+      CHECK(false_str_var.value<std::string>() == false_str_var.raw_value());
+
+      CHECK_FALSE(false_str_var.value<bool>());
+
+      CHECK_THROWS_AS(false_str_var.value<float>() == 123.f,
+                      std::invalid_argument);
+      CHECK_THROWS_AS(false_str_var.value<double>() == 321.,
+                      std::invalid_argument);
+      CHECK_THROWS_AS(false_str_var.value<int>() == 42, std::invalid_argument);
+    }
+
+    SECTION("Variable stored as a \"0\"")
+    {
+      TestENV false_int_var("VALUE_IS_ZERO");
+
+      CHECK(false_int_var.exists());
+      CHECK(false_int_var.name() == "VALUE_IS_ZERO");
+      CHECK(false_int_var.raw_value() == "0");
+      CHECK(false_int_var.value<std::string>() == false_int_var.raw_value());
+
+      CHECK_FALSE(false_int_var.value<bool>());
+    }
+
+    SECTION("Variable has a value not convertible to bool")
+    {
+      TestENV not_a_bool("PIZZA");
+      CHECK_THROWS_AS(not_a_bool.value<bool>(),
+                      std::invalid_argument);
+    }
+  }
+
+  SECTION("A variable that doesn't exist")
+  {
+    TestENV bad("DOESNT_EXIST");
+
+    CHECK_FALSE(bad.exists());
+
+    CHECK(bad.name() == "DOESNT_EXIST");
+    CHECK(bad.raw_value() == "");
+    CHECK(bad.value<std::string>() == bad.raw_value());
+
+    CHECK_THROWS_AS(bad.value<float>() == 123.f, std::invalid_argument);
+    CHECK_THROWS_AS(bad.value<double>() == 321., std::invalid_argument);
+    CHECK_THROWS_AS(bad.value<int>() == 42, std::invalid_argument);
+    CHECK_THROWS_AS(bad.value<bool>(), std::invalid_argument);
+  }
+}
diff --git a/src/utils/unit_test/from_string_test.cpp b/src/utils/unit_test/from_string_test.cpp
new file mode 100644
index 00000000000..063a7e1e199
--- /dev/null
+++ b/src/utils/unit_test/from_string_test.cpp
@@ -0,0 +1,141 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+////////////////////////////////////////////////////////////////////////////////
+
+// MUST include this
+#include <catch2/catch.hpp>
+
+// File being tested
+#include <lbann/utils/from_string.hpp>
+
+namespace
+{
+
+template <typename T> T PositiveAnswer() noexcept;
+template <typename T> T NegativeAnswer() noexcept;
+
+template <> int PositiveAnswer<int>() noexcept { return 123; }
+template <> int NegativeAnswer<int>() noexcept { return -456; }
+template <> long PositiveAnswer<long>() noexcept { return 123L; }
+template <> long NegativeAnswer<long>() noexcept { return -456L; }
+template <> long long PositiveAnswer<long long>() noexcept { return 123LL; }
+template <> long long NegativeAnswer<long long>() noexcept { return -456LL; }
+
+template <> unsigned long PositiveAnswer<unsigned long>() noexcept
+{
+  return 9876543210UL;
+}
+template <> unsigned long NegativeAnswer<unsigned long>() noexcept
+{
+  return static_cast<unsigned long>(-1);
+}
+template <> unsigned long long PositiveAnswer<unsigned long long>() noexcept
+{
+  return 9876543210ULL;
+}
+template <> unsigned long long NegativeAnswer<unsigned long long>() noexcept
+{
+  return static_cast<unsigned long long>(-1);
+}
+
+template <> float PositiveAnswer<float>() noexcept { return 9.87f; }
+template <> float NegativeAnswer<float>() noexcept { return -6.54f; }
+template <> double PositiveAnswer<double>() noexcept { return 9.87; }
+template <> double NegativeAnswer<double>() noexcept { return -6.54; }
+template <> long double PositiveAnswer<long double>() noexcept { return 9.87l; }
+template <> long double NegativeAnswer<long double>() noexcept { return -6.54l; }
+
+}// namespace
+
+using lbann::utils::from_string;
+
+TEST_CASE("From string corner cases","[utilities][string]")
+{
+  SECTION("Boolean strings")
+  {
+    CHECK(from_string<bool>("true"));
+    CHECK(from_string<bool>("TRUE"));
+    CHECK(from_string<bool>("tRuE"));
+    CHECK(from_string<bool>("TrUe"));
+    CHECK(from_string<bool>("1"));
+    CHECK(from_string<bool>("431"));
+    CHECK(from_string<bool>("3.14"));
+
+    CHECK_FALSE(from_string<bool>("false"));
+    CHECK_FALSE(from_string<bool>("FALSE"));
+    CHECK_FALSE(from_string<bool>("FaLsE"));
+    CHECK_FALSE(from_string<bool>("0"));
+    CHECK_FALSE(from_string<bool>("0.0"));
+
+    // FIXME: This should be true:
+    //CHECK(from_string<bool>("0.2"));
+
+    CHECK_THROWS_AS(from_string<bool>("not a bool"), std::invalid_argument);
+  }
+
+  SECTION("From lvalue string to string")
+  {
+    std::string input("I am a string");
+    REQUIRE(from_string<std::string>(input) == input);
+    REQUIRE(from_string<std::string>(input) == "I am a string");
+  }
+
+  SECTION("From rvalue string to string")
+  {
+    REQUIRE(from_string("I'm a string") == "I'm a string");
+  }
+
+  SECTION("Exceptional cases")
+  {
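+    // 9876543210 does not fit in a 32-bit int, so std::stoi is expected
+    // to report the overflow by throwing std::out_of_range.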
REQUIRE_THROWS_AS(from_string("9876543210"), std::out_of_range); + } +} + +TEMPLATE_TEST_CASE("From string to floating point type", + "[utilities][string]", + float, double, long double) +{ + REQUIRE_THROWS_AS(from_string("pineapple"), std::invalid_argument); + REQUIRE(from_string("9.87") == PositiveAnswer()); + REQUIRE(from_string("-6.54") == NegativeAnswer()); +} + +TEMPLATE_TEST_CASE("From string to signed integer type", + "[utilities][string]", + int, long, long long) +{ + REQUIRE_THROWS_AS(from_string("pineapple"), std::invalid_argument); + REQUIRE(from_string("123") == PositiveAnswer()); + REQUIRE(from_string("-456") == NegativeAnswer()); +} + +TEMPLATE_TEST_CASE("From string to unsigned integer type", + "[utilities][string]", + unsigned long, unsigned long long) +{ + REQUIRE_THROWS_AS(from_string("pineapple"), std::invalid_argument); + REQUIRE(from_string("9876543210") == PositiveAnswer()); + REQUIRE(from_string("-1") == NegativeAnswer()); +} diff --git a/src/utils/unit_test/stubs/preset_env_accessor.cpp b/src/utils/unit_test/stubs/preset_env_accessor.cpp new file mode 100644 index 00000000000..166cf6be7dd --- /dev/null +++ b/src/utils/unit_test/stubs/preset_env_accessor.cpp @@ -0,0 +1,11 @@ +#include "preset_env_accessor.hpp" + +namespace lbann { +namespace utils { +namespace stubs { + +std::unordered_map PresetEnvAccessor::vars_; + +}// namespace stubs +}// namespace utils +}// namespace lbann diff --git a/src/utils/unit_test/stubs/preset_env_accessor.hpp b/src/utils/unit_test/stubs/preset_env_accessor.hpp new file mode 100644 index 00000000000..db212119061 --- /dev/null +++ b/src/utils/unit_test/stubs/preset_env_accessor.hpp @@ -0,0 +1,75 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_UTILS_STUBS_PRESET_ENV_ACCESSOR_HPP_INCLUDED
+#define LBANN_UTILS_STUBS_PRESET_ENV_ACCESSOR_HPP_INCLUDED
+
+#include <string>
+#include <unordered_map>
+
+namespace lbann {
+namespace utils {
+namespace stubs {
+
+class PresetEnvAccessor
+{
+public:
+  std::string get(std::string const&) const;
+private:
+  static void populate_vars();
+private:
+  static std::unordered_map<std::string, std::string> vars_;
+};
+
+inline std::string PresetEnvAccessor::get(std::string const& var_name) const
+{
+  if (vars_.size() == 0UL) populate_vars();
+
+  auto it = vars_.find(var_name);
+  if (it == vars_.end())
+    return "";
+
+  return it->second;
+}
+
+inline void PresetEnvAccessor::populate_vars()
+{
+  vars_ = {
+    {"APPLE", "3.14"},           // float
+    {"ICE_CREAM_SCOOPS", "3"},   // int
+    {"PIZZA", "pepperoni"},      // string
+    {"VALUE_IS_TRUE", "true"},   // true as string
+    {"VALUE_IS_ONE", "1"},       // true as int
+    {"VALUE_IS_FALSE", "false"}, // false as string
+    {"VALUE_IS_ZERO", "0"},      // false as int
+  };
+}
+
+}// namespace stubs
+}// namespace utils
+}// namespace lbann
+
+#endif /* LBANN_UTILS_STUBS_PRESET_ENV_ACCESSOR_HPP_INCLUDED */
diff --git a/superbuild/CMakeLists.txt b/superbuild/CMakeLists.txt
index 91fc56f3eb5..be303911401 100644
--- a/superbuild/CMakeLists.txt
+++ b/superbuild/CMakeLists.txt
@@ -39,6 +39,8 @@
 option(LBANN_SB_BUILD_CATCH2 "Pull and install CATCH2 library from Github" OFF)
 
 option(LBANN_SB_BUILD_CEREAL "Pull and install CEREAL library from Github" OFF)
 
+option(LBANN_SB_BUILD_CLARA "Pull and install Clara library from Github" OFF)
+
 option(LBANN_SB_BUILD_CNPY "Pull and build CNPY from Github" OFF)
 
 option(LBANN_SB_BUILD_CONDUIT "Pull and build CONDUIT from Github" OFF)
@@ -74,6 +76,11 @@ if (LBANN_SB_BUILD_CATCH2)
   list(APPEND _BUILD_PKGS CATCH2)
 endif ()
 
+if (LBANN_SB_BUILD_CLARA)
+  add_subdirectory(clara)
+  list(APPEND _BUILD_PKGS CLARA)
+endif ()
+
 if (LBANN_SB_BUILD_CEREAL)
   add_subdirectory(cereal)
   list(APPEND _BUILD_PKGS CEREAL)
diff --git a/superbuild/clara/CMakeLists.txt b/superbuild/clara/CMakeLists.txt
new file mode 100644
index 00000000000..a0b5bc95966
--- /dev/null
+++ b/superbuild/clara/CMakeLists.txt
@@ -0,0 +1,74 @@
+# Use CLARA_URL to specify the location of the git repo. Use
+# CLARA_TAG to specify the commit.
+
+enable_language(CXX)
+
+# Handle the clone mechanism. First URL
+option(CLARA_CLONE_VIA_SSH
+  "Clone CLARA using SSH instead of HTTPS" ${LBANN_SB_CLONE_VIA_SSH})
+
+if (CLARA_CLONE_VIA_SSH)
+  set(CLARA_URL git@github.com:catchorg/clara.git
+    CACHE STRING "The URL from which to clone CLARA")
+else ()
+  set(CLARA_URL "https://github.com/catchorg/clara.git"
+    CACHE STRING "The URL from which to clone CLARA")
+endif ()
+
+# ... then the tag.
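+# (Any tag, branch name, or commit hash that git can check out is accepted.)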
+set(CLARA_TAG "v1.1.5" + CACHE STRING "The git tag or hash to checkout for CLARA") + +# Where to install CLARA +set(CLARA_CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}" + CACHE PATH "The installation location of CLARA.") + +# The build type for CLARA +set(CLARA_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" + CACHE STRING "The build type for CLARA.") + +if (CLARA_CUSTOM_SOURCE_DIR) + set(CLARA_SOURCE_DIR "${CLARA_CUSTOM_SOURCE_DIR}") + set(CLARA_URL "") + set(CLARA_TAG "") + set(_GIT_REPOSITORY_TAG) + set(_GIT_TAG_TAG) + message(STATUS "Using CLARA source in: ${CLARA_SOURCE_DIR}") +else () + set(CLARA_SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") + set(_GIT_REPOSITORY_TAG "GIT_REPOSITORY") + set(_GIT_TAG_TAG "GIT_TAG") +endif () + +set(CLARA_INSTALL_DIR ${CLARA_CMAKE_INSTALL_PREFIX}/include) + +# Now add the external project +include(ExternalProject) +ExternalProject_Add(CLARA + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + TMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp + STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/stamp + ${_GIT_REPOSITORY_TAG} ${CLARA_URL} + ${_GIT_TAG_TAG} ${CLARA_TAG} + SOURCE_DIR ${CLARA_SOURCE_DIR} + BUILD_IN_SOURCE 1 + INSTALL_DIR ${CLARA_CMAKE_INSTALL_PREFIX} + USES_TERMINAL_BUILD 1 + LOG_DOWNLOAD 1 + LOG_UPDATE 1 + LOG_CONFIGURE 1 + LOG_BUILD 1 + LOG_INSTALL 1 + LOG_TEST 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E make_directory ${CLARA_INSTALL_DIR} + COMMAND + ${CMAKE_COMMAND} -E copy + ${CLARA_SOURCE_DIR}/single_include/clara.hpp + ${CLARA_INSTALL_DIR} + ) + +set(CLARA_DIR ${CLARA_CMAKE_INSTALL_PREFIX} + CACHE INTERNAL "The install prefix of CLARA.") From d2060b12703bafaacbf36576a10f246aa227d0f5 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 8 Nov 2019 10:45:20 -0800 Subject: [PATCH 391/634] Create NLP application directory (#1338) * Implement Python class to preprocess Project Gutenberg books * Switch NLP app to use Hugging Face tokenizer * Add simple RNN for text data * Tweak documentation for NLP app --- applications/nlp/README.md | 14 +++ applications/nlp/data/.gitignore | 2 + applications/nlp/rnn/dataset.py | 25 +++++ applications/nlp/rnn/main.py | 114 ++++++++++++++++++++++ applications/nlp/utils/__init__.py | 17 ++++ applications/nlp/utils/gutenberg.py | 146 ++++++++++++++++++++++++++++ 6 files changed, 318 insertions(+) create mode 100644 applications/nlp/README.md create mode 100644 applications/nlp/data/.gitignore create mode 100644 applications/nlp/rnn/dataset.py create mode 100644 applications/nlp/rnn/main.py create mode 100644 applications/nlp/utils/__init__.py create mode 100644 applications/nlp/utils/gutenberg.py diff --git a/applications/nlp/README.md b/applications/nlp/README.md new file mode 100644 index 00000000000..b0dd9a0b2b0 --- /dev/null +++ b/applications/nlp/README.md @@ -0,0 +1,14 @@ +# Example models for natural language processing + +This directory contains LBANN experiments with text data, with the +goal of developing and optimizing NLP functionality. It will +eventually contain reference implementations of widely-used NLP +models. + +## Dependencies + +- Transformers: NLP library for TensorFlow and PyTorch. 
Install with: + +```bash +pip3 install transformers +``` diff --git a/applications/nlp/data/.gitignore b/applications/nlp/data/.gitignore new file mode 100644 index 00000000000..d6b7ef32c84 --- /dev/null +++ b/applications/nlp/data/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/applications/nlp/rnn/dataset.py b/applications/nlp/rnn/dataset.py new file mode 100644 index 00000000000..66fb7c0fd88 --- /dev/null +++ b/applications/nlp/rnn/dataset.py @@ -0,0 +1,25 @@ +import os.path +import sys + +# Local imports +current_file = os.path.realpath(__file__) +root_dir = os.path.dirname(os.path.dirname(current_file)) +sys.path.append(root_dir) +import utils.gutenberg + +# Options +text_name = 'frankenstein' +sequence_length = 10 + +# Download and tokenize text data, if needed +data_url = utils.gutenberg.get_url(text_name) +data_dir = os.path.join(root_dir, 'data', text_name) +corpus = utils.gutenberg.GutenbergCorpus(data_dir, data_url) + +# Sample access functions +def get_sample(index): + return corpus[index:index+sequence_length] +def num_samples(): + return len(corpus) - sequence_length + 1 +def sample_dims(): + return (sequence_length,) diff --git a/applications/nlp/rnn/main.py b/applications/nlp/rnn/main.py new file mode 100644 index 00000000000..85d57349e02 --- /dev/null +++ b/applications/nlp/rnn/main.py @@ -0,0 +1,114 @@ +"""Simple recurrent network on tokenized text data.""" +import argparse +import os.path +import sys + +import lbann +import lbann.modules +import lbann.contrib.lc.launcher +import lbann.contrib.args + +# Local imports +current_dir = os.path.dirname(os.path.realpath(__file__)) +root_dir = os.path.dirname(current_dir) +sys.path.append(root_dir) +import dataset +from utils import str_list + +# ---------------------------------- +# Options +# ---------------------------------- + +# Command-line arguments +parser = argparse.ArgumentParser() +lbann.contrib.args.add_scheduler_arguments(parser) +parser.add_argument( + '--job-name', action='store', default='lbann_textrnn', type=str, + help='job name', metavar='NAME') +parser.add_argument( + '--mini-batch-size', action='store', default=256, type=int, + help='mini-batch size (default: 256)', metavar='NUM') +parser.add_argument( + '--num-epochs', action='store', default=20, type=int, + help='number of epochs (default: 20)', metavar='NUM') +parser.add_argument( + '--latent-dim', action='store', default=128, type=int, + help='latent space dimensions (default: 128)', metavar='NUM') +args = parser.parse_args() + +# ---------------------------------- +# Construct layer graph +# ---------------------------------- + +# Dataset properties +vocab_size = dataset.corpus.vocab_size +sequence_length = dataset.sample_dims()[0] + +# Input is a sequence of token IDs +input_ = lbann.Identity(lbann.Input(), device='cpu') +input_slice = lbann.Slice(input_, + slice_points=str_list(range(sequence_length+1)), + device='cpu') +tokens = [] +for _ in range(sequence_length): + tokens.append(lbann.Identity(input_slice, device='cpu')) + +# Layer modules +lstm = lbann.modules.LSTMCell(args.latent_dim) +lstm_state = [lbann.Constant(value=0, num_neurons=str_list(args.latent_dim)), + lbann.Constant(value=0, num_neurons=str_list(args.latent_dim))] +pred_fc = lbann.modules.FullyConnectedModule(vocab_size, + data_layout='model_parallel') + +# Iterate through RNN steps +loss = [] +for step in range(sequence_length-1): + + # Predict next token with RNN + x = lbann.Embedding(tokens[step], + num_embeddings=vocab_size, + embedding_dim=args.latent_dim, + 
device='cpu')
+    x, lstm_state = lstm(x, lstm_state)
+    x = pred_fc(x)
+    pred = lbann.Softmax(x)
+
+    # Evaluate prediction with cross entropy
+    ground_truth = lbann.OneHot(tokens[step+1], size=vocab_size)
+    cross_entropy = lbann.CrossEntropy([pred, ground_truth])
+    loss.append(lbann.LayerTerm(cross_entropy, scale=1/(sequence_length-1)))
+
+# ----------------------------------
+# Create data reader
+# ----------------------------------
+
+reader = lbann.reader_pb2.DataReader()
+_reader = reader.reader.add()
+_reader.name = 'python'
+_reader.role = 'train'
+_reader.percent_of_data_to_use = 1.0
+_reader.python.module = 'dataset'
+_reader.python.module_dir = current_dir
+_reader.python.sample_function = 'get_sample'
+_reader.python.num_samples_function = 'num_samples'
+_reader.python.sample_dims_function = 'sample_dims'
+
+# ----------------------------------
+# Run LBANN
+# ----------------------------------
+
+# Create LBANN objects
+trainer = lbann.Trainer()
+model = lbann.Model(args.mini_batch_size,
+                    args.num_epochs,
+                    layers=lbann.traverse_layer_graph(input_),
+                    objective_function=loss,
+                    callbacks=[lbann.CallbackPrint(),
+                               lbann.CallbackTimer()])
+opt = lbann.SGD(learn_rate=0.01, momentum=0.9)
+
+# Run LBANN
+kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
+lbann.contrib.lc.launcher.run(trainer, model, reader, opt,
+                              job_name=args.job_name,
+                              **kwargs)
diff --git a/applications/nlp/utils/__init__.py b/applications/nlp/utils/__init__.py
new file mode 100644
index 00000000000..29d6d9d3e7f
--- /dev/null
+++ b/applications/nlp/utils/__init__.py
@@ -0,0 +1,17 @@
+import collections.abc
+
+def make_iterable(obj):
+    """Convert to an iterable object.
+
+    Simply returns `obj` if it is already iterable. Otherwise returns a
+    1-tuple containing `obj`.
+
+    """
+    if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str):
+        return obj
+    else:
+        return (obj,)
+
+def str_list(it):
+    """Convert an iterable object to a space-separated string."""
+    return ' '.join([str(i) for i in make_iterable(it)])
diff --git a/applications/nlp/utils/gutenberg.py b/applications/nlp/utils/gutenberg.py
new file mode 100644
index 00000000000..fa6e7e0086c
--- /dev/null
+++ b/applications/nlp/utils/gutenberg.py
@@ -0,0 +1,146 @@
+"""Helper functions for text data from Project Gutenberg."""
+import array
+import os
+import os.path
+import re
+import urllib.request
+import numpy as np
+
+
+def get_url(name):
+    """URL to Project Gutenberg text file."""
+    urls = {
+        'frankenstein': 'https://www.gutenberg.org/files/84/84-0.txt',
+        'shakespeare': 'https://www.gutenberg.org/files/100/100-0.txt',
+    }
+    return urls[name.lower()]
+
+
+def strip_boilerplate(raw_file, stripped_file):
+    """Remove header and footer from Project Gutenberg text file.
+
+    See:
+
+    https://www.gutenberg.org/wiki/Gutenberg:Project_Gutenberg_Header_How-To
+
+    Args:
+        raw_file (str): Text file downloaded from Project Gutenberg.
+        stripped_file (str): Path where the stripped file will be
+            saved.
+ + """ + with open(raw_file, 'r') as in_file, \ + open(stripped_file, 'w') as out_file: + started = False + begin_regex = re.compile('^\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*$') + end_regex = re.compile('^\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*$') + for line in in_file: + if started: + if end_regex.match(line): + break + else: + out_file.write(line) + elif begin_regex.match(line): + started = True + + +def tokenize(text_file, + encoded_file=None, + vocab_file=None, + ignore_whitespace=True): + """Convert text file to sequence of token IDs. + + Tokenization is performed with BERT tokenizer. + + Args: + text_file (str): Text file to be encoded. + encoded_file (str, optional): If provided, path where the + encoded data will be saved as an .npz file. The sequence of + token IDs is saved as 'encoded_data' and the vocabulary + size is saved as 'vocab_size'. + vocab_file (str, optional): If provided, path where the + vocabulary will be saved as a text file. + ignore_whitespace (bool, optional): Whether to ignore text + lines that are purely made of whitespace (default: True). + + Returns: + array of int: Sequence of token IDs. + int: Number of tokens in vocabulary. + + """ + + # Get BERT tokenizer from Transformers + import transformers + tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased') + vocab_size = tokenizer.vocab_size + if vocab_file: + tokenizer.save_vocabulary(vocab_file) + + # Apply tokenizer to text file + encoded_data = array.array('l') + with open(text_file) as f: + for line in f: + if ignore_whitespace and line.isspace(): + continue + encoded_data.extend(tokenizer.encode(line)) + if encoded_file: + np.savez_compressed(encoded_file, + encoded_data=encoded_data, + vocab_size=vocab_size) + return encoded_data, vocab_size + + +class GutenbergCorpus(): + """Tokenized text from Project Gutenberg. + + Args: + data_dir (str): Directory for downloading data and + intermediate. + data_url (str): URL to Project Gutenberg text file. + + Attributes: + token_data (array of int): Sequence of token IDs. + vocab_size (int): Number of tokens in vocabulary. + + """ + def __init__(self, data_dir, data_url): + + # Create data directory if needed + if not os.path.isdir(data_dir): + os.makedirs(data_dir) + data_dir = os.path.realpath(data_dir) + + # Load tokenized data + # Note: If needed, download the text data from Project + # Gutenberg and tokenize it. 
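+        # The corpus is materialized in three cached stages, and later
+        # runs reuse whichever stages already exist in data_dir:
+        #   raw.txt (downloaded) -> text_data.txt (boilerplate stripped)
+        #   -> token_data.npz (token IDs and vocabulary size)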
+        token_data_file = os.path.join(data_dir, 'token_data.npz')
+        if os.path.isfile(token_data_file):
+            data = np.load(token_data_file)
+            token_data = data['encoded_data']
+            vocab_size = data['vocab_size']
+        else:
+            text_data_file = os.path.join(data_dir, 'text_data.txt')
+            if not os.path.isfile(text_data_file):
+                raw_file = os.path.join(data_dir, 'raw.txt')
+                if not os.path.isfile(raw_file):
+                    urllib.request.urlretrieve(data_url,
+                                               filename=raw_file)
+                strip_boilerplate(raw_file, text_data_file)
+            vocab_file = os.path.join(data_dir, 'vocab.txt')
+            token_data, vocab_size = tokenize(text_data_file,
+                                              token_data_file,
+                                              vocab_file)
+
+        # Class members
+        self.token_data = token_data
+        self.vocab_size = vocab_size
+
+    def __iter__(self):
+        """Iterator through token IDs."""
+        return self.token_data.__iter__()
+    def __getitem__(self, key):
+        """Get token ID."""
+        return self.token_data.__getitem__(key)
+    def __len__(self):
+        """Get total number of tokens in corpus."""
+        return self.token_data.__len__()

From 6c9ead107bcd00bce9b67cb6473203a01e305198 Mon Sep 17 00:00:00 2001
From: davidHysom
Date: Tue, 12 Nov 2019 05:31:10 -0800
Subject: [PATCH 392/634] Data store cache (#1329)

* local cache mode is working again

* added public function: data_store_conduit::set_profile_msg(). This
  permits the data reader to print timing statistics to the profile
  file, instead of cout/cerr.

* Modified generic_data_reader to query the data store for its mode
  settings, instead of querying the options class.

* committing working version. Next, will change some method names.

* Method names for setting and querying the data store's modes are now:
    void set_is_preloading(bool)
    bool is_preloading()
    void set_is_explicitly_loading(bool)
    bool is_explicitly_loading()
    void set_loading_is_complete()
    bool is_fully_loaded()
  See notes in data_store_conduit.hpp for a (hopefully) complete
  explanation as to what each of these means and how they interact.
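  As an illustrative sketch (not code from this patch) of how the flags
  are intended to interact for a reader that loads explicitly:

      data_store_conduit ds(reader);
      ds.set_is_explicitly_loading(true);  // store is filled during epoch 1
      // ... first epoch runs; each rank caches the samples it owns ...
      ds.set_loading_is_complete();        // also turns both loading flags off
      assert(ds.is_fully_loaded());
      assert(!ds.is_preloading() && !ds.is_explicitly_loading());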
* final working version (I hope)

* Addressed concerns from Nikoli's review
---
 .../lbann/data_readers/data_reader_image.hpp |   3 +-
 .../lbann/data_store/data_store_conduit.hpp  | 199 +++--
 src/data_readers/data_reader.cpp             |  40 +-
 src/data_readers/data_reader_jag_conduit.cpp |   2 +-
 src/data_store/data_store_conduit.cpp        | 733 ++++++++++++------
 src/models/model.cpp                         |  10 +-
 6 files changed, 675 insertions(+), 312 deletions(-)

diff --git a/include/lbann/data_readers/data_reader_image.hpp b/include/lbann/data_readers/data_reader_image.hpp
index f4cf92a0e36..cde595e781e 100644
--- a/include/lbann/data_readers/data_reader_image.hpp
+++ b/include/lbann/data_readers/data_reader_image.hpp
@@ -96,6 +96,8 @@ class image_data_reader : public generic_data_reader {
 
   void do_preload_data_store() override;
 
+  void load_conduit_node_from_file(int data_id, conduit::Node &node);
+
 protected:
   void copy_members(const image_data_reader &rhs);
 
@@ -112,7 +114,6 @@ class image_data_reader : public generic_data_reader {
   int m_image_linearized_size; ///< linearized image size
   int m_num_labels; ///< number of labels
 
-  void load_conduit_node_from_file(int data_id, conduit::Node &node);
   bool load_conduit_nodes_from_file(const std::unordered_set<int> &data_ids);
 };
 
diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp
index 6e6635dd879..3d436475a0a 100644
--- a/include/lbann/data_store/data_store_conduit.hpp
+++ b/include/lbann/data_store/data_store_conduit.hpp
@@ -52,6 +52,13 @@ class data_store_conduit {
 
  public:
 
+  // need to quickly change from unordered_map to map for debugging
+  using map_ii_t = std::unordered_map<int, int>;
+  using map_is_t = std::unordered_map<int, size_t>;
+
+  // not currently used; will be in the future
+  using map_ss_t = std::unordered_map<std::string, std::string>;
+
   //! ctor
   data_store_conduit(generic_data_reader *reader);
 
@@ -69,7 +76,6 @@ class data_store_conduit {
   //! dtor
   ~data_store_conduit();
 
-  /// required when the copy ctor is used to construct a validation set
   void set_data_reader_ptr(generic_data_reader *reader);
 
   //! convenience handle
@@ -80,9 +86,10 @@
 
   void setup(int mini_batch_size);
 
+  // TODO FIXME
   void check_mem_capacity(lbann_comm *comm, const std::string sample_list_file, size_t stride, size_t offset);
 
-  /// returns the conduit node
+  /** @brief Returns the conduit Node associated with the data_id */
   const conduit::Node & get_conduit_node(int data_id) const;
 
   /// if 'already_have = true' then the passed 'node' was obtained by a call to
@@ -90,6 +97,7 @@
   void set_conduit_node(int data_id, conduit::Node &node, bool already_have = false);
 
   void set_preloaded_conduit_node(int data_id, const conduit::Node &node);
+  void spill_preloaded_conduit_node(int data_id, const conduit::Node &node);
 
   const conduit::Node & get_random_node() const;
 
@@ -99,13 +107,71 @@
   /// returns an empty node
   conduit::Node & get_empty_node(int data_id);
 
-  void set_is_preloaded();
+  //=================================================================
+  // methods for setting and querying the data store's mode
+  //=================================================================
+  /** @brief Returns true if preloading is turned on
+   *
+   * See notes in: is_explicitly_loading()
+   */
+  bool is_preloading() const { return m_preloading; }
+
+  /** @brief Returns true if explicit loading is turned on
+   *
+   * 'explicitly loading' means that the data that will be owned
+   * by each rank is passed into the data store during the first epoch.
+   * This is in contrast to preloading, in which the data is passed into
+   * the data store prior to the first epoch. Explicit loading and
+   * preloading are exclusive: at most one may be true; however, both will
+   * be set to false when all loading is complete.
+   */
+  bool is_explicitly_loading() const { return m_explicitly_loading; }
+
+  /** @brief Returns true if all loading has been completed
+   *
+   * See notes in: set_loading_is_complete()
+   */
+  bool is_fully_loaded() const;
+
+  /** @brief Returns true if running in local cache mode
+   *
+   * In local cache mode, each node contains a complete copy
+   * of the data set. This is stored in a shared memory segment,
+   * but part of the set may be spilled to disk if memory is
+   * insufficient. Local cache mode is activated via the cmd line
+   * flag: --data_store_cache
+   */
+  bool is_local_cache() const { return m_is_local_cache; }
+
+  /** @brief Turn preloading on or off */
+  void set_is_preloading(bool flag);
+
+  /** @brief Turn explicit loading on or off */
+  void set_is_explicitly_loading(bool flag);
+
+  /** @brief Marks the data_store as fully loaded
+   *
+   * Fully loaded means that each rank has all the data that it
+   * is intended to own. When not running in local cache mode, this
+   * occurs (1) at the conclusion of preloading, prior to the beginning of
+   * the first epoch, or (2) at the conclusion of the first epoch, if
+   * explicitly loading. When running in local cache mode, this occurs
+   * (1) at the conclusion of preload_local_cache(), which is called prior
+   * to the first epoch, or (2) at the conclusion of exchange_local_caches(),
+   * at the conclusion of the first epoch, if explicitly loading.
+   */
+  void set_loading_is_complete();
 
-  bool is_preloaded() { return m_preload; }
-  void set_explicit_loading(bool flag) { m_explicit_loading = flag; }
+  /** @brief Turns local cache mode on or off */
+  void set_is_local_cache(bool flag) { m_is_local_cache = flag; }
 
-  bool is_explicitly_loading() { return m_explicit_loading; }
+  /** @brief Check that explicit loading, preloading, and fully loaded flags are consistent */
+  void check_query_flags();
+
+  //=================================================================
+  // END methods for setting and querying the data store's mode
+  //=================================================================
 
   /// fills in m_owner, which maps index -> owning processor
   void build_preloaded_owner_map(const std::vector<int>& per_rank_list_sizes);
 
@@ -118,15 +184,6 @@
   /// with the index
   int get_index_owner(int idx);
 
-  /** @brief Returns "true" is running in local cache mode
-   *
-   * In local cache mode, each node contains a complete copy
-   * of the data set. This is stored in a shared memory segment,
-   * but part of the set may be spilled to disk if memory is
-   * insufficient. Local cache mode is activated via the cmd line
-   * flag: --data_store_cache
-   */
-  bool is_local_cache() const { return m_is_local_cache; }
 
   /** @brief Read the data set into memory
    *
@@ -164,7 +221,7 @@ class data_store_conduit {
    *
    * Profile logging is enabled on P_0 via the cmd line flag: --data_store_profile
    */
-  void flush_profile_file();
+  void flush_profile_file() const;
 
   /** @brief Writes object's state to file */
   void write_checkpoint(std::string dir_name);
 
@@ -172,8 +229,26 @@
   /** @brief Loads object's state from file */
   void load_checkpoint(std::string dir_name, generic_data_reader *reader = nullptr);
 
+  /** @brief Add text to the profiling file, if it's opened */
+  void set_profile_msg(std::string);
+
+  /** @brief Runs an internal test to ensure the locally cached conduit data is correct
+   *
+   * For use during development and testing. This test is activated via
+   * the cmd line flag: --data_store_test_cache. Output may be written to
+   * cout, and the profile and debug files (if they are opened).
+   * @param n is the maximum number of samples to test; set to -1 to test all
+   * @return true, if all samples read from file match those constructed from
+   * the local shared memory segment (aka, cache)
+   */
+  bool test_local_cache_imagenet(int n);
+
+  void test_imagenet_node(int sample_id, bool dereference = true);
+
 private :
 
+  bool m_run_checkpoint_test = false;
+
   /** @brief The number of samples that this processor owns */
   size_t m_my_num_indices = 0;
 
@@ -212,7 +287,7 @@
   int m_num_files_in_cur_spill_dir;
 
   /** @brief maps data_id to m_cur_spill_dir_integer. */
-  std::unordered_map<int, int> m_spilled_nodes;
+  map_ii_t m_spilled_nodes;
 
   /// used in set_conduit_node(...)
   std::mutex m_mutex;
 
@@ -271,11 +346,18 @@
   bool m_is_setup = false;
 
   /// set to true if data_store is preloaded
-  bool m_preload = false;
+  bool m_loading_is_complete = false;
+
+  /** @brief True, if we are in preload mode */
+  bool m_preloading = false;
 
-  /// set to true if data_store is being explicitly loaded
-  //VBE: please explain what this means!
-  bool m_explicit_loading = false;
+  /** @brief True, if we are in explicit loading mode
+   *
+   * There is some redundancy here: m_preloading and m_explicitly_loading
+   * cannot both be true, but both may be false. When m_loading_is_complete
+   * is true, both m_preloading and m_explicitly_loading should be false.
+   */
+  bool m_explicitly_loading = false;
 
   /// The size of the mini-batch that was used to calculate ownership
   /// of samples when building the owner map. This size has to be
@@ -304,28 +386,29 @@ private :
   int m_rank_in_world = -1; // -1 for debugging
   int m_np_in_trainer;
 
-  /** @brief Maps an index to the processor that owns the associated data
-   *
-   * Must be mutable since rhs.m_owner may be modified in copy_members,
-   * in which rhs is const.
*/ - mutable std::unordered_map m_data; + std::unordered_map m_data; + + /** @brief Contains the conduit nodes that are "owned" by this rank + * + * This differs from m_data in that this holds temporarily, + * during the first epoch, if we're running in local cache mode + * and explicitly loading + */ + std::unordered_map m_data_cache; /// Contains the list of data IDs that will be received std::vector m_recv_data_ids; - std::unordered_map m_recv_sample_sizes; + map_ii_t m_recv_sample_sizes; /// This vector contains Nodes that this processor needs for /// the current minibatch; this is filled in by exchange_data() @@ -340,8 +423,15 @@ private : std::vector m_outgoing_msg_sizes; std::vector m_incoming_msg_sizes; - /// for use when conduit Nodes have non-uniform size, e.g, imagenet - std::unordered_map m_sample_sizes; + /** @brief Maps a data_id to its image size + * + * Used when conduit Nodes have non-uniform size, e.g, imagenet; + * see: set_node_sizes_vary() + */ + map_is_t m_sample_sizes; + + /** @brief Maps a data_id to the image location in a shared memory segment */ + map_is_t m_image_offsets; /// maps processor id -> set of indices (whose associated samples) /// this proc needs to send. (formerly called "proc_to_indices); @@ -352,10 +442,6 @@ private : /// this proc needs to recv from others. (formerly called "needed") std::vector> m_indices_to_recv; - /// offset at which the raw image will be stored in a shared memory segment; - /// for use in local cache mode; maps data_id to offset - std::unordered_map m_image_offsets; - //========================================================================= // methods follow //========================================================================= @@ -383,27 +469,31 @@ private : void error_check_compacted_node(const conduit::Node &nd, int data_id); + /** @brief All ranks exchange their cached data */ + void exchange_local_caches(); + /// Currently only used for imagenet. 
On return, 'sizes' maps a sample_id to image size, and indices[p] contains the sample_ids that P_p owns /// for use in local cache mode - void get_image_sizes(std::unordered_map &sizes, std::vector> &indices); - - /// fills in m_image_offsets for use in local cache mode - void compute_image_offsets(std::unordered_map &sizes, std::vector> &indices); + void get_image_sizes(map_is_t &sizes, std::vector> &indices); /// for use in local cache mode - void allocate_shared_segment(std::unordered_map &sizes, std::vector> &indices); + void allocate_shared_segment(map_is_t &sizes, std::vector> &indices); /// for use in local cache mode - void read_files(std::vector &work, std::unordered_map &sizes, std::vector &indices); + void read_files(std::vector &work, map_is_t &sizes, std::vector &indices); + + /// fills in m_image_offsets for use in local cache mode + void compute_image_offsets(map_is_t &image_sizes, std::vector> &indices); /// for use in local cache mode - void build_conduit_nodes(std::unordered_map &sizes); + void exchange_images(std::vector &work, map_is_t &image_sizes, std::vector> &indices); /// for use in local cache mode - void exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices); + void build_conduit_nodes(map_is_t &sizes); + /// for use in local cache mode - void fillin_shared_images(const std::vector &images, size_t offset); + void fillin_shared_images(char* images, size_t size, size_t offset); /** @brief For testing during development * @@ -441,7 +531,7 @@ private : * This method is called for both --data_store_spill and * --data_store_test_checkpoint */ - void setup_spill(const std::string &dir); + void setup_spill(std::string dir); /** @brief Saves this object's state to file * @@ -462,11 +552,18 @@ private : /** @brief Creates a directory for spilling conduit nodes */ void open_next_conduit_spill_directory(); + /** @brief Write timing data for data exchange to the profile file, if it's opened */ + void profile_timing(); + + void setup_checkpoint_test(); + + std::string get_lassen_spill_dir(); + //========================================================================= // functions and templates for optional profiling and debug files follow //========================================================================= - void PROFILE() { + void PROFILE() const { if (!m_profile) { return; } @@ -475,7 +572,7 @@ private : } template - void PROFILE(T var1, Types... var2) { + void PROFILE(T var1, Types... var2) const { if (!m_world_master) { return; } diff --git a/src/data_readers/data_reader.cpp b/src/data_readers/data_reader.cpp index 41a77988a60..bebbbb3e85a 100644 --- a/src/data_readers/data_reader.cpp +++ b/src/data_readers/data_reader.cpp @@ -707,15 +707,11 @@ void generic_data_reader::instantiate_data_store() { m_data_store->set_node_sizes_vary(); } - //a call to m_data_store->check_mem_capacity(...) 
should go here, but - //at the moment that depends on the sample_list class, which it shouldn't - //TODO: revisit - m_data_store->set_shuffled_indices(&m_shuffled_indices); - if (is_master()) { - std::cout << "generic_data_reader::instantiate_data_store time: : " << (get_time() - tm1) << std::endl; - } + std::stringstream s; + s << "generic_data_reader::instantiate_data_store time: : " << (get_time() - tm1); + m_data_store->set_profile_msg(s.str()); } void generic_data_reader::setup_data_store(int mini_batch_size) { @@ -723,28 +719,25 @@ void generic_data_reader::setup_data_store(int mini_batch_size) { LBANN_ERROR("m_data_store == nullptr; you shouldn't be here"); } // optionally preload the data store - options *opts = options::get(); - - if (opts->get_bool("preload_data_store") || opts->get_bool("data_store_cache")) { - if(is_master()) { - std::cerr << "generic_data_reader::instantiate_data_store - Starting the preload" << std::endl; - } + if (m_data_store->is_preloading()) { + m_data_store->set_profile_msg("generic_data_reader::instantiate_data_store - Starting the preload"); double tm2 = get_time(); preload_data_store(); - if(is_master()) { - std::cout << "Preload complete; time: " << get_time() - tm2 << std::endl; - } + std::stringstream s; + s << "Preload complete; time: " << get_time() - tm2; + m_data_store->set_profile_msg(s.str()); size_t n = m_data_store->get_num_global_indices(); if (n != m_shuffled_indices.size()) { LBANN_ERROR("num samples loaded: ", n, " != shuffled-indices.size(): ", m_shuffled_indices.size()); } } + m_data_store->setup(mini_batch_size); } bool generic_data_reader::data_store_active() const { - if (m_data_store != nullptr && m_data_store->is_preloaded()) { + if (m_data_store != nullptr && m_data_store->is_fully_loaded()) { return true; } @@ -760,7 +753,7 @@ bool generic_data_reader::data_store_active() const { bool generic_data_reader::priming_data_store() const { const auto& c = static_cast(m_model->get_execution_context()); - if (m_data_store != nullptr && m_data_store->is_preloaded()) { + if (m_data_store != nullptr && m_data_store->is_fully_loaded()) { return false; } @@ -812,8 +805,11 @@ void generic_data_reader::set_role(std::string role) { void generic_data_reader::preload_data_store() { if (m_data_store->is_local_cache()) { + m_data_store->set_profile_msg("generic_data_reader::preload_data_store() calling m_data_store->preload_local_cache()"); m_data_store->preload_local_cache(); - } else { + } + + else { std::vector local_list_sizes; int np = m_comm->get_procs_per_trainer(); int base_files_per_rank = m_shuffled_indices.size() / np; @@ -828,11 +824,13 @@ void generic_data_reader::preload_data_store() { local_list_sizes[j] += 1; } } + m_data_store->set_profile_msg("generic_data_reader::preload_data_store() calling m_data_store->build_preloaded_owner_map()"); m_data_store->build_preloaded_owner_map(local_list_sizes); + m_data_store->set_profile_msg("generic_data_reader::preload_data_store() calling do_preload_data_store()"); + do_preload_data_store(); + m_data_store->set_loading_is_complete(); } - do_preload_data_store(); - m_data_store->set_is_preloaded(); } diff --git a/src/data_readers/data_reader_jag_conduit.cpp b/src/data_readers/data_reader_jag_conduit.cpp index 4af68d7c814..60cda6027af 100644 --- a/src/data_readers/data_reader_jag_conduit.cpp +++ b/src/data_readers/data_reader_jag_conduit.cpp @@ -302,7 +302,7 @@ bool data_reader_jag_conduit::load_conduit_node(const size_t i, const std::strin if (!has_path(h, path)) { const std::string& file_name 
= m_sample_list.get_samples_filename(id); if (m_data_store != nullptr) { - if (! m_data_store->is_preloaded()) { + if (! m_data_store->is_fully_loaded()) { const conduit::Node obj = m_data_store->get_random_node(); node = obj["data"]; const std::vector& child_names = node.child_names(); diff --git a/src/data_store/data_store_conduit.cpp b/src/data_store/data_store_conduit.cpp index 2312eebeaf4..13c1b40282a 100644 --- a/src/data_store/data_store_conduit.cpp +++ b/src/data_store/data_store_conduit.cpp @@ -52,12 +52,14 @@ namespace lbann { +std::string commify(size_t n); + data_store_conduit::data_store_conduit( generic_data_reader *reader) : m_reader(reader) { m_comm = m_reader->get_comm(); if (m_comm == nullptr) { - LBANN_ERROR(" m_comm is nullptr"); + LBANN_ERROR("m_comm is nullptr"); } m_world_master = m_comm->am_world_master(); @@ -70,59 +72,33 @@ data_store_conduit::data_store_conduit( options *opts = options::get(); - std::string spill_dir; - - // error check for a single "spill" flag if (opts->has_string("data_store_test_checkpoint") && opts->has_string("data_store_spill")) { LBANN_ERROR("you passed both --data_store_test_checkpoint and --data_store_spill; please use one or the other or none, but not both"); - } - - // error check if running in checkpoint test mode + } if (opts->has_string("data_store_test_checkpoint")) { - std::string c = opts->get_string("data_store_test_checkpoint"); - if (c == "1") { - LBANN_ERROR("--data_store_test_checkpoint=1; you probably forgot to specify the spill directory; you must specify --data_store_test_checkpoint='"); - } else { - if (c == "lassen") { - char * val = std::getenv("BBPATH"); - if (val == NULL) { - LBANN_ERROR("std::getenv(\"BBPATH\") returned NULL; unable to use burst buffer"); - } - std::string cc(val); - c = cc + "/data_store"; - } - spill_dir = c; - m_test_dir = c; - } - } - - // error check if running in spill mode + setup_checkpoint_test(); + } if (opts->has_string("data_store_spill")) { - const std::string c = opts->get_string("data_store_spill"); - if (c == "1") { - LBANN_ERROR("--data_store_spill=1; you probably forgot to specify the spill directory; you must specify --data_store_spill='"); - } else { - spill_dir = c; - } - } - - if (spill_dir != "") { - m_spill_dir_base = spill_dir; - } - - // error check: if running in local cache mode, must preload - // TODO: future work -- modify so preload is not necessary - m_is_local_cache = opts->get_bool("data_store_cache"); - if (m_is_local_cache && !opts->get_bool("preload_data_store")) { - LBANN_ERROR("data_store_cache is currently only implemented for preload mode; this will change in the future. For now, pleas pass both flags: data_store_cache and --preload_data_store"); + setup_spill(opts->get_string("data_store_spill")); } - if (m_is_local_cache) { + set_is_local_cache(opts->get_bool("data_store_cache")); + set_is_preloading(opts->get_bool("preload_data_store")); + set_is_explicitly_loading(! 
is_preloading()); + + if (is_local_cache()) { PROFILE("data_store_conduit is running in local_cache mode"); } else { PROFILE("data_store_conduit is running in multi-message mode"); } + if (is_explicitly_loading()) { + PROFILE("data_store_conduit is explicitly loading"); + } else { + PROFILE("data_store_conduit is preloading"); + } + + check_query_flags(); } data_store_conduit::~data_store_conduit() { @@ -144,6 +120,29 @@ data_store_conduit::~data_store_conduit() { } } +void data_store_conduit::setup_checkpoint_test() { + std::string c = options::get()->get_string("data_store_test_checkpoint"); + if (c == "1") { + LBANN_ERROR("--data_store_test_checkpoint=1; you probably forgot to specify the spill directory; you must specify --data_store_test_checkpoint='"); + } + if (c == "lassen") { + c = get_lassen_spill_dir(); + } + m_spill_dir_base = c; + m_test_dir = c; + m_run_checkpoint_test = true; +} + +std::string data_store_conduit::get_lassen_spill_dir() { + char * val = std::getenv("BBPATH"); + if (val == NULL) { + LBANN_ERROR("std::getenv(\"BBPATH\") returned NULL; unable to use burst buffer"); + } + std::string cc(val); + return cc + "/data_store"; +} + + data_store_conduit::data_store_conduit(const data_store_conduit& rhs) { copy_members(rhs); } @@ -167,8 +166,9 @@ void data_store_conduit::set_data_reader_ptr(generic_data_reader *reader) { void data_store_conduit::copy_members(const data_store_conduit& rhs) { m_is_setup = rhs.m_is_setup; - m_preload = rhs.m_preload; - m_explicit_loading = rhs.m_explicit_loading; + m_preloading = rhs.m_preloading; + m_loading_is_complete = rhs.m_loading_is_complete; + m_explicitly_loading = rhs.m_explicitly_loading; m_owner_map_mb_size = rhs.m_owner_map_mb_size; m_compacted_sample_size = rhs.m_compacted_sample_size; m_is_local_cache = rhs.m_is_local_cache; @@ -217,18 +217,9 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs) { } void data_store_conduit::setup(int mini_batch_size) { - double tm1 = get_time(); PROFILE("starting setup()"); - m_owner_map_mb_size = mini_batch_size; - - if (m_is_local_cache && m_preload) { - preload_local_cache(); - } m_is_setup = true; - - PROFILE("time for data_store_conduit setup: ", (get_time()-tm1), - " (will be insignificant unless running in local cache mode)"); } void data_store_conduit::setup_data_store_buffers() { @@ -269,7 +260,13 @@ void data_store_conduit::set_preloaded_conduit_node(int data_id, const conduit:: { std::lock_guard lock(m_mutex); ++m_my_num_indices; - } + } + + if (is_local_cache()) { + m_data[data_id] = node; + return; + } + if (m_spill) { spill_preloaded_conduit_node(data_id, node); @@ -326,7 +323,7 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool // playing it safe and locking the whole dang thing. ++m_my_num_indices; - if (m_is_local_cache && m_preload) { + if (is_local_cache() && is_preloading()) { LBANN_ERROR("you called data_store_conduit::set_conduit_node, but you're running in local cache mode with preloading; something is broken; please contact Dave Hysom"); } @@ -386,8 +383,6 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool } } -//n.b. 
Do not put any PROFILE or DEBUG statements in this method, -// since the threading from the data_reader will cause you grief const conduit::Node & data_store_conduit::get_conduit_node(int data_id) const { if (is_local_cache()) { std::unordered_map::const_iterator t3 = m_data.find(data_id); @@ -692,6 +687,7 @@ int data_store_conduit::get_index_owner(int idx) { } void data_store_conduit::check_mem_capacity(lbann_comm *comm, const std::string sample_list_file, size_t stride, size_t offset) { +//TODO: this is junky, and isn't called anywhere; rethink! if (comm->am_world_master()) { // note: we only estimate memory required by the data reader/store @@ -893,28 +889,65 @@ void data_store_conduit::exchange_sample_sizes() { m_have_sample_sizes = true; } -void data_store_conduit::set_is_preloaded() { - m_preload = true; +void data_store_conduit::set_is_preloading(bool flag) { + m_preloading = flag; +} + +void data_store_conduit::set_is_explicitly_loading(bool flag) { + m_explicitly_loading = flag; + if (is_preloading() && is_explicitly_loading()) { + LBANN_ERROR("flags for both explicit and pre- loading are set; this is an error"); + } +} + +void data_store_conduit::set_loading_is_complete() { + PROFILE("set_loading_is_complete()"); + m_loading_is_complete = true; + set_is_preloading(false); + set_is_explicitly_loading(false); + check_query_flags(); + + if (m_run_checkpoint_test) { + test_checkpoint(m_spill_dir_base); + } +} + +bool data_store_conduit::is_fully_loaded() const { + if (m_loading_is_complete) { + return true; + } + return false; } -void data_store_conduit::get_image_sizes(std::unordered_map &file_sizes, std::vector> &indices) { +void data_store_conduit::get_image_sizes(map_is_t &file_sizes, std::vector> &indices) { /// this block fires if image sizes have been precomputed if (options::get()->has_string("image_sizes_filename")) { LBANN_ERROR("not yet implemented"); //TODO dah - implement, if this becomes a bottleneck (but I don't think it will) } - else { - // get list of image file names - image_data_reader *image_reader = dynamic_cast(m_reader); - if (image_reader == nullptr) { - LBANN_ERROR("data_reader_image *image_reader = dynamic_cast(m_reader) failed"); + // get list of image file names + image_data_reader *image_reader = dynamic_cast(m_reader); + if (image_reader == nullptr) { + LBANN_ERROR("data_reader_image *image_reader = dynamic_cast(m_reader) failed"); + } + const std::vector &image_list = image_reader->get_image_list(); + std::vector my_image_sizes; + + // this block fires if we're exchanging cache data at the end + // of the first epoch, and the data store was not preloaded + if (is_explicitly_loading()) { + for (const auto &t : m_data) { + int data_id = t.first; + my_image_sizes.push_back(data_id); + my_image_sizes.push_back(t.second[LBANN_DATA_ID_STR(data_id) + "/buffer_size"].value()); } - const std::vector &image_list = image_reader->get_image_list(); - + } + + else { // get sizes of files for which I'm responsible - std::vector my_image_sizes; for (size_t h=m_rank_in_trainer; hsize(); h += m_np_in_trainer) { + ++m_my_num_indices; const std::string fn = m_reader->get_file_dir() + '/' + image_list[(*m_shuffled_indices)[h]].first; std::ifstream in(fn.c_str()); if (!in) { @@ -925,39 +958,41 @@ void data_store_conduit::get_image_sizes(std::unordered_map &file_si my_image_sizes.push_back(in.tellg()); in.close(); } - int my_count = my_image_sizes.size(); + } - std::vector counts(m_np_in_trainer); - m_comm->all_gather(&my_count, 1, counts.data(), 1, 
m_comm->get_trainer_comm()); + // exchange image sizes + int my_count = my_image_sizes.size(); - //my_image_sizes[h*2] contains the image index - //my_image_sizes[h*2+1] contains the image sizee + std::vector counts(m_np_in_trainer); + m_comm->all_gather(&my_count, 1, counts.data(), 1, m_comm->get_trainer_comm()); - //fill in displacement vector for gathering the actual image sizes - std::vector disp(m_np_in_trainer + 1); - disp[0] = 0; - for (size_t h=0; h work(image_list.size()*2); - m_comm->trainer_all_gather(my_image_sizes, work, counts, disp); - indices.resize(m_np_in_trainer); - for (int h=0; h disp(m_np_in_trainer + 1); + disp[0] = 0; + for (size_t h=0; h work(image_list.size()*2); + m_comm->trainer_all_gather(my_image_sizes, work, counts, disp); + indices.resize(m_np_in_trainer); + for (int h=0; h &sizes, std::vector> &indices) { +void data_store_conduit::compute_image_offsets(map_is_t &sizes, std::vector> &indices) { size_t offset = 0; for (size_t p=0; p &s } } - -void data_store_conduit::allocate_shared_segment(std::unordered_map &sizes, std::vector> &indices) { +void data_store_conduit::allocate_shared_segment(map_is_t &sizes, std::vector> &indices) { off_t size = 0; for (auto &&t : sizes) { size += t.second; @@ -987,13 +1021,12 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map size_t avail_mem = stat.f_bsize*stat.f_bavail; double percent = 100.0 * m_mem_seg_length / avail_mem; std::stringstream msg; - msg << " size of required shared memory segment: " << m_mem_seg_length << "\n" - << " available mem: " << avail_mem << "\n" - << " required size is " << percent << " percent of available\n"; - if (m_world_master) { - std::cerr << "\nShared memory segment statistics:\n" - << msg.str() << "\n"; - } + PROFILE( + " Shared Memory segment statistics:\n", + " size of required shared memory segment: ", commify(m_mem_seg_length), "\n", + " available mem: ", commify(avail_mem), "\n", + " required size is ", percent, " percent of available"); + if (m_mem_seg_length >= avail_mem) { LBANN_ERROR("insufficient available memory:\n", msg.str()); } @@ -1059,46 +1092,56 @@ void data_store_conduit::allocate_shared_segment(std::unordered_map } void data_store_conduit::preload_local_cache() { - std::unordered_map file_sizes; + exchange_local_caches(); +} + +void data_store_conduit::exchange_local_caches() { + PROFILE("Starting exchange_local_caches"); + PROFILE(" At new epoch; m_cur_epoch: ", m_cur_epoch); + PROFILE(" is_explicitly_loading(): ", is_explicitly_loading()); + PROFILE(" is_preloading(): ", is_preloading()); + PROFILE(" is_local_cache(): ", is_local_cache()); + PROFILE(" is_fully_loaded: ", is_fully_loaded()); + + // indices[j] will contain the indices + // that P_j will read from disk, and subsequently bcast to all others std::vector> indices; double tm1 = get_time(); - if (m_world_master) std::cerr << "calling get_image_sizes" << std::endl; - get_image_sizes(file_sizes, indices); - if (m_world_master) std::cerr << " get_image_sizes time: " << (get_time()-tm1) << std::endl; - tm1 = get_time(); - //indices[j] contains the indices (wrt m_reader->get_image_list()) - //that P_j will read from disk, and subsequently bcast to all others - // - //file_sizes maps an index to its file size - - if (m_world_master) std::cerr << "calling allocate_shared_segment" << std::endl; - allocate_shared_segment(file_sizes, indices); - if (m_world_master) std::cerr << " allocate_shared_segment time: " << (get_time()-tm1) << std::endl; + get_image_sizes(m_sample_sizes, indices); + PROFILE(" 
get_image_sizes time: ", (get_time()-tm1)); + tm1 = get_time(); + allocate_shared_segment(m_sample_sizes, indices); + PROFILE(" allocate_shared_segment time: ", (get_time()-tm1)); - if (m_world_master) std::cerr << "calling read_files" << std::endl; std::vector work; - read_files(work, file_sizes, indices[m_rank_in_trainer]); - if (m_world_master) std::cerr << " read_files time: " << (get_time()- tm1) << std::endl; + if (! is_explicitly_loading()) { + tm1 = get_time(); + read_files(work, m_sample_sizes, indices[m_rank_in_trainer]); + PROFILE(" read_files time: ", (get_time()- tm1)); + } + tm1 = get_time(); + compute_image_offsets(m_sample_sizes, indices); + PROFILE(" compute_image_offsets time: ", (get_time()-tm1)); - if (m_world_master) std::cerr << "calling compute_image_offsets" << std::endl; - compute_image_offsets(file_sizes, indices); - if (m_world_master) std::cerr << " compute_image_offsets time: " << (get_time()-tm1) << std::endl; tm1 = get_time(); + exchange_images(work, m_sample_sizes, indices); + PROFILE(" exchange_images time: ", (get_time()-tm1)); - if (m_world_master) std::cerr << "calling exchange_images" << std::endl; - exchange_images(work, file_sizes, indices); - if (m_world_master) std::cerr << " exchange_images time: " << (get_time()-tm1) << std::endl; tm1 = get_time(); + build_conduit_nodes(m_sample_sizes); + PROFILE(" build_conduit_nodes time: ", (get_time()-tm1)); - if (m_world_master) std::cerr << "calling build_conduit_nodes" << std::endl; - build_conduit_nodes(file_sizes); - if (m_world_master) std::cerr << " build_conduit_nodes time: " << (get_time()-tm1) << std::endl; + set_loading_is_complete(); + + if (options::get()->get_bool("data_store_test_cache")) { + test_local_cache_imagenet(20); + } } -void data_store_conduit::read_files(std::vector &work, std::unordered_map &sizes, std::vector &indices) { +void data_store_conduit::read_files(std::vector &work, map_is_t &sizes, std::vector &indices) { //reserve space for reading this proc's files into a contiguous memory space size_t n = 0; @@ -1107,15 +1150,13 @@ void data_store_conduit::read_files(std::vector &work, std::unordered_map< } work.resize(n); - DEBUG("data_store_conduit::read_files; requested work size: ", n); - //get the list of images from the data reader image_data_reader *image_reader = dynamic_cast(m_reader); const std::vector &image_list = image_reader->get_image_list(); //read the images size_t offset = 0; - if (m_world_master) std::cerr << " my num files: " << indices.size() << std::endl; + PROFILE(" my num files: ", indices.size()); for (size_t j=0; j &work, std::unordered_map< in.close(); offset += s; } - if (m_world_master) std::cerr << " finished reading files\n"; } -void data_store_conduit::build_conduit_nodes(std::unordered_map &sizes) { +void data_store_conduit::build_conduit_nodes(map_is_t &sizes) { image_data_reader *image_reader = dynamic_cast(m_reader); const std::vector &image_list = image_reader->get_image_list(); - for (size_t idx=0; idx &images, size_t offset) { - memcpy(m_mem_seg+offset, reinterpret_cast(images.data()), images.size()); +void data_store_conduit::fillin_shared_images(char* images, size_t size, size_t offset) { + PROFILE(" fillin_shared_images; size: ", commify(size), " offset: ", commify(offset)); + memcpy(reinterpret_cast(m_mem_seg+offset), reinterpret_cast(images), size); } -void data_store_conduit::exchange_images(std::vector &work, std::unordered_map &image_sizes, std::vector> &indices) { - std::vector work2; +void data_store_conduit::exchange_images(std::vector 
&work, map_is_t &image_sizes, std::vector> &indices) { + + // If explicitly loading we need to build "work" (the vector to be broadcast); + // if preloading, this has already been built in read_files() + if (is_explicitly_loading()) { + if (work.size() != 0) { + LBANN_ERROR("work.size() != 0, but it should be"); + } + + // Compute the required buffer size + size_t n = 0; + for (const auto &t : m_data) { + int data_id = t.first; + size_t sz = t.second[LBANN_DATA_ID_STR(data_id) + "/buffer_size"].value(); + n += sz; + } + work.resize(n); + PROFILE(" size required for my work buffer: ", work.size()); + + // Copy the images into the work vector + size_t offset2 = 0; + for (const auto &t : m_data) { + int data_id = t.first; + const conduit::Node &node = t.second; + const char *buf = node[LBANN_DATA_ID_STR(data_id) + "/buffer"].value(); + size_t sz = node[LBANN_DATA_ID_STR(data_id) + "/buffer_size"].value(); + memcpy(work.data()+offset2, reinterpret_cast(buf), sz); + offset2 += sz; + if (offset2 > work.size()) { + LBANN_ERROR("offset >= work.size(); offset: ", offset2, " work.size(): ", work.size(), " sz: ", sz); + } + } + } + int node_rank = m_comm->get_rank_in_node(); + std::vector work2; size_t offset = 0; for (int p=0; ptrainer_broadcast(p, work.data(), work.size()); - if (node_rank == 0) { - fillin_shared_images(work, offset); - } - } else { - size_t sz = 0; - for (auto idx : indices[p]) { - sz += image_sizes[idx]; - } - work2.resize(sz); - m_comm->trainer_broadcast(p, work2.data(), sz); - if (node_rank == 0) { - fillin_shared_images(work2, offset); - } + // Count the number of bytes to be broadcast by P_p + size_t bytes = 0; + for (auto idx : indices[p]) { + bytes += image_sizes[idx]; } + //PROFILE(" \nP_", p, " has ", commify(bytes), " bytes to bcast"); + + // Set up the rounds; due to MPI yuckiness, can bcast at most INT_MAX bytes + // in a single broadcast + std::vector rounds; + int n = bytes/INT_MAX; + if (n < 0) { + LBANN_ERROR("(n < 0; that shouldn't be possible; please contact Dave Hysom"); + } + for (int k=0; ktrainer_broadcast(p, work.data()+work_vector_offset, sz); + if (node_rank == 0) { + fillin_shared_images(work.data()+work_vector_offset, sz, offset); + } + } else { + work2.resize(sz); + m_comm->trainer_broadcast(p, work2.data(), sz); + if (node_rank == 0) { + fillin_shared_images(work2.data(), sz, offset); + } + } + work_vector_offset += sz; + offset += sz; + } + } m_comm->barrier(m_comm->get_node_comm()); } @@ -1226,16 +1333,9 @@ void data_store_conduit::exchange_owner_maps() { "my owner map size: ", m_owner.size()); } -void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_size) { - if (is_local_cache()) { - return; - } - double tm1 = get_time(); - - if (m_reader->at_new_epoch()) { - PROFILE("At new epoch; m_cur_epoch: ", m_cur_epoch); - if (m_cur_epoch > 0) { - PROFILE( +void data_store_conduit::profile_timing() { + if (m_cur_epoch > 0) { + PROFILE( "\n", "Exchange Data Timing:\n", " exchange_mini_batch_data: ", m_exchange_time, "\n", @@ -1244,51 +1344,74 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ " wait alls: ", m_wait_all_time, "\n", " unpacking rcvd nodes: ", m_rebuild_time, "\n\n"); - if (options::get()->get_bool("data_store_min_max_timing")) { - std::vector send; - static int count = 5; - send.reserve(count); - send.push_back(m_exchange_time); - send.push_back(m_exchange_sample_sizes_time); - send.push_back(m_start_snd_rcv_time); - send.push_back(m_wait_all_time); - send.push_back(m_rebuild_time); - 
if (m_trainer_master) { - std::vector rcv_max(count); - std::vector rcv_min(count); - m_comm->trainer_reduce(send.data(), count, rcv_max.data(), El::mpi::MAX); - m_comm->trainer_reduce(send.data(), count, rcv_min.data(), El::mpi::MIN); - PROFILE( - "Exchange Data MAX Timing:\n", - " exchange_mini_batch_data: ", rcv_max[0], "\n", - " exchange sample sizes: ", rcv_max[1], "\n", - " start sends and rcvs: ", rcv_max[2], "\n", - " wait alls: ", rcv_max[3], "\n", - " unpacking rcvd nodes: ", rcv_max[4], "\n\n"); - PROFILE( - "Exchange Data MIN Timing:\n", - " exchange_mini_batch_data: ", rcv_min[0], "\n", - " exchange sample sizes: ", rcv_min[1], "\n", - " start sends and rcvs: ", rcv_min[2], "\n", - " wait alls: ", rcv_min[3], "\n", - " unpacking rcvd nodes: ", rcv_min[4], "\n\n"); - } else { - m_comm->trainer_reduce(send.data(), count, 0, El::mpi::MAX); - m_comm->trainer_reduce(send.data(), count, 0, El::mpi::MIN); - } + if (options::get()->get_bool("data_store_min_max_timing")) { + std::vector send; + static int count = 5; + send.reserve(count); + send.push_back(m_exchange_time); + send.push_back(m_exchange_sample_sizes_time); + send.push_back(m_start_snd_rcv_time); + send.push_back(m_wait_all_time); + send.push_back(m_rebuild_time); + if (m_trainer_master) { + std::vector rcv_max(count); + std::vector rcv_min(count); + m_comm->trainer_reduce(send.data(), count, rcv_max.data(), El::mpi::MAX); + m_comm->trainer_reduce(send.data(), count, rcv_min.data(), El::mpi::MIN); + PROFILE( + "Exchange Data MAX Timing:\n", + " exchange_mini_batch_data: ", rcv_max[0], "\n", + " exchange sample sizes: ", rcv_max[1], "\n", + " start sends and rcvs: ", rcv_max[2], "\n", + " wait alls: ", rcv_max[3], "\n", + " unpacking rcvd nodes: ", rcv_max[4], "\n\n"); + PROFILE( + "Exchange Data MIN Timing:\n", + " exchange_mini_batch_data: ", rcv_min[0], "\n", + " exchange sample sizes: ", rcv_min[1], "\n", + " start sends and rcvs: ", rcv_min[2], "\n", + " wait alls: ", rcv_min[3], "\n", + " unpacking rcvd nodes: ", rcv_min[4], "\n\n"); + } else { + m_comm->trainer_reduce(send.data(), count, 0, El::mpi::MAX); + m_comm->trainer_reduce(send.data(), count, 0, El::mpi::MIN); } - - m_exchange_sample_sizes_time = 0.; - m_start_snd_rcv_time = 0.; - m_wait_all_time = 0.; - m_rebuild_time = 0.; - m_exchange_time = 0.; } + + m_exchange_sample_sizes_time = 0.; + m_start_snd_rcv_time = 0.; + m_wait_all_time = 0.; + m_rebuild_time = 0.; + m_exchange_time = 0.; + } +} + +void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_size) { + if (is_local_cache() && is_fully_loaded()) { + return; + } + + if (m_reader->at_new_epoch()) { ++m_cur_epoch; + PROFILE("Starting exchange_mini_batch_data"); + PROFILE(" At new epoch; m_cur_epoch: ", m_cur_epoch); + PROFILE(" is_explicitly_loading(): ", is_explicitly_loading()); + PROFILE(" is_local_cache(): ", is_local_cache()); + PROFILE(" is_fully_loaded: ", is_fully_loaded()); + if (! is_local_cache()) { + profile_timing(); + } + } + + if (m_reader->at_new_epoch() && is_local_cache() && is_explicitly_loading()) { + exchange_local_caches(); + return; } + double tm1 = get_time(); + // when not running in preload mode, exchange owner maps after the 1st epoch - if (m_reader->at_new_epoch() && !options::get()->get_bool("preload_data_store") && !is_local_cache() && m_cur_epoch == 1) { + if (m_reader->at_new_epoch() && ! 
is_preloading() && !is_local_cache() && m_cur_epoch == 1) { PROFILE("calling exchange_owner_maps"); exchange_owner_maps(); /* @@ -1301,10 +1424,6 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ */ } - if (m_test_dir != "" && m_reader->at_new_epoch() && !is_local_cache() && m_cur_epoch == 1) { - test_checkpoint(m_test_dir); - } - exchange_data_by_sample(current_pos, mb_size); m_exchange_time += (get_time() - tm1); } @@ -1317,7 +1436,7 @@ void data_store_conduit::flush_debug_file() { m_debug->open(m_debug_filename.c_str(), std::ios::app); } -void data_store_conduit::flush_profile_file() { +void data_store_conduit::flush_profile_file() const { if (!m_profile) { return; } @@ -1348,8 +1467,8 @@ void data_store_conduit::test_checkpoint(const std::string &checkpoint_dir) { m_cur_epoch = -1; m_is_setup = false; - m_preload = false; - m_explicit_loading = true; + m_preloading = false; + m_explicitly_loading = true; m_owner_map_mb_size = 0; m_compacted_sample_size = 0; m_node_sizes_vary = true; @@ -1389,15 +1508,16 @@ void data_store_conduit::make_dir_if_it_doesnt_exist(const std::string &dir_name if (node_rank == 0) { bool exists = file::directory_exists(dir_name); if (!exists) { - if (m_world_master) { - std::cerr << "data_store_conduit; the directory '" << dir_name << "' doesn't exist; creating it\n"; - } + PROFILE("data_store_conduit; the directory '", dir_name, "' doesn't exist; creating it"); file::make_directory(dir_name); } } } -void data_store_conduit::setup_spill(const std::string &base_dir) { +void data_store_conduit::setup_spill(std::string base_dir) { + if (base_dir == "lassen") { + base_dir = get_lassen_spill_dir(); + } m_spill_dir_base = base_dir; m_spill = true; m_cur_spill_dir_integer = -1; @@ -1455,10 +1575,12 @@ void data_store_conduit::save_state() { { cereal::XMLOutputArchive archive(os); - archive(CEREAL_NVP(m_cur_epoch), + archive(CEREAL_NVP(m_my_num_indices), + CEREAL_NVP(m_cur_epoch), CEREAL_NVP(m_is_setup), - CEREAL_NVP(m_preload), - CEREAL_NVP(m_explicit_loading), + CEREAL_NVP(m_preloading), + CEREAL_NVP(m_loading_is_complete), + CEREAL_NVP(m_explicitly_loading), CEREAL_NVP(m_owner_map_mb_size), CEREAL_NVP(m_compacted_sample_size), CEREAL_NVP(m_is_local_cache), @@ -1493,9 +1615,10 @@ void data_store_conduit::load_checkpoint(std::string dir_name, generic_data_read LBANN_ERROR("failed to open ", m_cereal_fn, " for reading"); } cereal::XMLInputArchive iarchive(in); - iarchive(m_cur_epoch, m_is_setup, - m_preload, m_explicit_loading, - m_owner_map_mb_size, + iarchive(CEREAL_NVP(m_my_num_indices), + m_cur_epoch, m_is_setup, + m_preloading, m_loading_is_complete, + m_explicitly_loading, m_owner_map_mb_size, m_compacted_sample_size, m_is_local_cache, m_node_sizes_vary, m_have_sample_sizes, m_owner, m_sample_sizes); @@ -1548,8 +1671,8 @@ void data_store_conduit::print_variables() { } std::cerr << "m_cur_epoch: " << m_cur_epoch << std::endl << "m_is_setup: " << m_is_setup << std::endl - << "m_preload: " << m_preload << std::endl - << "m_explicit_loading: " << m_explicit_loading << std::endl + << "m_preloading: " << m_preloading << std::endl + << "m_explicitly_loading: " << m_explicitly_loading << std::endl << "m_owner_map_mb_size: " << m_owner_map_mb_size << std::endl << "m_compacted_sample_size: " << m_compacted_sample_size << std::endl << "m_node_sizes_vary: " << m_node_sizes_vary << std::endl; @@ -1604,7 +1727,7 @@ void data_store_conduit::load_spilled_conduit_nodes() { for (const auto &v : m_indices_to_send) { for (const auto &id : v) { - 
std::unordered_map<int, int>::const_iterator it = m_spilled_nodes.find(id);
+      map_ii_t::const_iterator it = m_spilled_nodes.find(id);
       if (it == m_spilled_nodes.end()) {
         LBANN_ERROR("it == m_spilled_nodes.end() for sample_id: ", id, "; m_spilled_nodes.size: ", m_spilled_nodes.size());
       }
@@ -1655,4 +1778,148 @@ void data_store_conduit::print_partial_owner_map(int n) {
   }
 }
 
+void data_store_conduit::set_profile_msg(std::string s) {
+  PROFILE(s);
+}
+
+void data_store_conduit::test_imagenet_node(int index, bool dereference) {
+  image_data_reader *image_reader = dynamic_cast<image_data_reader*>(m_reader);
+  if (image_reader == nullptr) {
+    LBANN_ERROR("dynamic_cast<image_data_reader*>(m_reader) failed");
+  }
+
+  int data_id = index;
+  if (dereference) {
+    data_id = (*m_shuffled_indices)[index];
+  }
+  if (m_image_offsets.find(data_id) == m_image_offsets.end()) {
+    LBANN_ERROR("m_image_offsets.find(data_id) == m_image_offsets.end() for data_id: ", data_id);
+  }
+
+  if (m_sample_sizes.find(data_id) == m_sample_sizes.end()) {
+    LBANN_ERROR("failed to find data_id ", data_id, " in the image_sizes map");
+  }
+  size_t szz = m_sample_sizes[data_id];
+  PROFILE("test_imagenet_node() for data_id: ", commify(data_id), " at offset: ", commify(m_image_offsets[data_id]), " image size: ", commify(szz));
+  if (m_image_offsets[data_id] >= INT_MAX) {
+    PROFILE("  WARNING: offset is >= INT_MAX!");
+  }
+
+  std::cerr << "testing sample_id: " << commify(data_id) << " stored at offset: " << commify(m_image_offsets[data_id]);
+  if (m_image_offsets[data_id] >= INT_MAX) {
+    std::cerr << "; (>= INT_MAX)\n";
+  } else {
+    std::cerr << std::endl;
+  }
+
+  conduit::Node nd1;
+  image_reader->load_conduit_node_from_file(data_id, nd1);
+  char *buf1 = nd1[LBANN_DATA_ID_STR(data_id) + "/buffer"].value();
+  size_t size1 = nd1[LBANN_DATA_ID_STR(data_id) + "/buffer_size"].value();
+
+  const conduit::Node &nd2 = get_conduit_node(data_id);
+  const char *buf2 = nd2[LBANN_DATA_ID_STR(data_id) + "/buffer"].value();
+  size_t size2 = nd2[LBANN_DATA_ID_STR(data_id) + "/buffer_size"].value();
+
+  if (size1 != size2) {
+    PROFILE("buffer sizes mismatch: size of buffer read from file does not match buffer size from cache; from file: ", size1, " from cache: ", size2, " for data_id: ", data_id);
+    if (m_world_master) {
+      const conduit::Schema &s = nd2.schema();
+      s.print();
+      nd2.print();
+    }
+    LBANN_ERROR("buffer sizes mismatch: size of buffer read from file does not match buffer size from cache; from file: ", size1, " from cache: ", size2, " for data_id: ", data_id);
+  }
+
+  for (size_t i=0; i<size1; i++) {
+    if (buf1[i] != buf2[i]) {
+      LBANN_ERROR("buffers from file and cache differ at byte ", i, " for data_id: ", data_id);
+    }
+  }
+}
+
+bool data_store_conduit::test_local_cache_imagenet(int n) {
+  if (n > (int)m_shuffled_indices->size()) {
+    n = m_shuffled_indices->size();
+  }
+
+  // edge cases: get images with smallest and largest offsets in the cache
+  size_t max_offset = 0;
+  size_t min_offset = 200000000;
+  size_t id_max = 0;
+  size_t id_min = 0;
+  for (auto t : m_image_offsets) {
+    if (t.second > max_offset) {
+      id_max = t.first;
+      max_offset = t.second;
+    }
+    if (t.second < min_offset) {
+      id_min = t.first;
+      min_offset = t.second;
+    }
+  }
+
+  // test image with smallest offset
+  test_imagenet_node(id_min, false);
+
+  // test n randomly selected images
+  for (int h=0; h<n; h++) {
+    int index = random() % m_shuffled_indices->size();
+    test_imagenet_node(index);
+  }
+
+  // test image with largest offset
+  test_imagenet_node(id_max, false);
+
+  if (m_world_master) std::cerr << "  All tests passed\n";
+  PROFILE("  All tests passed.\n");
+  return true;
+}
+
+std::string commify(size_t n)
{ + std::string s = std::to_string(n); + std::stringstream s2; + int c = 0; + for (int j = (int)s.size()-1; j>=0; j--) { + s2 << s[j]; + ++c; + if (c == 3) { + if (j > 0) { + s2 << ","; + c = 0; + } + } + } + std::string r = s2.str(); + std::reverse(r.begin(), r.end()); + return r; +} + +void data_store_conduit::check_query_flags() const { + if (m_explicitly_loading && m_preloading) { + LBANN_ERROR("is_explicitly_loading() && is_preloading() are both true, but should not be"); + } + if (m_loading_is_complete && m_explicitly_loading) { + LBANN_ERROR("is_fully_loaded() && is_explicitly_loading() are both true, but should not be"); + } + if (m_loading_is_complete && m_preloading) { + LBANN_ERROR("is_fully_loaded() && is_preloading() are both true, but should not be"); + } +} + } // namespace lbann diff --git a/src/models/model.cpp b/src/models/model.cpp index bf590be7445..75efe94ae96 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -926,9 +926,9 @@ void model::make_data_store_preloaded(execution_mode mode) { auto *input = dynamic_cast(&get_layer(i)); if (input != nullptr) { auto *data_store = input->get_data_reader(mode)->get_data_store_ptr(); - if(data_store != nullptr && !data_store->is_preloaded()) { - input->get_data_reader(mode)->get_data_store_ptr()->set_is_preloaded(); - input->get_data_reader(mode)->get_data_store_ptr()->set_explicit_loading(false); + if(data_store != nullptr && !data_store->is_fully_loaded()) { + input->get_data_reader(mode)->get_data_store_ptr()->set_loading_is_complete(); + input->get_data_reader(mode)->get_data_store_ptr()->set_is_explicitly_loading(false); } } } @@ -942,8 +942,8 @@ void model::mark_data_store_explicitly_loading(execution_mode mode) { auto *input = dynamic_cast(&get_layer(i)); if (input != nullptr) { auto *data_store = input->get_data_reader(mode)->get_data_store_ptr(); - if(data_store != nullptr && !data_store->is_preloaded()) { - input->get_data_reader(mode)->get_data_store_ptr()->set_explicit_loading(true); + if(data_store != nullptr && !data_store->is_fully_loaded()) { + input->get_data_reader(mode)->get_data_store_ptr()->set_is_explicitly_loading(true); } } } From 1b13fe63f39cdbf4a0e03dcac62a08a69c011caf Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 13 Nov 2019 10:44:30 -0800 Subject: [PATCH 393/634] Specify a layer's parents in Python frontend with positional args (#1348) * Specify a layer's parents in Python frontend with variadic args * Update vision and NLP models with new Python layer API * Update Bamboo tests with new Python layer API --- applications/vision/alexnet.py | 14 ++-- applications/vision/lenet.py | 12 ++-- applications/vision/resnet.py | 14 ++-- .../test_integration_alexnet.py | 4 +- .../test_integration_lenet.py | 4 +- .../test_integration_resnet50.py | 4 +- .../test_unit_layer_channelwise_scale_bias.py | 2 +- bamboo/unit_tests/test_unit_layer_clamp.py | 10 +-- .../unit_tests/test_unit_layer_convolution.py | 8 +-- .../unit_tests/test_unit_layer_covariance.py | 18 +++-- .../test_unit_layer_cross_entropy.py | 14 ++-- bamboo/unit_tests/test_unit_layer_elu.py | 10 +-- ...nit_layer_entrywise_batch_normalization.py | 7 +- .../test_unit_layer_entrywise_scale_bias.py | 11 +-- .../test_unit_layer_fully_connected.py | 10 +-- bamboo/unit_tests/test_unit_layer_identity.py | 10 +-- bamboo/unit_tests/test_unit_layer_l1_norm.py | 10 +-- .../unit_tests/test_unit_layer_leaky_relu.py | 10 +-- .../unit_tests/test_unit_layer_log_sigmoid.py | 10 +-- .../unit_tests/test_unit_layer_log_softmax.py | 10 +-- 
.../test_unit_layer_mean_absolute_error.py | 14 ++-- .../test_unit_layer_mean_squared_error.py | 14 ++-- bamboo/unit_tests/test_unit_layer_one_hot.py | 2 +- bamboo/unit_tests/test_unit_layer_relu.py | 10 +-- bamboo/unit_tests/test_unit_layer_selu.py | 10 +-- bamboo/unit_tests/test_unit_layer_sigmoid.py | 10 +-- bamboo/unit_tests/test_unit_layer_slice.py | 26 +++---- bamboo/unit_tests/test_unit_layer_softmax.py | 10 +-- bamboo/unit_tests/test_unit_layer_softplus.py | 10 +-- bamboo/unit_tests/test_unit_layer_softsign.py | 10 +-- .../test_unit_layer_squared_difference.py | 14 ++-- .../unit_tests/test_unit_layer_tessellate.py | 7 +- bamboo/unit_tests/test_unit_layer_variance.py | 10 +-- python/lbann/layer.py | 51 +++++++------ python/lbann/models/resnet.py | 4 +- python/lbann/modules.py | 71 ++++++++++++------- 36 files changed, 253 insertions(+), 212 deletions(-) diff --git a/applications/vision/alexnet.py b/applications/vision/alexnet.py index 51c0ff7c4d5..2f6a97792fa 100644 --- a/applications/vision/alexnet.py +++ b/applications/vision/alexnet.py @@ -33,15 +33,15 @@ imagenet_labels = 1000 # Construct layer graph -input = lbann.Input() -images = lbann.Identity(input) -labels = lbann.Identity(input) +input_ = lbann.Input() +images = lbann.Identity(input_) +labels = lbann.Identity(input_) preds = lbann.models.AlexNet(imagenet_labels)(images) probs = lbann.Softmax(preds) -cross_entropy = lbann.CrossEntropy([probs, labels]) -top1 = lbann.CategoricalAccuracy([probs, labels]) -top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5) -layers = list(lbann.traverse_layer_graph(input)) +cross_entropy = lbann.CrossEntropy(probs, labels) +top1 = lbann.CategoricalAccuracy(probs, labels) +top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) +layers = list(lbann.traverse_layer_graph(input_)) # Setup objective function weights = set() diff --git a/applications/vision/lenet.py b/applications/vision/lenet.py index d7d7ff9b7dd..735d93febf4 100644 --- a/applications/vision/lenet.py +++ b/applications/vision/lenet.py @@ -21,9 +21,9 @@ # ---------------------------------- # Input data -input = lbann.Input() -images = lbann.Identity(input) -labels = lbann.Identity(input) +input_ = lbann.Input() +images = lbann.Identity(input_) +labels = lbann.Identity(input_) # LeNet x = lbann.Convolution(images, @@ -62,8 +62,8 @@ probs = lbann.Softmax(x) # Loss function and accuracy -loss = lbann.CrossEntropy([probs, labels]) -acc = lbann.CategoricalAccuracy([probs, labels]) +loss = lbann.CrossEntropy(probs, labels) +acc = lbann.CategoricalAccuracy(probs, labels) # ---------------------------------- # Setup experiment @@ -74,7 +74,7 @@ num_epochs = 20 model = lbann.Model(mini_batch_size, num_epochs, - layers=lbann.traverse_layer_graph(input), + layers=lbann.traverse_layer_graph(input_), objective_function=loss, metrics=[lbann.Metric(acc, name='accuracy', unit='%')], callbacks=[lbann.CallbackPrintModelDescription(), diff --git a/applications/vision/resnet.py b/applications/vision/resnet.py index 34f203ccc10..ed6d7969664 100644 --- a/applications/vision/resnet.py +++ b/applications/vision/resnet.py @@ -104,15 +104,15 @@ width=args.width) # Construct layer graph -input = lbann.Input() -images = lbann.Identity(input) -labels = lbann.Identity(input) +input_ = lbann.Input() +images = lbann.Identity(input_) +labels = lbann.Identity(input_) preds = resnet(images) probs = lbann.Softmax(preds) -cross_entropy = lbann.CrossEntropy([probs, labels]) -top1 = lbann.CategoricalAccuracy([probs, labels]) -top5 = 
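# --- Illustrative sketch (editorial addition, not part of this patch) ---
# The change applied throughout these test files: a layer's parents move
# from a single list argument to variadic positional arguments. The layers
# x0 and x1 below are hypothetical placeholders.
#
#   y = lbann.Sum([x0, x1])   # old Python frontend API: list of parents
#   y = lbann.Sum(x0, x1)     # new Python frontend API: positional parents
#
# Keyword arguments such as name= and data_layout= are unaffected.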
lbann.TopKCategoricalAccuracy([probs, labels], k=5) -layers = list(lbann.traverse_layer_graph(input)) +cross_entropy = lbann.CrossEntropy(probs, labels) +top1 = lbann.CategoricalAccuracy(probs, labels) +top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) +layers = list(lbann.traverse_layer_graph(input_)) # Setup objective function l2_reg_weights = set() diff --git a/bamboo/integration_tests/test_integration_alexnet.py b/bamboo/integration_tests/test_integration_alexnet.py index 1440d90c39e..7b188d1d31c 100644 --- a/bamboo/integration_tests/test_integration_alexnet.py +++ b/bamboo/integration_tests/test_integration_alexnet.py @@ -70,8 +70,8 @@ def construct_model(lbann): labels = lbann.Identity(input_) x = lbann.models.AlexNet(1000)(images) probs = lbann.Softmax(x) - cross_entropy = lbann.CrossEntropy([probs, labels]) - top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5) + cross_entropy = lbann.CrossEntropy(probs, labels) + top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) layers = list(lbann.traverse_layer_graph(x)) # Setup objective function diff --git a/bamboo/integration_tests/test_integration_lenet.py b/bamboo/integration_tests/test_integration_lenet.py index b3ca8b70d1b..1c81d016611 100644 --- a/bamboo/integration_tests/test_integration_lenet.py +++ b/bamboo/integration_tests/test_integration_lenet.py @@ -71,8 +71,8 @@ def construct_model(lbann): labels = lbann.Identity(input_) x = lbann.models.LeNet(10)(images) probs = lbann.Softmax(x) - loss = lbann.CrossEntropy([probs, labels]) - acc = lbann.CategoricalAccuracy([probs, labels]) + loss = lbann.CrossEntropy(probs, labels) + acc = lbann.CategoricalAccuracy(probs, labels) # Objects for LBANN model callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()] diff --git a/bamboo/integration_tests/test_integration_resnet50.py b/bamboo/integration_tests/test_integration_resnet50.py index 086afd351f3..a146ea281dd 100644 --- a/bamboo/integration_tests/test_integration_resnet50.py +++ b/bamboo/integration_tests/test_integration_resnet50.py @@ -70,8 +70,8 @@ def construct_model(lbann): labels = lbann.Identity(input_) x = lbann.models.ResNet50(1000, bn_statistics_group_size=-1)(images) probs = lbann.Softmax(x) - cross_entropy = lbann.CrossEntropy([probs, labels]) - top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5) + cross_entropy = lbann.CrossEntropy(probs, labels) + top5 = lbann.TopKCategoricalAccuracy(probs, labels, k=5) layers = list(lbann.traverse_layer_graph(x)) # Setup objective function diff --git a/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py index 748be5b7e0c..6f072c99ebc 100644 --- a/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py +++ b/bamboo/unit_tests/test_unit_layer_channelwise_scale_bias.py @@ -68,7 +68,7 @@ def construct_model(lbann): x0 = lbann.WeightsLayer(weights=x_weights, dims=tools.str_list(_sample_dims)) x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) - x = lbann.Sum([x0, x1]) + x = lbann.Sum(x0, x1) # Apply channel-wise scale/bias scale_values = tools.str_list(np.nditer(_scale)) diff --git a/bamboo/unit_tests/test_unit_layer_clamp.py b/bamboo/unit_tests/test_unit_layer_clamp.py index cb439755f9e..3402b65e3bf 100644 --- a/bamboo/unit_tests/test_unit_layer_clamp.py +++ b/bamboo/unit_tests/test_unit_layer_clamp.py @@ -66,10 +66,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. 
x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_convolution.py b/bamboo/unit_tests/test_unit_layer_convolution.py index dbf92edc585..635ba68e8de 100644 --- a/bamboo/unit_tests/test_unit_layer_convolution.py +++ b/bamboo/unit_tests/test_unit_layer_convolution.py @@ -137,10 +137,10 @@ def construct_model(lbann): x_weights = lbann.Weights(optimizer=lbann.SGD(), initializer=lbann.ConstantInitializer(value=0.0), name='input_weights') - x0 = lbann.WeightsLayer(weights=x_weights, - dims=tools.str_list(_sample_dims)) - x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) - x = lbann.Sum([x0, x1]) + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_covariance.py b/bamboo/unit_tests/test_unit_layer_covariance.py index bf5c78eddf4..bfc69c67587 100644 --- a/bamboo/unit_tests/test_unit_layer_covariance.py +++ b/bamboo/unit_tests/test_unit_layer_covariance.py @@ -66,12 +66,10 @@ def construct_model(lbann): name='input1_weights') x_slice = lbann.Slice(lbann.Input(), slice_points=tools.str_list([0, slice_size, 2*slice_size])) - x0 = lbann.Sum([x_slice, - lbann.WeightsLayer(weights=x0_weights, - dims=str(slice_size))]) - x1 = lbann.Sum([x_slice, - lbann.WeightsLayer(weights=x1_weights, - dims=str(slice_size))]) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) x0_lbann = x0 x1_lbann = x1 @@ -87,7 +85,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.Covariance([x0, x1], data_layout='data_parallel') + y = lbann.Covariance(x0, x1, data_layout='data_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='data-parallel layout, unbiased')) @@ -117,7 +115,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.Covariance([x0, x1], data_layout='model_parallel') + y = lbann.Covariance(x0, x1, data_layout='model_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) @@ -147,7 +145,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.Covariance([x0, x1], biased=True, data_layout='data_parallel') + y = lbann.Covariance(x0, x1, biased=True, data_layout='data_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='data-parallel layout, biased')) @@ -177,7 +175,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.Covariance([x0, x1], biased=True, data_layout='model_parallel') + y = lbann.Covariance(x0, x1, biased=True, data_layout='model_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='model-parallel layout, biased')) diff --git 
a/bamboo/unit_tests/test_unit_layer_cross_entropy.py b/bamboo/unit_tests/test_unit_layer_cross_entropy.py index 03272f733d8..636b6ed8ed1 100644 --- a/bamboo/unit_tests/test_unit_layer_cross_entropy.py +++ b/bamboo/unit_tests/test_unit_layer_cross_entropy.py @@ -95,12 +95,10 @@ def construct_model(lbann): name='input1_weights') x_slice = lbann.Slice(lbann.Input(), slice_points=tools.str_list([0, slice_size, 2*slice_size])) - x0 = lbann.Sum([x_slice, - lbann.WeightsLayer(weights=x0_weights, - dims=str(slice_size))]) - x1 = lbann.Sum([x_slice, - lbann.WeightsLayer(weights=x1_weights, - dims=str(slice_size))]) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) x0_lbann = x0 x1_lbann = x1 @@ -116,7 +114,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.CrossEntropy([x0, x1], data_layout='data_parallel') + y = lbann.CrossEntropy(x0, x1, data_layout='data_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='data-parallel layout')) @@ -146,7 +144,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.CrossEntropy([x0, x1], data_layout='model_parallel') + y = lbann.CrossEntropy(x0, x1, data_layout='model_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='model-parallel layout')) diff --git a/bamboo/unit_tests/test_unit_layer_elu.py b/bamboo/unit_tests/test_unit_layer_elu.py index 2f2e94d8ee3..7c34d2ce8b8 100644 --- a/bamboo/unit_tests/test_unit_layer_elu.py +++ b/bamboo/unit_tests/test_unit_layer_elu.py @@ -64,10 +64,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py index 77fd72ae22b..e1fd6ca0490 100644 --- a/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py +++ b/bamboo/unit_tests/test_unit_layer_entrywise_batch_normalization.py @@ -65,12 +65,13 @@ def construct_model(lbann): # the zero-valued tensor by the mini-batch index. 
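# --- Editorial note (hedged sketch, not part of this patch) ---
# Why this test scales the zero-valued weights by the mini-batch index:
# with w == 0, forward propagation sees x + w*(i/B) == x (i = mini-batch
# index, B = mini-batch size), so the test data is unperturbed; but the
# derivative with respect to w is i/B, which differs across samples, so
# gradient checking exercises per-sample error signals through the batch
# normalization statistics. A minimal sketch, assuming hypothetical
# sample dims `dims`, input `x`, and zero-weights layer `w_layer`:
#
#   scale = lbann.Divide(lbann.MiniBatchIndex(), lbann.MiniBatchSize())
#   scale = lbann.Tessellate(lbann.Reshape(scale, dims='1 1 1'), dims=dims)
#   x = lbann.Sum(x, lbann.Multiply(w_layer, scale))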
x = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims))
     x_weights = lbann.Weights(optimizer=lbann.SGD(),
-                              initializer=lbann.ConstantInitializer(value=0.0))
+                              initializer=lbann.ConstantInitializer(value=0.0),
+                              name='input_weights')
     x0 = lbann.WeightsLayer(weights=x_weights, dims=tools.str_list(_sample_dims))
-    x1 = lbann.Divide([lbann.MiniBatchIndex(), lbann.MiniBatchSize()])
+    x1 = lbann.Divide(lbann.MiniBatchIndex(), lbann.MiniBatchSize())
     x1 = lbann.Tessellate(lbann.Reshape(x1, dims='1 1 1'),
                           dims=tools.str_list(_sample_dims))
-    x = lbann.Sum([x, lbann.Multiply([x0, x1])])
+    x = lbann.Sum(x, lbann.Multiply(x0, x1))
     x_lbann = x
 
     # Objects for LBANN model
diff --git a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py
index 75e2cdd5bde..38a2489a2ff 100644
--- a/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py
+++ b/bamboo/unit_tests/test_unit_layer_entrywise_scale_bias.py
@@ -63,11 +63,12 @@ def construct_model(lbann):
     # Note: Sum with a weights layer so that gradient checking will
     # verify that error signals are correct.
     x_weights = lbann.Weights(optimizer=lbann.SGD(),
-                              initializer=lbann.ConstantInitializer(value=0.0))
-    x0 = lbann.WeightsLayer(weights=x_weights,
-                            dims=tools.str_list(_sample_dims))
-    x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims))
-    x = lbann.Sum([x0, x1])
+                              initializer=lbann.ConstantInitializer(value=0.0),
+                              name='input_weights')
+    x = lbann.Sum(lbann.Reshape(lbann.Input(),
+                                dims=tools.str_list(_sample_dims)),
+                  lbann.WeightsLayer(weights=x_weights,
+                                     dims=tools.str_list(_sample_dims)))
     x_lbann = x
 
     # Objects for LBANN model
diff --git a/bamboo/unit_tests/test_unit_layer_fully_connected.py b/bamboo/unit_tests/test_unit_layer_fully_connected.py
index 47469c1d499..175aed36cc3 100644
--- a/bamboo/unit_tests/test_unit_layer_fully_connected.py
+++ b/bamboo/unit_tests/test_unit_layer_fully_connected.py
@@ -61,10 +61,12 @@ def construct_model(lbann):
     # Note: Sum with a weights layer so that gradient checking will
     # verify that error signals are correct.
     x_weights = lbann.Weights(optimizer=lbann.SGD(),
-                              initializer=lbann.ConstantInitializer(value=0.0))
-    x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_input_size))
-    x1 = lbann.Identity(lbann.Input())
-    x = lbann.Sum([x0, x1])
+                              initializer=lbann.ConstantInitializer(value=0.0),
+                              name='input_weights')
+    x = lbann.Sum(lbann.Reshape(lbann.Input(),
+                                dims=tools.str_list(_input_size)),
+                  lbann.WeightsLayer(weights=x_weights,
+                                     dims=tools.str_list(_input_size)))
     x_lbann = x
 
     # Objects for LBANN model
diff --git a/bamboo/unit_tests/test_unit_layer_identity.py b/bamboo/unit_tests/test_unit_layer_identity.py
index 0bc14a7dcb1..af52185ebc8 100644
--- a/bamboo/unit_tests/test_unit_layer_identity.py
+++ b/bamboo/unit_tests/test_unit_layer_identity.py
@@ -60,10 +60,12 @@ def construct_model(lbann):
     # Note: Sum with a weights layer so that gradient checking will
     # verify that error signals are correct.
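# --- Illustrative sketch (editorial addition, not part of this patch) ---
# The recurring pattern in these tests: sum the input with an all-zero
# WeightsLayer. Gradient checking only perturbs weights, so this leaves
# the forward pass unchanged while still routing an error signal back
# through the layer under test. Minimal form, assuming a hypothetical
# sample size `size`:
#
#   w = lbann.Weights(optimizer=lbann.SGD(),
#                     initializer=lbann.ConstantInitializer(value=0.0))
#   x = lbann.Sum(lbann.Reshape(lbann.Input(), dims=str(size)),
#                 lbann.WeightsLayer(weights=w, dims=str(size)))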
x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_l1_norm.py b/bamboo/unit_tests/test_unit_layer_l1_norm.py index 06c7a517371..8d30e2d44fd 100644 --- a/bamboo/unit_tests/test_unit_layer_l1_norm.py +++ b/bamboo/unit_tests/test_unit_layer_l1_norm.py @@ -64,10 +64,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_leaky_relu.py b/bamboo/unit_tests/test_unit_layer_leaky_relu.py index b848266d19e..dee415e5f7b 100644 --- a/bamboo/unit_tests/test_unit_layer_leaky_relu.py +++ b/bamboo/unit_tests/test_unit_layer_leaky_relu.py @@ -64,10 +64,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py index 6e3c6717732..04f207b2f26 100644 --- a/bamboo/unit_tests/test_unit_layer_log_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_log_sigmoid.py @@ -62,10 +62,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. 
x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_log_softmax.py b/bamboo/unit_tests/test_unit_layer_log_softmax.py index c135c316dc9..17fb0523d14 100644 --- a/bamboo/unit_tests/test_unit_layer_log_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_log_softmax.py @@ -75,10 +75,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py index e196ce1289a..a3645248ced 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_absolute_error.py @@ -69,12 +69,10 @@ def construct_model(lbann): name='input1_weights') x_slice = lbann.Slice(lbann.Input(), slice_points=tools.str_list([0, slice_size, 2*slice_size])) - x0 = lbann.Sum([x_slice, - lbann.WeightsLayer(weights=x0_weights, - dims=str(slice_size))]) - x1 = lbann.Sum([x_slice, - lbann.WeightsLayer(weights=x1_weights, - dims=str(slice_size))]) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) x0_lbann = x0 x1_lbann = x1 @@ -90,7 +88,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.MeanAbsoluteError([x0, x1], data_layout='data_parallel') + y = lbann.MeanAbsoluteError(x0, x1, data_layout='data_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='data-parallel layout')) @@ -120,7 +118,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.MeanAbsoluteError([x0, x1], data_layout='model_parallel') + y = lbann.MeanAbsoluteError(x0, x1, data_layout='model_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) diff --git a/bamboo/unit_tests/test_unit_layer_mean_squared_error.py b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py index 343658538c6..d024f48fa2f 100644 --- a/bamboo/unit_tests/test_unit_layer_mean_squared_error.py +++ b/bamboo/unit_tests/test_unit_layer_mean_squared_error.py @@ -66,12 +66,10 @@ def construct_model(lbann): name='input1_weights') x_slice = lbann.Slice(lbann.Input(), slice_points=tools.str_list([0, slice_size, 2*slice_size])) - x0 = lbann.Sum([x_slice, - lbann.WeightsLayer(weights=x0_weights, - dims=str(slice_size))]) - x1 = lbann.Sum([x_slice, - 
lbann.WeightsLayer(weights=x1_weights, - dims=str(slice_size))]) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) x0_lbann = x0 x1_lbann = x1 @@ -87,7 +85,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.MeanSquaredError([x0, x1], data_layout='data_parallel') + y = lbann.MeanSquaredError(x0, x1, data_layout='data_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='data-parallel layout')) @@ -117,7 +115,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.MeanSquaredError([x0, x1], data_layout='model_parallel') + y = lbann.MeanSquaredError(x0, x1, data_layout='model_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) diff --git a/bamboo/unit_tests/test_unit_layer_one_hot.py b/bamboo/unit_tests/test_unit_layer_one_hot.py index 2fe1e459475..ccdc9d29aa8 100644 --- a/bamboo/unit_tests/test_unit_layer_one_hot.py +++ b/bamboo/unit_tests/test_unit_layer_one_hot.py @@ -60,7 +60,7 @@ def construct_model(lbann): y1 = lbann.OneHot(x, size=one_hot_size) y2 = lbann.Concatenation([lbann.Constant(value=i+1, num_neurons='1') for i in range(one_hot_size)]) - y = lbann.Multiply([y1, y2]) + y = lbann.Multiply(y1, y2) z = lbann.L2Norm2(y) # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_relu.py b/bamboo/unit_tests/test_unit_layer_relu.py index 897642d454f..7fa888f1db2 100644 --- a/bamboo/unit_tests/test_unit_layer_relu.py +++ b/bamboo/unit_tests/test_unit_layer_relu.py @@ -64,10 +64,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_selu.py b/bamboo/unit_tests/test_unit_layer_selu.py index d46ce681223..261c95538bf 100644 --- a/bamboo/unit_tests/test_unit_layer_selu.py +++ b/bamboo/unit_tests/test_unit_layer_selu.py @@ -80,10 +80,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. 
x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_sigmoid.py b/bamboo/unit_tests/test_unit_layer_sigmoid.py index bb4a946a2f1..e49de15ef8a 100644 --- a/bamboo/unit_tests/test_unit_layer_sigmoid.py +++ b/bamboo/unit_tests/test_unit_layer_sigmoid.py @@ -62,10 +62,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_slice.py b/bamboo/unit_tests/test_unit_layer_slice.py index b21cbbdbd58..3e9fce5a63d 100644 --- a/bamboo/unit_tests/test_unit_layer_slice.py +++ b/bamboo/unit_tests/test_unit_layer_slice.py @@ -58,23 +58,23 @@ def construct_model(lbann): """ - # LBANN objects + # Input data + # Note: Sum with a weights layer so that gradient checking will + # verify that error signals are correct. + x_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) + x_lbann = x + + # Objects for LBANN model obj = [] metrics = [] callbacks = [] - # -------------------------- - # LBANN input data - # -------------------------- - # Note: Sum with a weights layer so that gradient checking will - # verify that error signals are correct. - w = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=w, - dims=tools.str_list(_sample_dims)) - x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) - x_lbann = lbann.Sum([x0, x1]) - # -------------------------- # Slice along axis 0 # -------------------------- diff --git a/bamboo/unit_tests/test_unit_layer_softmax.py b/bamboo/unit_tests/test_unit_layer_softmax.py index 2654cbba768..8d0ece4fd2b 100644 --- a/bamboo/unit_tests/test_unit_layer_softmax.py +++ b/bamboo/unit_tests/test_unit_layer_softmax.py @@ -76,10 +76,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. 
x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_softplus.py b/bamboo/unit_tests/test_unit_layer_softplus.py index f0e52881db1..324865b5c1c 100644 --- a/bamboo/unit_tests/test_unit_layer_softplus.py +++ b/bamboo/unit_tests/test_unit_layer_softplus.py @@ -62,10 +62,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_softsign.py b/bamboo/unit_tests/test_unit_layer_softsign.py index 2f20c51364d..276668d4fe7 100644 --- a/bamboo/unit_tests/test_unit_layer_softsign.py +++ b/bamboo/unit_tests/test_unit_layer_softsign.py @@ -60,10 +60,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. 
x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_squared_difference.py b/bamboo/unit_tests/test_unit_layer_squared_difference.py index 885cc9a8c8b..f758fb59ef3 100644 --- a/bamboo/unit_tests/test_unit_layer_squared_difference.py +++ b/bamboo/unit_tests/test_unit_layer_squared_difference.py @@ -66,12 +66,10 @@ def construct_model(lbann): name='input1_weights') x_slice = lbann.Slice(lbann.Input(), slice_points=tools.str_list([0, slice_size, 2*slice_size])) - x0 = lbann.Sum([x_slice, - lbann.WeightsLayer(weights=x0_weights, - dims=str(slice_size))]) - x1 = lbann.Sum([x_slice, - lbann.WeightsLayer(weights=x1_weights, - dims=str(slice_size))]) + x0 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x0_weights, dims=str(slice_size))) + x1 = lbann.Sum(x_slice, + lbann.WeightsLayer(weights=x1_weights, dims=str(slice_size))) x0_lbann = x0 x1_lbann = x1 @@ -87,7 +85,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.SquaredDifference([x0, x1], data_layout='data_parallel') + y = lbann.SquaredDifference(x0, x1, data_layout='data_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='data-parallel layout')) @@ -117,7 +115,7 @@ def construct_model(lbann): # LBANN implementation x0 = x0_lbann x1 = x1_lbann - y = lbann.SquaredDifference([x0, x1], data_layout='model_parallel') + y = lbann.SquaredDifference(x0, x1, data_layout='model_parallel') z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='model-parallel layout, unbiased')) diff --git a/bamboo/unit_tests/test_unit_layer_tessellate.py b/bamboo/unit_tests/test_unit_layer_tessellate.py index 86e7b81cc5c..96c3fd65aee 100644 --- a/bamboo/unit_tests/test_unit_layer_tessellate.py +++ b/bamboo/unit_tests/test_unit_layer_tessellate.py @@ -64,9 +64,10 @@ def construct_model(lbann): x_weights = lbann.Weights(optimizer=lbann.SGD(), initializer=lbann.ConstantInitializer(value=0.0), name='input_weights') - x0 = lbann.WeightsLayer(weights=x_weights, dims=tools.str_list(_sample_dims)) - x1 = lbann.Reshape(lbann.Input(), dims=tools.str_list(_sample_dims)) - x = lbann.Sum([x0, x1]) + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_dims)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_dims))) x_lbann = x # Objects for LBANN model diff --git a/bamboo/unit_tests/test_unit_layer_variance.py b/bamboo/unit_tests/test_unit_layer_variance.py index aac27260a37..e8fad2980e7 100644 --- a/bamboo/unit_tests/test_unit_layer_variance.py +++ b/bamboo/unit_tests/test_unit_layer_variance.py @@ -60,10 +60,12 @@ def construct_model(lbann): # Note: Sum with a weights layer so that gradient checking will # verify that error signals are correct. 
x_weights = lbann.Weights(optimizer=lbann.SGD(), - initializer=lbann.ConstantInitializer(value=0.0)) - x0 = lbann.WeightsLayer(weights=x_weights, dims=str(_sample_size)) - x1 = lbann.Identity(lbann.Input()) - x = lbann.Sum([x0, x1]) + initializer=lbann.ConstantInitializer(value=0.0), + name='input_weights') + x = lbann.Sum(lbann.Reshape(lbann.Input(), + dims=tools.str_list(_sample_size)), + lbann.WeightsLayer(weights=x_weights, + dims=tools.str_list(_sample_size))) x_lbann = x # Objects for LBANN model diff --git a/python/lbann/layer.py b/python/lbann/layer.py index 3248873ca1a..0bb94bcfdfb 100644 --- a/python/lbann/layer.py +++ b/python/lbann/layer.py @@ -1,15 +1,31 @@ """Neural network tensor operations.""" import abc -from lbann import layers_pb2 +import lbann from lbann.util import make_iterable import lbann.util.class_generator class Layer(abc.ABC): - """Neural network tensor operation.""" + """Neural network tensor operation. + + Args: + *args (Layer): Parent layers, i.e. sources of input tensors. + parents (Iterable of Layer, optional): Sources of input + tensors. + children (Iterable of Layer, optional): Destinations of output + tensors. + weights (Iterable of Weights, optional): Trainable parameters. + name (str, optional): Unique identifier (default is + 'layer'). + device (str, optional): Device to use, e.g. CPU or GPU. + data_layout (str, optional): Data distribution scheme. + hint_layer (Layer, optional): Hint for output dimensions. + + """ global_count = 0 # Static counter, used for default names def __init__(self, + *args, parents=[], children=[], weights=[], @@ -17,22 +33,6 @@ def __init__(self, device=None, data_layout=None, hint_layer=None): - """Constructor. - - Args: - parents (Iterable of Layer, optional): Sources of input - tensors. - children (Iterable of Layer, optional): Destinations of - output tensors. - weights (Iterable of Weights, optional): Trainable - parameters. - name (str, optional): Unique identifier (default is - 'layer'). - device (str, optional): Device to use, e.g. CPU or GPU. - data_layout (str, optional): Data distribution scheme. - hint_layer (Layer, optional): Hint for output dimensions. 
- - """ Layer.global_count += 1 self.parents = [] self.children = [] @@ -43,16 +43,15 @@ def __init__(self, self.hint_layer = hint_layer # Initialize parents, children, and weights - for l in make_iterable(parents): - self.add_parent(l) - for l in make_iterable(children): - self.add_child(l) - for w in make_iterable(weights): - self.add_weights(w) + for arg in args: + self.add_parent(arg) + self.add_parent(parents) + self.add_child(children) + self.add_weights(weights) def export_proto(self): """Construct and return a protobuf message.""" - proto = layers_pb2.Layer() + proto = lbann.layers_pb2.Layer() proto.parents = ' '.join([l.name for l in self.parents]) proto.children = ' '.join([l.name for l in self.children]) proto.weights = ' '.join([w.name for w in self.weights]) @@ -93,7 +92,7 @@ def __call__(self, parent): # Note: The list of skip fields must be updated if any new fields are # added to the Layer message in lbann.proto classes = lbann.util.class_generator.generate_classes_from_protobuf_message( - layers_pb2.Layer, + lbann.layers_pb2.Layer, skip_fields = set([ 'name', 'parents', 'children', 'data_layout', 'device_allocation', 'weights', 'num_neurons_from_data_reader', 'freeze', 'hint_layer', diff --git a/python/lbann/models/resnet.py b/python/lbann/models/resnet.py index 0caeb150473..6277bbcde1e 100644 --- a/python/lbann/models/resnet.py +++ b/python/lbann/models/resnet.py @@ -130,7 +130,7 @@ def forward(self, x): self.instance += 1 y1 = self.branch1(x) if self.branch1 else x y2 = self.branch2b(self.branch2a(x)) - z = lbann.Add([y1, y2], + z = lbann.Add(y1, y2, name='{0}_sum_instance{1}'.format(self.name,self.instance)) return lbann.Relu(z, name='{0}_relu_instance{1}'.format(self.name,self.instance)) @@ -198,7 +198,7 @@ def forward(self, x): self.instance += 1 y1 = self.branch1(x) if self.branch1 else x y2 = self.branch2c(self.branch2b(self.branch2a(x))) - z = lbann.Add([y1, y2], + z = lbann.Add(y1, y2, name='{0}_sum_instance{1}'.format(self.name,self.instance)) return lbann.Relu(z, name='{0}_relu_instance{1}'.format(self.name,self.instance)) diff --git a/python/lbann/modules.py b/python/lbann/modules.py index b04cfeb05da..72404b1d745 100644 --- a/python/lbann/modules.py +++ b/python/lbann/modules.py @@ -318,7 +318,7 @@ def forward(self, x, prev_state): prev_output, prev_cell = prev_state # Apply linearity - input_concat = lbann.Concatenation([x, prev_output], + input_concat = lbann.Concatenation(x, prev_output, name=name + '_input', data_layout=self.data_layout) fc = self.fc(input_concat) @@ -346,19 +346,19 @@ def forward(self, x, prev_state): data_layout=self.data_layout) # Cell state - cell_forget = lbann.Multiply([f, prev_cell], + cell_forget = lbann.Multiply(f, prev_cell, name=name + '_cell_forget', data_layout=self.data_layout) - cell_input = lbann.Multiply([i, cell_update], + cell_input = lbann.Multiply(i, cell_update, name=name + '_cell_input', data_layout=self.data_layout) - cell = lbann.Add([cell_forget, cell_input], name=name + '_cell', + cell = lbann.Add(cell_forget, cell_input, name=name + '_cell', data_layout=self.data_layout) # Output cell_act = lbann.Tanh(cell, name=name + '_cell_activation', data_layout=self.data_layout) - output = lbann.Multiply([o, cell_act], name=name, + output = lbann.Multiply(o, cell_act, name=name, data_layout=self.data_layout) # Return output and state @@ -445,7 +445,7 @@ def forward(self, x, prev_state): prev_state: State from previous GRU step. Returns: - (Layer, Layer): The output (out) and state (hn). 
+ (Layer, Layer): The output (out) and state (hn). The state can be passed directly into the next GRU step. @@ -480,26 +480,45 @@ def forward(self, x, prev_state): data_layout=self.data_layout) Whn_prev = lbann.Identity(fc2_slice, name=name + '_Wnh', data_layout=self.data_layout) - - rt = lbann.Sigmoid(lbann.Add([Wir_x,Whr_prev], data_layout=self.data_layout), name=name + '_reset_gate', - data_layout=self.data_layout) - zt = lbann.Sigmoid(lbann.Add([Wiz_x,Whz_prev], data_layout=self.data_layout), name=name + '_update_gate', - data_layout=self.data_layout) - - nt = lbann.Tanh(lbann.Add([Win_x, - lbann.Multiply([rt,Whn_prev], data_layout=self.data_layout)], data_layout=self.data_layout), - name=name + '_new_gate', data_layout=self.data_layout) - - ht = lbann.Add([ - lbann.Multiply([ - lbann.WeightedSum([ - lbann.Constant(value=1.0, hint_layer=zt, data_layout=self.data_layout), - zt], - scaling_factors='1 -1', data_layout=self.data_layout), - nt], data_layout=self.data_layout), - lbann.Multiply([zt,prev_state], data_layout=self.data_layout)], name=name+ '_output', - data_layout=self.data_layout) - + rt = \ + lbann.Sigmoid( + lbann.Add(Wir_x, Whr_prev, data_layout=self.data_layout), + name=name + '_reset_gate', + data_layout=self.data_layout + ) + + zt = \ + lbann.Sigmoid( + lbann.Add(Wiz_x, Whz_prev, data_layout=self.data_layout), + name=name + '_update_gate', + data_layout=self.data_layout, + ) + + nt = \ + lbann.Tanh( + lbann.Add( + Win_x, + lbann.Multiply(rt, Whn_prev, data_layout=self.data_layout), + data_layout=self.data_layout, + ), + name=name + '_new_gate', data_layout=self.data_layout, + ) + + ht = \ + lbann.Add( + lbann.Multiply( + lbann.WeightedSum( + lbann.Constant(value=1.0, hint_layer=zt, data_layout=self.data_layout), + zt, + scaling_factors='1 -1', data_layout=self.data_layout + ), + nt, + data_layout=self.data_layout + ), + lbann.Multiply(zt, prev_state, data_layout=self.data_layout), + name=name+ '_output', data_layout=self.data_layout, + ) + # Return output return ht, ht From 3c73c81e6726a11249c73606659dff1f0cc06cf9 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 13 Nov 2019 11:18:55 -0800 Subject: [PATCH 394/634] Implement matrix multiply layer (#1346) * Implement CPU matrix multiplication layer * Implement GPU matrix multiplication layer * Tweak matmul layer documentation * Update matmul Bamboo test with Python API change from PR #1348 --- bamboo/unit_tests/test_unit_layer_matmul.py | 178 ++++++++++++++++++ include/lbann/layers/math/CMakeLists.txt | 1 + include/lbann/layers/math/matmul.hpp | 170 +++++++++++++++++ include/lbann/utils/cublas.hpp | 13 +- src/layers/math/CMakeLists.txt | 1 + src/layers/math/matmul.cpp | 194 ++++++++++++++++++++ src/proto/factories/layer_factory.cpp | 9 + src/proto/layers.proto | 9 + src/utils/cublas.cpp | 24 ++- 9 files changed, 596 insertions(+), 3 deletions(-) create mode 100644 bamboo/unit_tests/test_unit_layer_matmul.py create mode 100644 include/lbann/layers/math/matmul.hpp create mode 100644 src/layers/math/matmul.cpp diff --git a/bamboo/unit_tests/test_unit_layer_matmul.py b/bamboo/unit_tests/test_unit_layer_matmul.py new file mode 100644 index 00000000000..4e0344853b6 --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_matmul.py @@ -0,0 +1,178 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) 
+import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. + +# Data +np.random.seed(20191111) +_m = 11 +_n = 3 +_k = 5 +_samples = np.random.normal(size=(27,_m*_k+_k*_n)).astype(np.float32) + +# Sample access functions +def get_sample(index): + return _samples[index].reshape(-1) +def num_samples(): + return _samples.shape[0] +def sample_dims(): + return (_samples.shape[-1],) + +# ============================================== +# Setup LBANN experiment +# ============================================== + +def setup_experiment(lbann): + """Construct LBANN experiment. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + trainer = lbann.Trainer() + model = construct_model(lbann) + data_reader = construct_data_reader(lbann) + optimizer = lbann.NoOptimizer() + return trainer, model, data_reader, optimizer + +def construct_model(lbann): + """Construct LBANN model. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Input data + # Note: Sum with weights layers so that gradient checking will + # verify that error signals are correct. + x0_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input0_weights') + x1_weights = lbann.Weights(optimizer=lbann.SGD(), + initializer=lbann.ConstantInitializer(value=0.0), + name='input1_weights') + x_slice = lbann.Slice(lbann.Input(), + slice_points=tools.str_list([0, _m*_k, _m*_k+_k*_n])) + x0 = lbann.Sum(lbann.Reshape(x_slice, + dims=tools.str_list([_m, _k])), + lbann.WeightsLayer(weights=x0_weights, + dims=tools.str_list([_m, _k]))) + x1 = lbann.Sum(lbann.Reshape(x_slice, + dims=tools.str_list([_k, _n])), + lbann.WeightsLayer(weights=x1_weights, + dims=tools.str_list([_k, _n]))) + x0_lbann = x0 + x1_lbann = x1 + + # Objects for LBANN model + obj = [] + metrics = [] + callbacks = [] + + # ------------------------------------------ + # NN GEMM + # ------------------------------------------ + + # LBANN implementation + x0 = x0_lbann + x1 = x1_lbann + y = lbann.MatMul(x0, x1, data_layout='data_parallel') + z = lbann.L2Norm2(y) + obj.append(z) + metrics.append(lbann.Metric(z, name='NN GEMM')) + + # NumPy implementation + vals = [] + for i in range(num_samples()): + x = get_sample(i).astype(np.float64) + x0 = x[:_m*_k].reshape([_m,_k]) + x1 = x[_m*_k:].reshape([_k,_n]) + y = np.matmul(x0, x1) + z = tools.numpy_l2norm2(y) + vals.append(z) + val = np.mean(vals) + tol = 8 * val * np.finfo(np.float32).eps + callbacks.append(lbann.CallbackCheckMetric( + metric=metrics[-1].name, + lower_bound=val-tol, + upper_bound=val+tol, + error_on_failure=True, + execution_modes='test')) + + # ------------------------------------------ + # Gradient checking + # ------------------------------------------ + + callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True)) + + # ------------------------------------------ + # Construct model + # ------------------------------------------ + + mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x0_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. 
+ + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/include/lbann/layers/math/CMakeLists.txt b/include/lbann/layers/math/CMakeLists.txt index a6d19112716..ca9b3d9461b 100644 --- a/include/lbann/layers/math/CMakeLists.txt +++ b/include/lbann/layers/math/CMakeLists.txt @@ -3,6 +3,7 @@ set_full_path(THIS_DIR_HEADERS unary.hpp binary.hpp clamp.hpp + matmul.hpp ) # Propagate the files up the tree diff --git a/include/lbann/layers/math/matmul.hpp b/include/lbann/layers/math/matmul.hpp new file mode 100644 index 00000000000..77b3e711c1b --- /dev/null +++ b/include/lbann/layers/math/matmul.hpp @@ -0,0 +1,170 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYER_MATH_MATMUL_HPP_INCLUDED +#define LBANN_LAYER_MATH_MATMUL_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +namespace lbann { + +/** @brief Matrix multiplication. + * + * Takes two 2D input tensors and outputs their matrix product. + * Matrix products are computed independently for each mini-batch + * sample, in a similar manner as NumPy's matmul function. 
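+ *
+ * For example, per-sample inputs of shapes (m, k) and (k, n) produce
+ * a per-sample output of shape (m, n), just as numpy.matmul would
+ * for each sample.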
+ * + * @todo Support >2 dimensions, transposes, matvecs, and dot products + * + */ +template +class matmul_layer : public Layer { + static_assert(Layout == data_layout::DATA_PARALLEL, + "matmul_layer only supports " + "data-parallel data layout"); + +public: + + matmul_layer(lbann_comm *comm); + matmul_layer(const matmul_layer& other) = default; + matmul_layer& operator=(const matmul_layer& other) = default; + matmul_layer* copy() const override; + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + +protected: + + void setup_dims() override; + void fp_compute() override; + void bp_compute() override; + +}; + +// ========================================================= +// Implementation +// ========================================================= + +template +matmul_layer::matmul_layer(lbann_comm *comm) + : Layer(comm) { + this->m_expected_num_parent_layers = 2; +} + +template +matmul_layer* matmul_layer::copy() const { + return new matmul_layer(*this); +} + +template +std::string matmul_layer::get_type() const { + return "matrix multiply"; +} + +template +data_layout matmul_layer::get_data_layout() const { + return Layout; +} + +template +El::Device matmul_layer::get_device_allocation() const { + return Device; +} + +template +void matmul_layer::setup_dims() { + Layer::setup_dims(); + + // Input dimensions + const auto& input0_dims = this->get_input_dims(0); + const auto& input1_dims = this->get_input_dims(1); + + // Lambdas to help print error messages + auto print_name = [this] () -> std::string { + return this->get_type() + " layer \"" + this->get_name() + "\""; + }; + auto print_inputs = [this, &input0_dims, &input1_dims] () -> std::string { + auto print_dims = [] (const decltype(input0_dims)& dims) -> std::string { + std::ostringstream ss; + for (size_t i = 0; i < dims.size(); ++i) { + ss << (i > 0 ? 
"x" : "") << dims[i]; + } + return ss.str(); + }; + const auto& parents = this->get_parent_layers(); + return lbann::build_string( + parents[0]->get_type()," layer \"",parents[0]->get_name(),"\" ", + "outputs ",print_dims(input0_dims),", ", + parents[1]->get_type()," layer \"",parents[1]->get_name(),"\" ", + "outputs ",print_dims(input1_dims)); + }; + + // Check input dimensions + if (input0_dims.size() != input1_dims.size()) { + LBANN_ERROR("input tensors in ",print_name()," " + "have different numbers of dimensions ", + "(",print_inputs(),")"); + } + if (input0_dims.size() != 2) { + LBANN_ERROR("input tensors in ",print_name()," are not 2D ", + "(",print_inputs(),")"); + } + + // Get dimensions for matrix multiply + const auto m = *(input0_dims.rbegin()+1); + const auto n = *(input1_dims.rbegin()); + const auto k = *(input0_dims.rbegin()); + if (*(input1_dims.rbegin()+1) != k || m < 1 || n < 1 || k < 1) { + LBANN_ERROR("input tensors in ",print_name()," ", + "are not compatible with matrix multiplication ", + "(",print_inputs(),")"); + } + + // Set output dimensions + std::vector output_dims(input0_dims); + *(output_dims.rbegin()+1) = m; + *(output_dims.rbegin()) = n; + this->set_output_dims(output_dims); + +} + +// ========================================================= +// Explicit template instantiation +// ========================================================= + +#ifndef LBANN_MATMUL_LAYER_INSTANTIATE +extern template class matmul_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class matmul_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_MATMUL_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYER_MATH_MATMUL_HPP_INCLUDED diff --git a/include/lbann/utils/cublas.hpp b/include/lbann/utils/cublas.hpp index 0122b8f697f..2439db8fda6 100644 --- a/include/lbann/utils/cublas.hpp +++ b/include/lbann/utils/cublas.hpp @@ -155,7 +155,18 @@ void geam(cublasHandle_t const& handle, DataType beta, DataType const * B, int ldb, DataType * C, int ldc); - +void gemm_strided_batched(cublasHandle_t const& handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + DataType alpha, + DataType const * A, int lda, + long long int strideA, + DataType const * B, int ldb, + long long int strideB, + DataType beta, + DataType * C, int ldc, + long long int strideC, + int batchCount); } // namespace cublas } // namespace lbann diff --git a/src/layers/math/CMakeLists.txt b/src/layers/math/CMakeLists.txt index 22125a51af2..6318913c367 100644 --- a/src/layers/math/CMakeLists.txt +++ b/src/layers/math/CMakeLists.txt @@ -3,6 +3,7 @@ set_full_path(THIS_DIR_SOURCES unary.cpp binary.cpp clamp.cpp + matmul.cpp ) if (LBANN_HAS_CUDA) diff --git a/src/layers/math/matmul.cpp b/src/layers/math/matmul.cpp new file mode 100644 index 00000000000..8745eced9cd --- /dev/null +++ b/src/layers/math/matmul.cpp @@ -0,0 +1,194 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_MATMUL_LAYER_INSTANTIATE +#include "lbann/layers/math/matmul.hpp" +#ifdef LBANN_HAS_GPU +#include "lbann/utils/cublas.hpp" +#endif // LBANN_HAS_GPU + +namespace lbann { + +template <> +void matmul_layer::fp_compute() { + + // Local data + const auto& local_input0 = dynamic_cast(get_local_prev_activations(0)); + const auto& local_input1 = dynamic_cast(get_local_prev_activations(1)); + auto& local_output = dynamic_cast(get_local_activations()); + const auto& local_mini_batch_size = local_input0.Width(); + + // Matrix dimensions + const auto output_dims = this->get_output_dims(); + const auto input0_dims = this->get_input_dims(0); + const El::Int m = *(output_dims.rbegin()+1); + const El::Int n = *(output_dims.rbegin()); + const El::Int k = *(input0_dims.rbegin()); + + // Compute matrix multiplication for each mini-batch sample + // Note: Elemental matrices are in Fortran layout while LBANN + // tensors are in C layout. + LBANN_OMP_PARALLEL_FOR + for (El::Int i = 0; i < local_mini_batch_size; ++i) { + CPUMat input0_v, input1_v, output_v; + input0_v.LockedAttach(k, m, local_input0.LockedBuffer(0,i), k); + input1_v.LockedAttach(n, k, local_input1.LockedBuffer(0,i), n); + output_v.Attach(n, m, local_output.Buffer(0,i), n); + El::Gemm(El::NORMAL, El::NORMAL, + DataType{1}, input1_v, input0_v, + DataType{0}, output_v); + } + +} + +template <> +void matmul_layer::bp_compute() { + + // Local data + const auto& local_input0 = dynamic_cast(get_local_prev_activations(0)); + const auto& local_input1 = dynamic_cast(get_local_prev_activations(1)); + const auto& local_output_grad = dynamic_cast(get_local_prev_error_signals()); + auto& local_input0_grad = dynamic_cast(get_local_error_signals(0)); + auto& local_input1_grad = dynamic_cast(get_local_error_signals(1)); + const auto& local_mini_batch_size = local_input0.Width(); + + // Matrix dimensions + const auto output_dims = this->get_output_dims(); + const auto input0_dims = this->get_input_dims(0); + const El::Int m = *(output_dims.rbegin()+1); + const El::Int n = *(output_dims.rbegin()); + const El::Int k = *(input0_dims.rbegin()); + + // Compute gradients for each mini-batch sample + // Note: Elemental matrices are in Fortran layout while LBANN + // tensors are in C layout. 
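+  // In C-layout terms, the loop below computes
+  //   grad_x0 = grad_y * x1^T  and  grad_x1 = x0^T * grad_y
+  // for each sample, by viewing every C-layout matrix as its
+  // transpose in Fortran layout.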
+ LBANN_OMP_PARALLEL_FOR + for (El::Int i = 0; i < local_mini_batch_size; ++i) { + CPUMat input0_v, input1_v, output_grad_v, input0_grad_v, input1_grad_v; + input0_v.LockedAttach(k, m, local_input0.LockedBuffer(0,i), k); + input1_v.LockedAttach(n, k, local_input1.LockedBuffer(0,i), n); + output_grad_v.LockedAttach(n, m, local_output_grad.LockedBuffer(0,i), n); + input0_grad_v.Attach(k, m, local_input0_grad.Buffer(0,i), k); + input1_grad_v.Attach(n, k, local_input1_grad.Buffer(0,i), n); + El::Gemm(El::TRANSPOSE, El::NORMAL, + DataType{1}, input1_v, output_grad_v, + DataType{0}, input0_grad_v); + El::Gemm(El::NORMAL, El::TRANSPOSE, + DataType{1}, output_grad_v, input0_v, + DataType{0}, input1_grad_v); + } + +} + +#ifdef LBANN_HAS_GPU +template <> +void matmul_layer::fp_compute() { + + // Local data + const auto& local_input0 = dynamic_cast(get_local_prev_activations(0)); + const auto& local_input1 = dynamic_cast(get_local_prev_activations(1)); + auto& local_output = dynamic_cast(get_local_activations()); + const auto& local_mini_batch_size = local_input0.Width(); + + // Return immediately if nothing needs to be done + if (local_mini_batch_size < 1) { return; } + + // Matrix dimensions + const auto output_dims = this->get_output_dims(); + const auto input0_dims = this->get_input_dims(0); + const El::Int m = *(output_dims.rbegin()+1); + const El::Int n = *(output_dims.rbegin()); + const El::Int k = *(input0_dims.rbegin()); + + // Compute matrix multiplication for each mini-batch sample + // Note: cuBLAS expects matrices in Fortran layout while LBANN + // tensors are in C layout. + auto&& handle = El::GPUManager::cuBLASHandle(); + cublas::gemm_strided_batched( + handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, + DataType{1}, + local_input1.LockedBuffer(), n, local_input1.LDim(), + local_input0.LockedBuffer(), k, local_input0.LDim(), + DataType{0}, + local_output.Buffer(), n, local_output.LDim(), + local_mini_batch_size); + +} +#endif // LBANN_HAS_GPU + +#ifdef LBANN_HAS_GPU +template <> +void matmul_layer::bp_compute() { + + // Local data + const auto& local_input0 = dynamic_cast(get_local_prev_activations(0)); + const auto& local_input1 = dynamic_cast(get_local_prev_activations(1)); + const auto& local_output_grad = dynamic_cast(get_local_prev_error_signals()); + auto& local_input0_grad = dynamic_cast(get_local_error_signals(0)); + auto& local_input1_grad = dynamic_cast(get_local_error_signals(1)); + const auto& local_mini_batch_size = local_input0.Width(); + + // Return immediately if nothing needs to be done + if (local_mini_batch_size < 1) { return; } + + // Matrix dimensions + const auto output_dims = this->get_output_dims(); + const auto input0_dims = this->get_input_dims(0); + const El::Int m = *(output_dims.rbegin()+1); + const El::Int n = *(output_dims.rbegin()); + const El::Int k = *(input0_dims.rbegin()); + + // Compute gradients for each mini-batch sample + // Note: cuBLAS expects matrices in Fortran layout while LBANN + // tensors are in C layout. 
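+  // Each mini-batch sample is one entry of the strided batch; the
+  // stride between consecutive samples is each local matrix's leading
+  // dimension (LDim), since every sample occupies one column of the
+  // local matrix.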
+ auto&& handle = El::GPUManager::cuBLASHandle(); + cublas::gemm_strided_batched( + handle, CUBLAS_OP_T, CUBLAS_OP_N, k, m, n, + DataType{1}, + local_input1.LockedBuffer(), n, local_input1.LDim(), + local_output_grad.LockedBuffer(), n, local_output_grad.LDim(), + DataType{0}, + local_input0_grad.Buffer(), k, local_input0_grad.LDim(), + local_mini_batch_size); + cublas::gemm_strided_batched( + handle, CUBLAS_OP_N, CUBLAS_OP_T, n, k, m, + DataType{1}, + local_output_grad.LockedBuffer(), n, local_output_grad.LDim(), + local_input0.LockedBuffer(), k, local_input0.LDim(), + DataType{0}, + local_input1_grad.Buffer(), n, local_input1_grad.LDim(), + local_mini_batch_size); + +} +#endif // LBANN_HAS_GPU + +// Explicit instantiation +template class matmul_layer; +#ifdef LBANN_HAS_GPU +template class matmul_layer; +#endif // LBANN_HAS_GPU + +} // namespace lbann diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 7c06f8b96d1..39eb0d7799d 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -55,6 +55,7 @@ #include "lbann/layers/loss/top_k_categorical_accuracy.hpp" #include "lbann/layers/math/binary.hpp" #include "lbann/layers/math/clamp.hpp" +#include "lbann/layers/math/matmul.hpp" #include "lbann/layers/math/unary.hpp" #include "lbann/layers/misc/channelwise_mean.hpp" #include "lbann/layers/misc/covariance.hpp" @@ -604,6 +605,14 @@ std::unique_ptr construct_layer( const auto& params = proto_layer.clamp(); return lbann::make_unique>(comm, params.min(), params.max()); } + if (proto_layer.has_matmul()) { + if (Layout == data_layout::DATA_PARALLEL) { + return lbann::make_unique>(comm); + } else { + LBANN_ERROR("matrix multiply layer is only supported with " + "a data-parallel layout"); + } + } // Activation layers if (proto_layer.has_elu()) { diff --git a/src/proto/layers.proto b/src/proto/layers.proto index 2695903df89..9342401d11a 100644 --- a/src/proto/layers.proto +++ b/src/proto/layers.proto @@ -147,6 +147,7 @@ message Layer { LogicalOr logical_or = 467; LogicalXor logical_xor = 468; Clamp clamp = 469; + MatMul matmul = 470; // Regularization layers BatchNormalization batch_normalization = 19; @@ -238,6 +239,14 @@ message Layer { double max = 2; } + /** @brief Matrix multiplication. + * + * Takes two 2D input tensors and outputs their matrix product. + * Matrix products are computed independently for each mini-batch + * sample, in a similar manner as NumPy's matmul function. 
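+   *
+   * From the Python frontend this corresponds to, e.g.,
+   * lbann.MatMul(x0, x1, data_layout='data_parallel').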
+ */ + message MatMul {} + /////////////////////// // Activation layers // /////////////////////// diff --git a/src/utils/cublas.cpp b/src/utils/cublas.cpp index 765b02d9ae3..d2ccc69d9dc 100644 --- a/src/utils/cublas.cpp +++ b/src/utils/cublas.cpp @@ -43,23 +43,25 @@ struct cuBLAS_Caller; template <> struct cuBLAS_Caller { WRAP_CUBLAS(cublasSaxpy, axpy) - WRAP_CUBLAS(cublasSdot , dot ) + WRAP_CUBLAS(cublasSdot, dot) WRAP_CUBLAS(cublasSnrm2, nrm2) WRAP_CUBLAS(cublasSscal, scal) WRAP_CUBLAS(cublasSgemv, gemv) WRAP_CUBLAS(cublasSgemm, gemm) WRAP_CUBLAS(cublasSgeam, geam) + WRAP_CUBLAS(cublasSgemmStridedBatched, gemm_strided_batched) }; template <> struct cuBLAS_Caller { WRAP_CUBLAS(cublasDaxpy, axpy) - WRAP_CUBLAS(cublasDdot , dot ) + WRAP_CUBLAS(cublasDdot, dot) WRAP_CUBLAS(cublasDnrm2, nrm2) WRAP_CUBLAS(cublasDscal, scal) WRAP_CUBLAS(cublasDgemv, gemv) WRAP_CUBLAS(cublasDgemm, gemm) WRAP_CUBLAS(cublasDgeam, geam) + WRAP_CUBLAS(cublasDgemmStridedBatched, gemm_strided_batched) }; } // namespace @@ -178,6 +180,24 @@ void geam(cublasHandle_t const& handle, &alpha, A, lda, &beta, B, ldb, C, ldc); } +void gemm_strided_batched(cublasHandle_t const& handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + DataType alpha, + DataType const * A, int lda, + long long int strideA, + DataType const * B, int ldb, + long long int strideB, + DataType beta, + DataType * C, int ldc, + long long int strideC, + int batchCount) { + cuBLAS_Caller{}.gemm_strided_batched( + handle, transa, transb, m, n, k, + &alpha, A, lda, strideA, B, ldb, strideB, + &beta, C, ldc, strideC, batchCount); +} + } // namespace cublas } // namespace lbann From a3e6df1ffb5deaaf7694b83e43f2b16cca47e859 Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Wed, 13 Nov 2019 12:25:03 -0800 Subject: [PATCH 395/634] update the versions for Aluminum and Hydrogen (#1349) --- CMakeLists.txt | 6 ++++-- superbuild/aluminum/CMakeLists.txt | 2 +- superbuild/hydrogen/CMakeLists.txt | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ba6e4762b56..4ceed5370be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -225,13 +225,15 @@ endif () if (LBANN_WITH_ALUMINUM) # Aluminum may have already been found by Hydrogen if (NOT Aluminum_FOUND) - find_package(Aluminum 0.2.0 NO_MODULE QUIET + message(WARNING + "Using Aluminum without Hydrogen support may not be well-supported.") + find_package(Aluminum 0.3.0 NO_MODULE QUIET HINTS ${Aluminum_DIR} ${ALUMINUM_DIR} ${AL_DIR} $ENV{Aluminum_DIR} $ENV{ALUMINUM_DIR} $ENV{AL_DIR} PATH_SUFFIXES lib64/cmake/aluminum lib/cmake/aluminum NO_DEFAULT_PATH) if (NOT Aluminum_FOUND) - find_package(Aluminum 0.2.0 NO_MODULE QUIET) + find_package(Aluminum 0.3.0 NO_MODULE QUIET) endif () endif () set(LBANN_HAS_ALUMINUM ${Aluminum_FOUND}) diff --git a/superbuild/aluminum/CMakeLists.txt b/superbuild/aluminum/CMakeLists.txt index a7b37c02672..7a637eeabf9 100644 --- a/superbuild/aluminum/CMakeLists.txt +++ b/superbuild/aluminum/CMakeLists.txt @@ -11,7 +11,7 @@ else () CACHE STRING "The URL from which to clone Aluminum") endif () -set(ALUMINUM_TAG "v0.2.1-1" +set(ALUMINUM_TAG "v0.3.2" CACHE STRING "The git tag to checkout for Aluminum") set(ALUMINUM_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" diff --git a/superbuild/hydrogen/CMakeLists.txt b/superbuild/hydrogen/CMakeLists.txt index 8ebd4d0dbad..30a714f0d39 100644 --- a/superbuild/hydrogen/CMakeLists.txt +++ b/superbuild/hydrogen/CMakeLists.txt @@ -109,7 +109,7 @@ else () 
endif () # ... then the tag. -set(HYDROGEN_TAG "v1.1.0-1" +set(HYDROGEN_TAG "v1.2.0" CACHE STRING "The git tag or hash to checkout for Hydrogen") if (HYDROGEN_CUSTOM_SOURCE_DIR) From e9e90db21bd29f4da38a9e388fc9fbc02e75bfde Mon Sep 17 00:00:00 2001 From: Tom Benson <30674819+benson31@users.noreply.github.com> Date: Wed, 13 Nov 2019 12:25:38 -0800 Subject: [PATCH 396/634] Add support for MPI Catch2 testing (#1343) * initial progress toward MPI catch testing adds a main() and required infrastructure for outputting to rank-specific files * some doxygen updates * documentation tweaks * Update unit_test/utilities/ReplaceEscapes.hpp Co-Authored-By: Tim Moon * add a simple test showing use of the global communicator * add missing license statements --- docs/SourceTreeDoxyfile | 3 +- include/lbann/utils/memory.hpp | 7 + include/lbann/utils/system_info.hpp | 88 ++++++++ src/proto/unit_test/CMakeLists.txt | 8 +- src/transforms/unit_test/CMakeLists.txt | 8 +- .../vision/unit_test/CMakeLists.txt | 8 +- src/utils/CMakeLists.txt | 1 + src/utils/system_info.cpp | 166 ++++++++++++++++ src/utils/unit_test/CMakeLists.txt | 7 +- unit_test/CMakeLists.txt | 20 +- unit_test/MPICatchMain.cpp | 72 +++++++ unit_test/example/CMakeLists.txt | 7 + unit_test/example/mpi_bcast_example_test.cpp | 56 ++++++ unit_test/utilities/CMakeLists.txt | 21 ++ unit_test/utilities/MPITestHelpers.cpp | 53 +++++ unit_test/utilities/MPITestHelpers.hpp | 80 ++++++++ unit_test/utilities/ReplaceEscapes.cpp | 115 +++++++++++ unit_test/utilities/ReplaceEscapes.hpp | 102 ++++++++++ unit_test/utilities/unit_test/CMakeLists.txt | 8 + .../unit_test/test_replace_escapes.cpp | 188 ++++++++++++++++++ 20 files changed, 1002 insertions(+), 16 deletions(-) create mode 100644 include/lbann/utils/system_info.hpp create mode 100644 src/utils/system_info.cpp create mode 100644 unit_test/MPICatchMain.cpp create mode 100644 unit_test/example/CMakeLists.txt create mode 100644 unit_test/example/mpi_bcast_example_test.cpp create mode 100644 unit_test/utilities/CMakeLists.txt create mode 100644 unit_test/utilities/MPITestHelpers.cpp create mode 100644 unit_test/utilities/MPITestHelpers.hpp create mode 100644 unit_test/utilities/ReplaceEscapes.cpp create mode 100644 unit_test/utilities/ReplaceEscapes.hpp create mode 100644 unit_test/utilities/unit_test/CMakeLists.txt create mode 100644 unit_test/utilities/unit_test/test_replace_escapes.cpp diff --git a/docs/SourceTreeDoxyfile b/docs/SourceTreeDoxyfile index 44929c99dfe..8fcbae615bd 100644 --- a/docs/SourceTreeDoxyfile +++ b/docs/SourceTreeDoxyfile @@ -763,7 +763,8 @@ WARN_LOGFILE = INPUT = ../README.md \ ../docs \ ../src \ - ../include + ../include \ + ../unit_test # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/include/lbann/utils/memory.hpp b/include/lbann/utils/memory.hpp index fc293fa819e..44982ecda4b 100644 --- a/include/lbann/utils/memory.hpp +++ b/include/lbann/utils/memory.hpp @@ -23,6 +23,13 @@ std::unique_ptr make_unique(Ts&&... params) #endif +/** @brief Convert the raw pointer to a unique_ptr. 
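+ *
+ * For example, to_unique(new int(42)) yields a unique_ptr that owns
+ * the allocation.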
*/ +template +std::unique_ptr to_unique(T* ptr) +{ + return std::unique_ptr(ptr); +} + }// namespace lbann #endif /* LBANN_MEMORY_HPP_ */ diff --git a/include/lbann/utils/system_info.hpp b/include/lbann/utils/system_info.hpp new file mode 100644 index 00000000000..9d0e960becb --- /dev/null +++ b/include/lbann/utils/system_info.hpp @@ -0,0 +1,88 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_UTILS_SYSTEM_INFO_HPP_INCLUDED +#define LBANN_UTILS_SYSTEM_INFO_HPP_INCLUDED + +#include + +namespace lbann { +namespace utils { + +/** @class SystemInfo + * @brief Query basic system information + * + * The class structure here is, strictly speaking, unnecessary. It is + * used to provide a "hook" for stubbing this information during + * testing. + */ +class SystemInfo +{ +public: + /** @brief Virtual destructor */ + virtual ~SystemInfo() noexcept = default; + + /** @brief Get the current process ID. + * + * This returns the value as a string to avoid system differences + * in `pid_t`. However, it's probably safe to return either int64_t + * or uint64_t here. + */ + virtual std::string pid() const; + + /** @brief Get the host name for this process. */ + virtual std::string host_name() const; + + /** @brief Get the MPI rank of this process. + * + * If this is not an MPI job, or cannot be determined to be an MPI + * job, this will return 0. + * + * The return type is chosen for consistency with MPI 3.0. + */ + virtual int mpi_rank() const; + + /** @brief Get the size of the MPI universe in which this process is + * participating. + * + * If this is not an MPI job, or cannot be determined to be an MPI + * job, this will return 1. + * + * The return type is chosen for consistency with MPI 3.0. + */ + virtual int mpi_size() const; + + /** @brief Get the value of the given variable from the environment. + * + * If the variable doesn't exist, the empty string is returned. 
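+   *
+   * For example, env_variable_value("PATH") returns the value of
+   * ${PATH} in the current environment, or "" if PATH is unset.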
+ */ + virtual std::string env_variable_value(std::string const& var_name) const; + +}; + +}// namespace utils +}// namespace lbann +#endif // LBANN_UTILS_SYSTEM_INFO_HPP_INCLUDED diff --git a/src/proto/unit_test/CMakeLists.txt b/src/proto/unit_test/CMakeLists.txt index 90324d96bf0..2a4e5e66bc6 100644 --- a/src/proto/unit_test/CMakeLists.txt +++ b/src/proto/unit_test/CMakeLists.txt @@ -1,8 +1,10 @@ -set_full_path(_DIR_LBANN_CATCH2_TEST_FILES +set_full_path(THIS_DIR_SEQ_CATCH2_TEST_FILES parse_list_test.cpp parse_set_test.cpp trim_test.cpp ) -set(LBANN_CATCH2_TEST_FILES - "${LBANN_CATCH2_TEST_FILES}" "${_DIR_LBANN_CATCH2_TEST_FILES}" PARENT_SCOPE) +set(LBANN_SEQ_CATCH2_TEST_FILES + "${LBANN_SEQCATCH2_TEST_FILES}" + "${THIS_DIR_SEQ_CATCH2_TEST_FILES}" + PARENT_SCOPE) diff --git a/src/transforms/unit_test/CMakeLists.txt b/src/transforms/unit_test/CMakeLists.txt index c93aeb4207c..bb7b8e9b201 100644 --- a/src/transforms/unit_test/CMakeLists.txt +++ b/src/transforms/unit_test/CMakeLists.txt @@ -1,9 +1,11 @@ -set_full_path(_DIR_LBANN_CATCH2_TEST_FILES +set_full_path(THIS_DIR_SEQ_CATCH2_TEST_FILES normalize_test.cpp sample_normalize_test.cpp scale_test.cpp transform_pipeline_test.cpp ) -set(LBANN_CATCH2_TEST_FILES - "${LBANN_CATCH2_TEST_FILES}" "${_DIR_LBANN_CATCH2_TEST_FILES}" PARENT_SCOPE) +set(LBANN_SEQ_CATCH2_TEST_FILES + "${LBANN_SEQ_CATCH2_TEST_FILES}" + "${THIS_DIR_SEQ_CATCH2_TEST_FILES}" + PARENT_SCOPE) diff --git a/src/transforms/vision/unit_test/CMakeLists.txt b/src/transforms/vision/unit_test/CMakeLists.txt index b2e4c84cd4a..b28fcacf507 100644 --- a/src/transforms/vision/unit_test/CMakeLists.txt +++ b/src/transforms/vision/unit_test/CMakeLists.txt @@ -1,4 +1,4 @@ -set_full_path(_DIR_LBANN_CATCH2_TEST_FILES +set_full_path(THIS_DIR_SEQ_CATCH2_TEST_FILES center_crop_test.cpp colorize_test.cpp grayscale_test.cpp @@ -14,5 +14,7 @@ set_full_path(_DIR_LBANN_CATCH2_TEST_FILES vertical_flip_test.cpp ) -set(LBANN_CATCH2_TEST_FILES - "${LBANN_CATCH2_TEST_FILES}" "${_DIR_LBANN_CATCH2_TEST_FILES}" PARENT_SCOPE) +set(LBANN_SEQ_CATCH2_TEST_FILES + "${LBANN_SEQ_CATCH2_TEST_FILES}" + "${THIS_DIR_SEQ_CATCH2_TEST_FILES}" + PARENT_SCOPE) diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt index 38e7a9f9b61..c4d863c3ede 100644 --- a/src/utils/CMakeLists.txt +++ b/src/utils/CMakeLists.txt @@ -22,6 +22,7 @@ set_full_path(THIS_DIR_SOURCES stack_trace.cpp statistics.cpp summary.cpp + system_info.cpp lbann_library.cpp jag_common.cpp ) diff --git a/src/utils/system_info.cpp b/src/utils/system_info.cpp new file mode 100644 index 00000000000..79fc0f4860c --- /dev/null +++ b/src/utils/system_info.cpp @@ -0,0 +1,166 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. 
You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/utils/system_info.hpp" + +#include "lbann/utils/environment_variable.hpp" + +#include +#include + +#include +#include +#include + +namespace lbann { +namespace utils { +namespace { + +int try_mpi_comm_rank() noexcept +{ + int rank = -1; + int mpi_has_been_initialized = -1, mpi_has_been_finalized = -1; + MPI_Initialized(&mpi_has_been_initialized); + MPI_Finalized(&mpi_has_been_finalized); + + if (mpi_has_been_initialized && !mpi_has_been_finalized) + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + return rank; +} + +// I know about SLURM, Open-MPI, and MVAPICH2 +int try_env_variable_rank() +{ + ENV slurm_rank("SLURM_PROCID"); + if (slurm_rank.exists()) + return slurm_rank.template value(); + ENV openmpi_rank("OMPI_COMM_WORLD_RANK"); + if (openmpi_rank.exists()) + return openmpi_rank.template value(); + ENV mvapich2_rank("MV2_COMM_WORLD_RANK"); + if (mvapich2_rank.exists()) + return mvapich2_rank.template value(); + + return -1; +} + +int try_mpi_comm_size() noexcept +{ + int size = -1; + int mpi_has_been_initialized = -1, mpi_has_been_finalized = -1; + MPI_Initialized(&mpi_has_been_initialized); + MPI_Finalized(&mpi_has_been_finalized); + + if (mpi_has_been_initialized && !mpi_has_been_finalized) + MPI_Comm_size(MPI_COMM_WORLD, &size); + + return size; +} + +// I know about SLURM, Open-MPI, and MVAPICH2 +int try_env_variable_size() +{ + ENV slurm_size("SLURM_NTASKS"); + if (slurm_size.exists()) + return slurm_size.template value(); + ENV openmpi_size("OMPI_COMM_WORLD_SIZE"); + if (openmpi_size.exists()) + return openmpi_size.template value(); + ENV mvapich2_size("MV2_COMM_WORLD_SIZE"); + if (mvapich2_size.exists()) + return mvapich2_size.template value(); + + return -1; +} +}// namespace + +std::string SystemInfo::pid() const +{ + return std::to_string(getpid()); +} + +std::string SystemInfo::host_name() const +{ + char hostname[4096]; + int status = gethostname(hostname, 4096); + if (status != 0) + throw std::runtime_error("gethostname failed"); + return hostname; +} + +int SystemInfo::mpi_rank() const +{ + static int rank = -1; + + // Short-circuit if rank has already been found. + if (rank != -1) + return rank; + + // First try MPI directly + rank = try_mpi_comm_rank(); + + // Now try some environment variables + if (rank == -1) + rank = try_env_variable_rank(); + + // At this point, I assume I'm not in an MPI job. + if (rank == -1) + rank = 0; + + return rank; +} + +int SystemInfo::mpi_size() const +{ + static int size = -1; + + // Short-circuit if size has already been found. + if (size != -1) + return size; + + // First try MPI directly + size = try_mpi_comm_size(); + + // Now try some environment variables + if (size == -1) + size = try_env_variable_size(); + + // At this point, I assume I'm not in an MPI job. 
+  if (size == -1)
+    size = 1;
+
+  return size;
+}
+
+std::string
+SystemInfo::env_variable_value(std::string const& var_name) const
+{
+  return ENV(var_name).raw_value();
+}
+
+}// namespace utils
+}// namespace lbann
diff --git a/src/utils/unit_test/CMakeLists.txt b/src/utils/unit_test/CMakeLists.txt
index 813b1cbdfd8..67968c6642d 100644
--- a/src/utils/unit_test/CMakeLists.txt
+++ b/src/utils/unit_test/CMakeLists.txt
@@ -1,4 +1,4 @@
-set_full_path(_DIR_LBANN_CATCH2_TEST_FILES
+set_full_path(THIS_DIR_SEQ_CATCH2_TEST_FILES
   any_test.cpp
   argument_parser_test.cpp
   beta_distribution_test.cpp
@@ -15,5 +15,6 @@ set_full_path(_DIR_LBANN_CATCH2_TEST_FILES
   stubs/preset_env_accessor.cpp
   )
 
-set(LBANN_CATCH2_TEST_FILES
-  "${LBANN_CATCH2_TEST_FILES}" "${_DIR_LBANN_CATCH2_TEST_FILES}" PARENT_SCOPE)
+set(LBANN_SEQ_CATCH2_TEST_FILES
+  "${LBANN_SEQ_CATCH2_TEST_FILES}"
+  "${THIS_DIR_SEQ_CATCH2_TEST_FILES}" PARENT_SCOPE)
diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt
index 1368eafc110..f13c07376d6 100644
--- a/unit_test/CMakeLists.txt
+++ b/unit_test/CMakeLists.txt
@@ -1,8 +1,22 @@
+# Add the unit testing utilities library
+add_subdirectory(utilities)
+
 # Add the sequential test main() function
 add_executable(seq-catch-tests
-  SequentialCatchMain.cpp "${LBANN_CATCH2_TEST_FILES}")
-target_link_libraries(seq-catch-tests PRIVATE lbann Catch2::Catch2)
+  SequentialCatchMain.cpp "${LBANN_SEQ_CATCH2_TEST_FILES}")
+target_link_libraries(seq-catch-tests
+  PRIVATE unit_test_utilities lbann Catch2::Catch2)
 catch_discover_tests(seq-catch-tests)
 
-# Add the parallel test main() function -- TODO
+# There's an example MPI test
+add_subdirectory(example)
+
+# Add the parallel test main() function
+add_executable(mpi-catch-tests
+  MPICatchMain.cpp "${LBANN_MPI_CATCH2_TEST_FILES}")
+target_link_libraries(mpi-catch-tests
+  PRIVATE unit_test_utilities lbann Catch2::Catch2)
+
+# TODO: Some "magical" way to automatically run tests if a parallel
+# environment is detected at CTest time
diff --git a/unit_test/MPICatchMain.cpp b/unit_test/MPICatchMain.cpp
new file mode 100644
index 00000000000..66996776a63
--- /dev/null
+++ b/unit_test/MPICatchMain.cpp
@@ -0,0 +1,72 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file.
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+//////////////////////////////////////////////////////////////////////////////// + +#define CATCH_CONFIG_RUNNER +#include + +// Utilities +#include "MPITestHelpers.hpp" +#include "ReplaceEscapes.hpp" + +#include +#include + +// Just stand up MPI before running all tests; teardown after. +using namespace unit_test::utilities; +int main(int argc, char* argv[]) +{ + // Set up the communication domain + auto world_comm = lbann::initialize(argc, argv, /*seed=*/13); + expert::register_world_comm(*world_comm); + + // Initialize Catch2 + Catch::Session session; + + // Parse the command line + int return_code = session.applyCommandLine(argc, argv); + if (return_code != 0) // Indicates a command line error + return return_code; + + // Manipulate output file if needed. + auto& config_data = session.configData(); + auto& output_file = config_data.outputFilename; + if (output_file.size() > 0) + { + lbann::utils::SystemInfo sys_info; + output_file = replace_escapes(output_file, sys_info); + } + + // Run the catch tests, outputting to the given file. + int num_failed = session.run(); + + // Clean up the catch environment + expert::reset_world_comm(); + + // Shut down the communication domain + world_comm.reset(); // Force MPI_Finalize, et al, before return. + + return num_failed; +} diff --git a/unit_test/example/CMakeLists.txt b/unit_test/example/CMakeLists.txt new file mode 100644 index 00000000000..a651e948fd4 --- /dev/null +++ b/unit_test/example/CMakeLists.txt @@ -0,0 +1,7 @@ +set_full_path(THIS_DIR_MPI_CATCH2_TEST_FILES + mpi_bcast_example_test.cpp + ) + +set(LBANN_MPI_CATCH2_TEST_FILES + "${LBANN_MPI_CATCH2_TEST_FILES}" + "${THIS_DIR_MPI_CATCH2_TEST_FILES}" PARENT_SCOPE) diff --git a/unit_test/example/mpi_bcast_example_test.cpp b/unit_test/example/mpi_bcast_example_test.cpp new file mode 100644 index 00000000000..79bebb05ed6 --- /dev/null +++ b/unit_test/example/mpi_bcast_example_test.cpp @@ -0,0 +1,56 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include + +#include "MPITestHelpers.hpp" + +TEST_CASE("Example: test of Broadcast", "[mpi][example]") +{ + auto& world_comm = unit_test::utilities::current_world_comm(); + int rank_in_world = world_comm.get_rank_in_world(); + + SECTION("Scalar broadcast") + { + int value = (rank_in_world == 0 ? 
13 : -1); + world_comm.world_broadcast(0, value); + + REQUIRE(value == 13); + } + + SECTION("Vector broadcast") + { + std::vector true_values = {1.f, 2.f, 3.f, 4.f}; + std::vector values = + (rank_in_world == 0 + ? true_values + : std::vector(4, -1.f)); + + world_comm.world_broadcast(0, values.data(), values.size()); + + REQUIRE(values == true_values); + } +} diff --git a/unit_test/utilities/CMakeLists.txt b/unit_test/utilities/CMakeLists.txt new file mode 100644 index 00000000000..9944ad1c926 --- /dev/null +++ b/unit_test/utilities/CMakeLists.txt @@ -0,0 +1,21 @@ +# Add the library +add_library(unit_test_utilities + # Headers + MPITestHelpers.hpp + ReplaceEscapes.hpp + + # C++ + MPITestHelpers.cpp + ReplaceEscapes.cpp + ) # add_library unit_test_utilities + +target_include_directories(unit_test_utilities + PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +target_link_libraries(unit_test_utilities PUBLIC lbann) + +# Add the unit tests for the library +add_subdirectory(unit_test) +set(LBANN_SEQ_CATCH2_TEST_FILES + "${LBANN_SEQ_CATCH2_TEST_FILES}" + PARENT_SCOPE) diff --git a/unit_test/utilities/MPITestHelpers.cpp b/unit_test/utilities/MPITestHelpers.cpp new file mode 100644 index 00000000000..b82142ef151 --- /dev/null +++ b/unit_test/utilities/MPITestHelpers.cpp @@ -0,0 +1,53 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "MPITestHelpers.hpp" + +namespace unit_test { +namespace utilities { +namespace { +lbann::lbann_comm* global_comm_; +} + +lbann::lbann_comm& current_world_comm() +{ + LBANN_ASSERT_POINTER(global_comm_); + return *global_comm_; +} + +namespace expert { +void register_world_comm(lbann::lbann_comm& comm) noexcept +{ + global_comm_ = &comm; +} + +void reset_world_comm() noexcept +{ + global_comm_ = nullptr; +} +} // namespace expert +} // namespace utilities +} // namespace unit_test diff --git a/unit_test/utilities/MPITestHelpers.hpp b/unit_test/utilities/MPITestHelpers.hpp new file mode 100644 index 00000000000..4be99f8882f --- /dev/null +++ b/unit_test/utilities/MPITestHelpers.hpp @@ -0,0 +1,80 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. 
+// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_UNIT_TEST_MPI_TEST_HELPERS_HPP_ +#define LBANN_UNIT_TEST_MPI_TEST_HELPERS_HPP_ + +#include +#include + +#include +#include + +#define LBANN_ASSERT_POINTER(ptr) \ + do { \ + if (!ptr) \ + throw ::unit_test::utilities::BadPointer( \ + __FILE__, __LINE__, #ptr); \ + } while (false) + +namespace unit_test { +namespace utilities { + +struct BadPointer : std::runtime_error +{ + BadPointer(std::string const& file, unsigned int line, + std::string const& var_name) + : std::runtime_error{ + lbann::build_string( + file, ":", line, ": \"", var_name, "\" is null.")} + {} +}; + +/** @brief Get the world communicator for this MPI session. */ +lbann::lbann_comm& current_world_comm(); + +// Sizes are not signed. +template +size_t as_size(T const& size) noexcept { return static_cast(size); } + +/** @brief Expert-only methods */ +namespace expert { + +/** @brief Set the world communicator for this session. + * + * @warning This may only be called in main(). + */ +void register_world_comm(lbann::lbann_comm& comm) noexcept; + +/** @brief Clear the world communicator for this session. + * + * @warning This may only be called in main(). + */ +void reset_world_comm() noexcept; +} // namespace expert +} // namespace utilities +} // namespace unit_test +#endif // LBANN_UNIT_TEST_MPI_TEST_HELPERS_HPP_ diff --git a/unit_test/utilities/ReplaceEscapes.cpp b/unit_test/utilities/ReplaceEscapes.cpp new file mode 100644 index 00000000000..21dafe01e3e --- /dev/null +++ b/unit_test/utilities/ReplaceEscapes.cpp @@ -0,0 +1,115 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#include "ReplaceEscapes.hpp" + +#include +#include + +#include +#include +#include + +namespace unit_test +{ +namespace utilities +{ + +namespace +{ + +std::string GetBasicReplacement( + std::string const& str, lbann::utils::SystemInfo const& system_info) +{ + if (str.size() != 2 || str[0] != '%') + throw std::logic_error("string is not a valid pattern."); + + switch (str[1]) + { + case 'h': + return system_info.host_name(); + case 'p': + return system_info.pid(); + case 'r': + return std::to_string(system_info.mpi_rank()); + case 's': + return std::to_string(system_info.mpi_size()); + default: + throw BadSubstitutionPattern(str); + } + return ""; // in case a compiler complains about no return. +} + +}// namespace + +BadSubstitutionPattern::BadSubstitutionPattern(std::string const& str) + : std::runtime_error("Bad escape sequence: " + str) +{} + +std::string replace_escapes( + std::string const& str, lbann::utils::SystemInfo const& system_info) +{ + std::regex re("%env\\{([a-zA-Z0-9_]+)\\}|%[a-zA-Z]", std::regex::extended); + std::smatch match; + std::string outstr; + outstr.reserve(str.size()); + size_t start=0; + + do + { + // Get the string up to the first %% + auto const end = str.find("%%", start); + auto tmp = str.substr(start, end-start); + + // Do all replacements + while (regex_search(tmp, match, re)) + { + if (match.size() != 2UL) + throw std::logic_error("Unexpected match size"); + + if (match[1].length() == 0) + tmp.replace(match.position(), match.length(), + GetBasicReplacement(match[0], system_info)); + else + tmp.replace(match.position(), match.length(), + system_info.env_variable_value(match[1])); + } + outstr += tmp + "%"; + + // Update the starting position in the original string. + start = (end == std::string::npos) ? std::string::npos : end+2; + + } + while (start != std::string::npos); + + // Added an extra "%"; remove it. + outstr.pop_back(); + + return outstr; +} + +}// namespace utilities +}// namespace unit_test diff --git a/unit_test/utilities/ReplaceEscapes.hpp b/unit_test/utilities/ReplaceEscapes.hpp new file mode 100644 index 00000000000..40c4682e6cb --- /dev/null +++ b/unit_test/utilities/ReplaceEscapes.hpp @@ -0,0 +1,102 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_UNIT_TEST_UTILITIES_REPLACE_ESCAPES_HPP_INCLUDED +#define LBANN_UNIT_TEST_UTILITIES_REPLACE_ESCAPES_HPP_INCLUDED + +#include + +#include +#include + +namespace unit_test +{ +namespace utilities +{ + +// +// NOTE TO C++ READERS: The following documentation will appear WRONG +// to you, but it is not! DO NOT CHANGE THE PATTERN/REPLACEMENT TABLE! +// There are many extra percent signs, but these are necessary for the +// markdown to render the HTML correctly! For your benefit, the valid +// sequences are: +// +// %% -- A literal percent sign +// %h -- The hostname of the current process +// %p -- The PID of the current process +// %r -- the MPI rank of the current process, if available, or 0 +// %s -- the MPI size of the current job, if available, or 1 +// %env{NAME} -- The value of ${NAME} in the current environment +// + +/** @brief Substitute basic escape sequences in a string. + * + * The following patterns are supported: + * + * Pattern | Replacement + * --------------- | ----------- + * %% | A literal percent sign ("%") + * %%h | The hostname of the current process + * %%p | The PID of the current process + * %%r | The MPI rank of the current process, if available, or 0 + * %%s | The MPI size of the current job, if available, or 1 + * %%env{\} | The value of ${NAME} in the current environment + * + * The MPI runtime is queried if available for MPI information. After + * that, environment variables are checked for common libraries + * (SLURM, Open-MPI, MVAPICH2). If neither of these methods gives the + * required information, default information consistent with a + * sequential job is returned: the rank will be 0 and the size will + * be 1. + * + * If the "%env{}" substitution fails to find `NAME` in the + * current environment, the replacement will be the empty string. + * + * The double-percent sequence is extracted first, so "%%r" will + * return "%r" and "%%%r" will return "%". + * + * @param str The string to which substitutions should be applied. + * @param sys_info The source of system information. This is + * primarily exposed for stubbing the functionality + * to test this function. + * + * @throws BadSubstitutionPattern An escape sequence is found in + * the string that has no valid substitution. + * + * @returns A copy of the input string with all substitutions applied. + */ +std::string replace_escapes( + std::string const& str, lbann::utils::SystemInfo const& sys_info); + +/** @brief Indicates that an invalid pattern is detected. 
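+ *
+ * Thrown by replace_escapes() when the input contains a "%" escape
+ * that is not in the table above.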
*/ +struct BadSubstitutionPattern : std::runtime_error +{ + BadSubstitutionPattern(std::string const& str); +};// struct BadSubstitutionPattern + +}// namespace utilities +}// namespace unit_test +#endif // LBANN_UNIT_TEST_UTILITIES_REPLACE_ESCAPES_HPP_INCLUDED diff --git a/unit_test/utilities/unit_test/CMakeLists.txt b/unit_test/utilities/unit_test/CMakeLists.txt new file mode 100644 index 00000000000..a8fa529bc0e --- /dev/null +++ b/unit_test/utilities/unit_test/CMakeLists.txt @@ -0,0 +1,8 @@ +set_full_path(THIS_DIR_SEQ_CATCH2_TEST_FILES + test_replace_escapes.cpp + ) + +set(LBANN_SEQ_CATCH2_TEST_FILES + "${LBANN_SEQ_CATCH2_TEST_FILES}" + "${THIS_DIR_SEQ_CATCH2_TEST_FILES}" + PARENT_SCOPE) diff --git a/unit_test/utilities/unit_test/test_replace_escapes.cpp b/unit_test/utilities/unit_test/test_replace_escapes.cpp new file mode 100644 index 00000000000..10d5c15bb7a --- /dev/null +++ b/unit_test/utilities/unit_test/test_replace_escapes.cpp @@ -0,0 +1,188 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include + +#include "ReplaceEscapes.hpp" + +#include + +#include + +// Stub the system info +class TestSystemInfo : public lbann::utils::SystemInfo +{ +public: + std::string pid() const override { return "321"; } + std::string host_name() const override { return "test.host.name"; } + int mpi_rank() const override { return 123; } + int mpi_size() const override { return 432; } + std::string env_variable_value(std::string const& var_name) const override + { + if (var_name == "PUMPKIN") + return "pie"; + if (var_name == "CRANBERRY") + return "sauce"; + return ""; + } +}; // class TestSystemInfo + +// Bring the function under test into scope. 
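+// TestSystemInfo above stubs lbann::utils::SystemInfo with fixed
+// values (pid 321, host test.host.name, rank 123, size 432) so that
+// expected strings can be computed without a real MPI environment.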
+using unit_test::utilities::replace_escapes;
+using unit_test::utilities::BadSubstitutionPattern;
+
+TEST_CASE("Substitution of patterns in strings", "[seq][utils][testing]")
+{
+  TestSystemInfo sys_info;
+
+  SECTION("No patterns leaves the string unchanged.")
+  {
+    std::string test_string = "I am a string";
+    CHECK(replace_escapes(test_string, sys_info) == test_string);
+  }
+
+  SECTION("Substitute %p for process ID")
+  {
+    auto pid = sys_info.pid();
+    CHECK(replace_escapes("%p", sys_info) == pid);
+    CHECK(replace_escapes("%p_apple", sys_info) == pid+"_apple");
+    CHECK(replace_escapes("%p%p", sys_info) == pid + pid);
+    CHECK(replace_escapes("%pap%pple_%p", sys_info)
+          == pid+"ap"+pid+"ple_"+pid);
+  }
+
+  SECTION("Substitute %h for hostname")
+  {
+    auto host = sys_info.host_name();
+    CHECK(replace_escapes("%h", sys_info) == host);
+    CHECK(replace_escapes("Tahitian %h farm", sys_info) == "Tahitian "+host+" farm");
+    CHECK(replace_escapes("%h%h", sys_info) == host + host);
+    CHECK(replace_escapes("G%hs%hsss", sys_info) == "G"+host+"s"+host+"sss");
+  }
+
+  SECTION("Substitute %r for MPI rank")
+  {
+    auto rank = std::to_string(sys_info.mpi_rank());
+    CHECK(replace_escapes("%r", sys_info) == rank);
+    CHECK(replace_escapes("I have %r cats", sys_info)
+          == "I have "+rank+" cats");
+    CHECK(replace_escapes("%r%r", sys_info) == rank + rank);
+    CHECK(replace_escapes("G%rs%rhss", sys_info)
+          == "G"+rank+"s"+rank+"hss");
+  }
+
+  SECTION("Substitute %s for MPI size")
+  {
+    auto size = std::to_string(sys_info.mpi_size());
+    CHECK(replace_escapes("%s", sys_info) == size);
+    CHECK(replace_escapes("I have %s puppies", sys_info)
+          == "I have "+size+" puppies");
+    CHECK(replace_escapes("%s%s", sys_info) == size + size);
+    CHECK(replace_escapes("G%ss%shss", sys_info)
+          == "G"+size+"s"+size+"hss");
+  }
+
+  SECTION("Substitute %% for a literal %")
+  {
+    CHECK(replace_escapes("%%", sys_info) == "%");
+    CHECK(replace_escapes("110%% is a lie", sys_info)
+          == "110% is a lie");
+    CHECK(replace_escapes("%%%%", sys_info) == "%%");
+    CHECK(replace_escapes("100%%", sys_info) == "100%");
+    CHECK(replace_escapes("%%hope", sys_info) == "%hope");
+    CHECK(replace_escapes("%%query", sys_info) == "%query");
+  }
+
+  SECTION("Substitute %env{<NAME>} for $<NAME> in the current environment")
+  {
+    auto pumpkin = sys_info.env_variable_value("PUMPKIN");
+    auto cranberry = sys_info.env_variable_value("CRANBERRY");
+    auto pid = sys_info.pid();
+    auto host = sys_info.host_name();
+    CHECK(replace_escapes("%env{PUMPKIN}", sys_info) == pumpkin);
+    CHECK(replace_escapes("%env{PUMPKIN}%env{PUMPKIN}", sys_info)
+          == pumpkin+pumpkin);
+    CHECK(replace_escapes("%env{PUMPKIN}%env{CRANBERRY}", sys_info)
+          == pumpkin+cranberry);
+    CHECK(replace_escapes("%%%env{THIS_IS_UNDEFINED}", sys_info) == "%");
+    CHECK(replace_escapes("eat_%env{PUMPKIN}_%h_%p.txt", sys_info)
+          == "eat_"+pumpkin+"_"+host+"_"+pid + ".txt");
+    CHECK(replace_escapes("%env{THIS_IS_UNDEFINED}", sys_info) == "");
+  }
+
+  SECTION("Bad patterns are rejected")
+  {
+    CHECK_THROWS_AS(replace_escapes("%env", sys_info), BadSubstitutionPattern);
+    CHECK_THROWS_AS(replace_escapes("%a", sys_info), BadSubstitutionPattern);
+    CHECK_THROWS_AS(replace_escapes("%b", sys_info), BadSubstitutionPattern);
+    CHECK_THROWS_AS(replace_escapes("%c", sys_info), BadSubstitutionPattern);
+    CHECK_THROWS_AS(replace_escapes("%d", sys_info), BadSubstitutionPattern);
+    CHECK_THROWS_AS(replace_escapes("%e", sys_info), BadSubstitutionPattern);
+    CHECK_THROWS_AS(replace_escapes("%f", sys_info), BadSubstitutionPattern);
+
CHECK_THROWS_AS(replace_escapes("%g", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%i", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%j", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%k", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%l", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%m", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%n", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%o", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%q", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%t", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%u", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%v", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%w", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%x", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%y", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%z", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%A", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%B", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%C", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%D", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%E", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%F", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%G", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%H", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%I", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%J", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%K", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%L", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%M", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%N", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%O", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%P", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%Q", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%R", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%S", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%T", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%U", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%V", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%W", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%X", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%Y", sys_info), BadSubstitutionPattern); + CHECK_THROWS_AS(replace_escapes("%Z", sys_info), BadSubstitutionPattern); + } +} From f283c7b30384e98a92b16828b7578785ad118543 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 13 Nov 2019 13:50:17 -0800 Subject: [PATCH 397/634] Move gradient scaling from learning layers to evaluation layer (#1341) * Move gradient scaling from learning layers to eval layer * Remove unneeded include --- .../layers/learning/base_convolution.hpp | 9 
------ include/lbann/layers/transform/weights.hpp | 14 +++------ .../learning/channelwise_scale_bias.cpp | 7 +---- src/layers/learning/channelwise_scale_bias.cu | 7 +---- src/layers/learning/embedding.cpp | 8 +---- src/layers/learning/entrywise_scale_bias.cpp | 16 +++------- src/layers/learning/entrywise_scale_bias.cu | 16 +++------- src/layers/learning/fully_connected.cpp | 29 ++----------------- .../regularizers/batch_normalization.cpp | 12 ++------ .../regularizers/batch_normalization.cu | 20 ++++--------- .../entrywise_batch_normalization.cpp | 17 +++++------ .../entrywise_batch_normalization.cu | 17 +++++------ src/layers/transform/evaluation.cpp | 6 +++- 13 files changed, 45 insertions(+), 133 deletions(-) diff --git a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp index 824b469f18f..c27c80aa731 100644 --- a/include/lbann/layers/learning/base_convolution.hpp +++ b/include/lbann/layers/learning/base_convolution.hpp @@ -27,7 +27,6 @@ #ifndef LBANN_LAYERS_LEARNING_BASE_CONVOLUTION_HPP_INCLUDED #define LBANN_LAYERS_LEARNING_BASE_CONVOLUTION_HPP_INCLUDED -#include "lbann/execution_contexts/sgd_execution_context.hpp" #include "lbann/layers/layer.hpp" #include "lbann/models/model.hpp" #include "lbann/weights/initializer.hpp" @@ -646,8 +645,6 @@ class base_convolution_layer : public Layer { const auto& local_input = get_local_prev_activations(); const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - const auto& c = static_cast(this->m_model->get_execution_context()); - const auto effective_mini_batch_size = c.get_effective_mini_batch_size(); const bool has_local_data = (local_input.Height() > 0 && local_input.Width() > 0 && local_gradient_wrt_output.Height() > 0 @@ -660,7 +657,6 @@ class base_convolution_layer : public Layer { DataType dst_scale = DataType(0), gradient_scale = DataType(0); auto& bias_gradient = bias_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); - gradient_scale /= effective_mini_batch_size; if (has_local_data) { CHECK_CUDNN(cudnnConvolutionBackwardBias( cudnn::get_handle(), @@ -681,7 +677,6 @@ class base_convolution_layer : public Layer { DataType dst_scale = DataType(0), gradient_scale = DataType(0); auto& kernel_gradient = kernel_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); - gradient_scale /= effective_mini_batch_size; if (has_local_data) { // Initialize GPU workspace GPUMat workspace; @@ -920,8 +915,6 @@ class base_convolution_layer : public Layer { const int num_input_channels = input_dims[0]; const int num_output_channels = output_dims[0]; const int num_per_output_channel = get_output_size() / num_output_channels; - const auto& c = static_cast(this->m_model->get_execution_context()); - const auto effective_mini_batch_size = c.get_effective_mini_batch_size(); const auto& kernel_dims = get_kernel_dims(); const auto& kernel_size = std::accumulate(kernel_dims.begin(), kernel_dims.end(), @@ -935,7 +928,6 @@ class base_convolution_layer : public Layer { DataType dst_scale = DataType(0), gradient_scale = DataType(0); auto& bias_gradient = bias_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); - gradient_scale /= effective_mini_batch_size; if (has_local_data) { auto& local_bias_gradient = bias_gradient.Matrix(); LBANN_OMP_PARALLEL_FOR @@ -979,7 +971,6 @@ class base_convolution_layer : public Layer { auto& kernel_gradient = kernel_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); El::Scale(dst_scale, kernel_gradient); - 
gradient_scale /= effective_mini_batch_size; DMat im2col_matrix(m, k); DMat kernel_gradient_matrix(m, n, kernel_gradient.Buffer(), m); diff --git a/include/lbann/layers/transform/weights.hpp b/include/lbann/layers/transform/weights.hpp index 8c34cf0aaa6..940ccb28c20 100644 --- a/include/lbann/layers/transform/weights.hpp +++ b/include/lbann/layers/transform/weights.hpp @@ -29,7 +29,6 @@ #include "lbann/layers/transform/transform.hpp" #include "lbann/models/model.hpp" -#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -170,8 +169,6 @@ class weights_layer : public transform_layer { } void bp_compute() override { - constexpr DataType zero = 0; - constexpr DataType one = 1; // Get optimizer // Note: Nothing needs to be done if there is no optimizer @@ -181,15 +178,12 @@ class weights_layer : public transform_layer { // Matrices const auto& local_gradient_wrt_output = get_local_prev_error_signals(); m_workspace->Resize(local_gradient_wrt_output.Width(), 1); - El::Fill(*m_workspace, one); + El::Fill(*m_workspace, DataType{1}); - const auto& c = static_cast(this->m_model->get_execution_context()); - // Compute gradient contribution and accumulate - const auto& scale = one / c.get_effective_mini_batch_size(); El::Gemv(El::NORMAL, - scale, local_gradient_wrt_output, *m_workspace, - zero, m_gradient->Matrix()); - opt->add_to_gradient(*m_gradient, one, true); + DataType{1}, local_gradient_wrt_output, *m_workspace, + DataType{0}, m_gradient->Matrix()); + opt->add_to_gradient(*m_gradient, DataType{1}, true); // Clean up m_workspace->Empty(); diff --git a/src/layers/learning/channelwise_scale_bias.cpp b/src/layers/learning/channelwise_scale_bias.cpp index dec23a2a379..959210f5d8e 100644 --- a/src/layers/learning/channelwise_scale_bias.cpp +++ b/src/layers/learning/channelwise_scale_bias.cpp @@ -26,7 +26,6 @@ #define LBANN_CHANNELWISE_SCALE_BIAS_LAYER_INSTANTIATE #include "lbann/layers/learning/channelwise_scale_bias.hpp" -#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -127,11 +126,7 @@ void channelwise_scale_bias_layer // Update optimizer with gradient auto* opt = m_weights[0]->get_optimizer(); if (opt != nullptr) { - const auto& c = static_cast(this->m_model->get_execution_context()); - const auto mini_batch_size = c.get_effective_mini_batch_size(); - opt->add_to_gradient(*m_weights_gradient, - DataType{1} / mini_batch_size, - true); + opt->add_to_gradient(*m_weights_gradient, DataType{1}, true); } } diff --git a/src/layers/learning/channelwise_scale_bias.cu b/src/layers/learning/channelwise_scale_bias.cu index 5d321bd7d73..c7bade904cc 100644 --- a/src/layers/learning/channelwise_scale_bias.cu +++ b/src/layers/learning/channelwise_scale_bias.cu @@ -29,7 +29,6 @@ #ifdef HYDROGEN_HAVE_CUB #include "cub/block/block_reduce.cuh" #endif // HYDROGEN_HAVE_CUB -#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -266,11 +265,7 @@ void channelwise_scale_bias_layer // Update optimizer with gradient auto* opt = m_weights[0]->get_optimizer(); if (opt != nullptr) { - const auto& c = static_cast(this->m_model->get_execution_context()); - const auto mini_batch_size = c.get_effective_mini_batch_size(); - opt->add_to_gradient(*m_weights_gradient, - DataType{1} / mini_batch_size, - true); + opt->add_to_gradient(*m_weights_gradient, DataType{1}, true); } diff --git a/src/layers/learning/embedding.cpp b/src/layers/learning/embedding.cpp index 835fa29cf9d..103fdec0172 100644 --- a/src/layers/learning/embedding.cpp +++ 
b/src/layers/learning/embedding.cpp @@ -26,8 +26,6 @@ #define LBANN_EMBEDDING_LAYER_INSTANTIATE #include "lbann/layers/learning/embedding.hpp" -#include "lbann/models/model.hpp" -#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -76,8 +74,6 @@ void embedding_layer::bp_compute() { auto& local_dict_grad = dynamic_cast(m_dictionary_gradient.Matrix()); const auto& local_output_grad = dynamic_cast(get_local_prev_error_signals()); const auto& local_width = local_input.Width(); - const auto& c = static_cast(this->m_model->get_execution_context()); - const auto& mini_batch_size = c.get_effective_mini_batch_size(); // Update appropriate columns of gradient w.r.t. dictionary // Note: Don't update gradient for padding index @@ -93,9 +89,7 @@ void embedding_layer::bp_compute() { El::Axpy(DataType{1}, output_grad_v, dict_grad_v); } } - opt.add_to_gradient(m_dictionary_gradient, - DataType{1} / mini_batch_size, - true); + opt.add_to_gradient(m_dictionary_gradient, DataType{1}, true); } diff --git a/src/layers/learning/entrywise_scale_bias.cpp b/src/layers/learning/entrywise_scale_bias.cpp index 01b607eea1b..d71228f312c 100644 --- a/src/layers/learning/entrywise_scale_bias.cpp +++ b/src/layers/learning/entrywise_scale_bias.cpp @@ -26,7 +26,6 @@ #define LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE #include "lbann/layers/learning/entrywise_scale_bias.hpp" -#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -64,8 +63,7 @@ void bp_impl(const CPUMat& local_input, const CPUMat& local_gradient_wrt_output, CPUMat& local_gradient_wrt_input, weights& scale_bias, - AbsDistMat& gradient_wrt_scale_bias, - El::Int mini_batch_size) { + AbsDistMat& gradient_wrt_scale_bias) { // Local matrices const auto& local_scale_bias @@ -114,9 +112,7 @@ void bp_impl(const CPUMat& local_input, // Update optimizer with gradient auto* opt = scale_bias.get_optimizer(); if (opt != nullptr) { - opt->add_to_gradient(gradient_wrt_scale_bias, - DataType{1} / mini_batch_size, - true); + opt->add_to_gradient(gradient_wrt_scale_bias, DataType{1}, true); } } @@ -141,24 +137,20 @@ void entrywise_scale_bias_layer template <> void entrywise_scale_bias_layer ::bp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(dynamic_cast(get_local_prev_activations()), dynamic_cast(get_local_prev_error_signals()), dynamic_cast(get_local_error_signals()), *this->m_weights[0], - *m_weights_gradient, - c.get_effective_mini_batch_size()); + *m_weights_gradient); } template <> void entrywise_scale_bias_layer ::bp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(dynamic_cast(get_local_prev_activations()), dynamic_cast(get_local_prev_error_signals()), dynamic_cast(get_local_error_signals()), *this->m_weights[0], - *m_weights_gradient, - c.get_effective_mini_batch_size()); + *m_weights_gradient); } template class entrywise_scale_bias_layer< diff --git a/src/layers/learning/entrywise_scale_bias.cu b/src/layers/learning/entrywise_scale_bias.cu index 286dfa8a993..4830245501d 100644 --- a/src/layers/learning/entrywise_scale_bias.cu +++ b/src/layers/learning/entrywise_scale_bias.cu @@ -26,7 +26,6 @@ #define LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE #include "lbann/layers/learning/entrywise_scale_bias.hpp" -#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -132,8 +131,7 @@ void bp_impl(const GPUMat& local_input, const GPUMat& local_gradient_wrt_output, GPUMat& local_gradient_wrt_input, weights& 
scale_bias, - AbsDistMat& gradient_wrt_scale_bias, - El::Int mini_batch_size) { + AbsDistMat& gradient_wrt_scale_bias) { // Local matrices const auto& local_scale_bias @@ -170,9 +168,7 @@ void bp_impl(const GPUMat& local_input, // Update optimizer with gradient auto* opt = scale_bias.get_optimizer(); if (opt != nullptr) { - opt->add_to_gradient(gradient_wrt_scale_bias, - DataType{1} / mini_batch_size, - true); + opt->add_to_gradient(gradient_wrt_scale_bias, DataType{1}, true); } } @@ -197,24 +193,20 @@ void entrywise_scale_bias_layer template <> void entrywise_scale_bias_layer ::bp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(dynamic_cast(get_local_prev_activations()), dynamic_cast(get_local_prev_error_signals()), dynamic_cast(get_local_error_signals()), *this->m_weights[0], - *m_weights_gradient, - c.get_effective_mini_batch_size()); + *m_weights_gradient); } template <> void entrywise_scale_bias_layer ::bp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); bp_impl(dynamic_cast(get_local_prev_activations()), dynamic_cast(get_local_prev_error_signals()), dynamic_cast(get_local_error_signals()), *this->m_weights[0], - *m_weights_gradient, - c.get_effective_mini_batch_size()); + *m_weights_gradient); } template class entrywise_scale_bias_layer< diff --git a/src/layers/learning/fully_connected.cpp b/src/layers/learning/fully_connected.cpp index 01dad8b8a04..059759078ba 100644 --- a/src/layers/learning/fully_connected.cpp +++ b/src/layers/learning/fully_connected.cpp @@ -26,7 +26,6 @@ #define LBANN_FULLY_CONNECTED_LAYER_INSTANTIATE #include "lbann/layers/learning/fully_connected.hpp" -#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -102,10 +101,6 @@ void fully_connected_layer::fp_com /** CPU implementation of backward prop computation. */ template <> void fully_connected_layer::bp_compute() { - auto& c = static_cast(this->m_model->get_execution_context()); - - // Effective mini-batch size - const auto mini_batch_size = c.get_effective_mini_batch_size(); // Matrices const auto& linearity = m_weights[0]->get_values(); @@ -125,7 +120,7 @@ void fully_connected_layer::bp_com m_bias_gradient->Matrix()); bias_optimizer->add_to_gradient( *m_bias_gradient, - m_bias_scaling_factor / mini_batch_size, + m_bias_scaling_factor, true); } } @@ -138,7 +133,6 @@ void fully_connected_layer::bp_com if (linearity.DistSize() == 1) { auto& linearity_gradient = linearity_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); - gradient_scale /= mini_batch_size; if (m_transpose) { El::Gemm(El::NORMAL, El::TRANSPOSE, gradient_scale, local_input, local_gradient_wrt_output, @@ -151,7 +145,6 @@ void fully_connected_layer::bp_com } else { auto& linearity_gradient = linearity_optimizer->get_gradient_buffer( dst_scale, gradient_scale); - gradient_scale /= mini_batch_size; if (m_transpose) { El::Gemm(El::NORMAL, El::TRANSPOSE, gradient_scale, input, gradient_wrt_output, @@ -211,10 +204,6 @@ void fully_connected_layer::fp_comp /** CPU implementation of backward prop computation. 
*/ template <> void fully_connected_layer::bp_compute() { - auto& c = static_cast(this->m_model->get_execution_context()); - - // Effective mini-batch size - const auto mini_batch_size = c.get_effective_mini_batch_size(); // Matrices const auto& local_linearity = m_weights[0]->get_values().LockedMatrix(); @@ -230,7 +219,7 @@ void fully_connected_layer::bp_comp m_bias_gradient->Matrix()); bias_optimizer->add_to_gradient( *m_bias_gradient, - m_bias_scaling_factor / mini_batch_size, + m_bias_scaling_factor, true); } } @@ -241,7 +230,6 @@ void fully_connected_layer::bp_comp DataType dst_scale = DataType(0), gradient_scale = DataType(0); auto& linearity_gradient = linearity_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); - gradient_scale /= mini_batch_size; if (m_transpose) { El::Gemm(El::NORMAL, El::TRANSPOSE, gradient_scale, local_input, local_gradient_wrt_output, @@ -296,10 +284,6 @@ void fully_connected_layer::fp_comp /** GPU implementation of backward prop computation. */ template <> void fully_connected_layer::bp_compute() { - auto& c = static_cast(this->m_model->get_execution_context()); - - // Effective mini-batch size - const auto mini_batch_size = c.get_effective_mini_batch_size(); // Matrices const auto& local_linearity = m_weights[0]->get_values().LockedMatrix(); @@ -314,7 +298,6 @@ void fully_connected_layer::bp_comp DataType dst_scale = DataType(0), gradient_scale = DataType(0); auto& bias_gradient = bias_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); - gradient_scale /= mini_batch_size; if (local_gradient_wrt_output.Height() < 1 || local_gradient_wrt_output.Width() < 1) { El::Scale(dst_scale, bias_gradient); @@ -338,7 +321,6 @@ void fully_connected_layer::bp_comp DataType dst_scale = DataType(0), gradient_scale = DataType(0); auto& linearity_gradient = linearity_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); - gradient_scale /= mini_batch_size; if (m_transpose) { El::Gemm(El::NORMAL, El::TRANSPOSE, gradient_scale, local_input, local_gradient_wrt_output, @@ -399,10 +381,6 @@ void fully_connected_layer::fp_com template <> void fully_connected_layer::bp_compute() { - auto& c = static_cast(this->m_model->get_execution_context()); - - // Effective mini-batch size - const auto mini_batch_size = c.get_effective_mini_batch_size(); // Matrices const auto& linearity = m_weights[0]->get_values(); @@ -422,7 +400,6 @@ void fully_connected_layer::bp_com DataType dst_scale = DataType(0), gradient_scale = DataType(0); auto& bias_gradient = bias_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); - gradient_scale /= mini_batch_size; if (local_gradient_wrt_output.Height() < 1 || local_gradient_wrt_output.Width() < 1) { El::Scale(dst_scale, bias_gradient); @@ -448,7 +425,6 @@ void fully_connected_layer::bp_com if (linearity.DistSize() == 1) { auto& linearity_gradient = linearity_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); - gradient_scale /= mini_batch_size; if (m_transpose) { El::Gemm(El::NORMAL, El::TRANSPOSE, gradient_scale, local_input, local_gradient_wrt_output, @@ -461,7 +437,6 @@ void fully_connected_layer::bp_com } else { auto& linearity_gradient = linearity_optimizer->get_gradient_buffer( dst_scale, gradient_scale); - gradient_scale /= mini_batch_size; if (m_transpose) { El::Gemm(El::NORMAL, El::TRANSPOSE, gradient_scale, input, gradient_wrt_output, diff --git a/src/layers/regularizers/batch_normalization.cpp b/src/layers/regularizers/batch_normalization.cpp index e5bba2677b0..4f9ed87b37a 100644 --- 
a/src/layers/regularizers/batch_normalization.cpp +++ b/src/layers/regularizers/batch_normalization.cpp @@ -26,7 +26,6 @@ #define LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE #include "lbann/layers/regularizers/batch_normalization.hpp" -#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -158,7 +157,6 @@ void batch_normalization_layer::fp_ template <> void batch_normalization_layer::bp_compute() { - constexpr DataType one = 1; const bool is_training = this->m_model->get_execution_context().get_execution_mode() == execution_mode::training; // Matrices @@ -179,8 +177,6 @@ void batch_normalization_layer::bp_ auto& local_bias_gradient = m_bias_gradient->Matrix(); // Matrix parameters - const auto& c = static_cast(this->m_model->get_execution_context()); - const auto effective_mini_batch_size = c.get_effective_mini_batch_size(); const auto& width = input.Width(); const auto& local_width = local_input.Width(); const auto& output_dims = get_output_dims(); @@ -243,15 +239,11 @@ void batch_normalization_layer::bp_ } optimizer* scale_optimizer = m_weights[0]->get_optimizer(); if (scale_optimizer != nullptr) { - scale_optimizer->add_to_gradient(*m_scale_gradient, - one / effective_mini_batch_size, - true); + scale_optimizer->add_to_gradient(*m_scale_gradient, DataType{1}, true); } optimizer* bias_optimizer = m_weights[1]->get_optimizer(); if (bias_optimizer != nullptr) { - bias_optimizer->add_to_gradient(*m_bias_gradient, - one / effective_mini_batch_size, - true); + bias_optimizer->add_to_gradient(*m_bias_gradient, DataType{1}, true); } // Compute error signal diff --git a/src/layers/regularizers/batch_normalization.cu b/src/layers/regularizers/batch_normalization.cu index 8aed7440912..c89f6b83841 100644 --- a/src/layers/regularizers/batch_normalization.cu +++ b/src/layers/regularizers/batch_normalization.cu @@ -27,7 +27,6 @@ #define LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE #include "lbann/layers/regularizers/batch_normalization.hpp" #include "lbann/utils/cuda.hpp" -#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -98,7 +97,6 @@ __global__ void compute_statistics_kernel( DataType * __restrict__ global_var, DataType * __restrict__ global_running_mean, DataType * __restrict__ global_running_var) { - constexpr DataType one = 1; const El::Int gid = threadIdx.x + blockIdx.x * blockDim.x; const El::Int num_threads = blockDim.x * gridDim.x; for (El::Int i = gid; i < num_sums; i += num_threads) { @@ -114,8 +112,8 @@ __global__ void compute_statistics_kernel( // Compute running statistics auto& running_mean = global_running_mean[gid]; auto& running_var = global_running_var[gid]; - running_mean = decay * running_mean + (one - decay) * mean; - running_var = decay * running_var + (one - decay) * var; + running_mean = decay * running_mean + (DataType{1} - decay) * mean; + running_var = decay * running_var + (DataType{1} - decay) * var; } @@ -297,7 +295,6 @@ __global__ void backprop2_kernel( template <> void batch_normalization_layer::fp_compute() { - constexpr DataType one = 1; const bool is_training = this->m_model->get_execution_context().get_execution_mode() == execution_mode::training; // CUDA objects @@ -366,7 +363,7 @@ void batch_normalization_layer::fp_ // Compute minibatch statistics if (num_per_sum <= 1) { - El::Fill(local_var, one); + El::Fill(local_var, DataType{1}); } else if (num_channels > 0) { const El::Int block_dim = 256; const El::Int grid_dim = (num_channels + block_dim - 1) / block_dim; @@ -407,7 +404,6 @@ void 
batch_normalization_layer::fp_ template <> void batch_normalization_layer::bp_compute() { - constexpr DataType one = 1; const bool is_training = this->m_model->get_execution_context().get_execution_mode() == execution_mode::training; // CUDA objects @@ -432,8 +428,6 @@ void batch_normalization_layer::bp_ auto& local_bias_gradient = m_bias_gradient->Matrix(); // Matrix parameters - const auto& c = static_cast(this->m_model->get_execution_context()); - const auto effective_mini_batch_size = c.get_effective_mini_batch_size(); const auto& width = input.Width(); const auto& local_width = local_input.Width(); const auto& output_dims = get_output_dims(); @@ -482,15 +476,11 @@ void batch_normalization_layer::bp_ } optimizer* scale_optimizer = m_weights[0]->get_optimizer(); if (scale_optimizer != nullptr) { - scale_optimizer->add_to_gradient(*m_scale_gradient, - one / effective_mini_batch_size, - true); + scale_optimizer->add_to_gradient(*m_scale_gradient, DataType{1}, true); } optimizer* bias_optimizer = m_weights[1]->get_optimizer(); if (bias_optimizer != nullptr) { - bias_optimizer->add_to_gradient(*m_bias_gradient, - one / effective_mini_batch_size, - true); + bias_optimizer->add_to_gradient(*m_bias_gradient, DataType{1}, true); } // Compute error signal diff --git a/src/layers/regularizers/entrywise_batch_normalization.cpp b/src/layers/regularizers/entrywise_batch_normalization.cpp index 4e6f1bebef3..8a85b3db37d 100644 --- a/src/layers/regularizers/entrywise_batch_normalization.cpp +++ b/src/layers/regularizers/entrywise_batch_normalization.cpp @@ -26,7 +26,6 @@ #define LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE #include "lbann/layers/regularizers/entrywise_batch_normalization.hpp" -#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -377,11 +376,11 @@ void bp_impl(lbann_comm& comm, // Template instantiation template <> void entrywise_batch_normalization_layer::fp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); + const auto mode = this->m_model->get_execution_context().get_execution_mode(); fp_impl(*get_comm(), m_decay, m_epsilon, - c.get_execution_mode() == execution_mode::training, + mode == execution_mode::training, get_prev_activations(), get_activations(), *m_batch_statistics, @@ -390,11 +389,11 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::fp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); + const auto mode = this->m_model->get_execution_context().get_execution_mode(); fp_impl(*get_comm(), m_decay, m_epsilon, - c.get_execution_mode() == execution_mode::training, + mode == execution_mode::training, get_prev_activations(), get_activations(), *m_batch_statistics, @@ -403,10 +402,10 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::bp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); + const auto mode = this->m_model->get_execution_context().get_execution_mode(); bp_impl(*get_comm(), m_epsilon, - c.get_execution_mode() == execution_mode::training, + mode == execution_mode::training, get_prev_activations(), get_prev_error_signals(), get_error_signals(), @@ -416,10 +415,10 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::bp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); + const auto mode = this->m_model->get_execution_context().get_execution_mode(); bp_impl(*get_comm(), m_epsilon, - c.get_execution_mode() 
== execution_mode::training, + mode == execution_mode::training, get_prev_activations(), get_prev_error_signals(), get_error_signals(), diff --git a/src/layers/regularizers/entrywise_batch_normalization.cu b/src/layers/regularizers/entrywise_batch_normalization.cu index a6dbaf90e1e..9a5aaa0ce0f 100644 --- a/src/layers/regularizers/entrywise_batch_normalization.cu +++ b/src/layers/regularizers/entrywise_batch_normalization.cu @@ -27,7 +27,6 @@ #define LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE #include "lbann/layers/regularizers/entrywise_batch_normalization.hpp" #include "lbann/utils/cuda.hpp" -#include "lbann/execution_contexts/sgd_execution_context.hpp" namespace lbann { @@ -565,11 +564,11 @@ void bp_impl(lbann_comm& comm, // Template instantiation template <> void entrywise_batch_normalization_layer::fp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); + const auto mode = this->m_model->get_execution_context().get_execution_mode(); fp_impl(*get_comm(), m_decay, m_epsilon, - c.get_execution_mode() == execution_mode::training, + mode == execution_mode::training, get_prev_activations(), get_activations(), *m_batch_statistics, @@ -578,11 +577,11 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::fp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); + const auto mode = this->m_model->get_execution_context().get_execution_mode(); fp_impl(*get_comm(), m_decay, m_epsilon, - c.get_execution_mode() == execution_mode::training, + mode == execution_mode::training, get_prev_activations(), get_activations(), *m_batch_statistics, @@ -591,10 +590,10 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::bp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); + const auto mode = this->m_model->get_execution_context().get_execution_mode(); bp_impl(*get_comm(), m_epsilon, - c.get_execution_mode() == execution_mode::training, + mode == execution_mode::training, get_prev_activations(), get_prev_error_signals(), get_error_signals(), @@ -604,10 +603,10 @@ void entrywise_batch_normalization_layer void entrywise_batch_normalization_layer::bp_compute() { - const auto& c = static_cast(this->m_model->get_execution_context()); + const auto mode = this->m_model->get_execution_context().get_execution_mode(); bp_impl(*get_comm(), m_epsilon, - c.get_execution_mode() == execution_mode::training, + mode == execution_mode::training, get_prev_activations(), get_prev_error_signals(), get_error_signals(), diff --git a/src/layers/transform/evaluation.cpp b/src/layers/transform/evaluation.cpp index 3bd2d69d2f5..ee886b6b3de 100644 --- a/src/layers/transform/evaluation.cpp +++ b/src/layers/transform/evaluation.cpp @@ -26,6 +26,8 @@ #define LBANN_EVALUATION_LAYER_INSTANTIATE #include "lbann/layers/transform/evaluation.hpp" +#include "lbann/models/model.hpp" +#include "lbann/execution_contexts/sgd_execution_context.hpp" #include "lbann/utils/exception.hpp" #ifdef LBANN_HAS_GPU #include "lbann/utils/cublas.hpp" @@ -194,7 +196,9 @@ void abstract_evaluation_layer::fp_compute() { } void abstract_evaluation_layer::bp_compute() { - El::Fill(get_error_signals(), DataType(m_scale)); + const auto& context = static_cast(this->m_model->get_execution_context()); + const auto mini_batch_size = context.get_effective_mini_batch_size(); + El::Fill(get_error_signals(), DataType(m_scale / mini_batch_size)); } abstract_evaluation_layer* From 
bc17d09f5d2c7fbd29d9c0a39acdd1c8aa16cf38 Mon Sep 17 00:00:00 2001
From: Sam Ade Jacobs 
Date: Thu, 14 Nov 2019 10:34:18 -0600
Subject: [PATCH 398/634] Draft implementation of RAS state classifier model
 (#1350)

* Draft implementation of RAS state classifier model

* Update train_ras_classifier.py

Clean up, add accuracy metric

* Update train_ras_classifier.py

Remove print statement

* Update util.py

Remove print statements

* Update util.py

Transpose data to NCHW
---
 .../CANDLE/pilot2/train_ras_classifier.py | 134 ++++++++++++++++++
 applications/CANDLE/pilot2/util.py        |  95 +++++++++++++
 2 files changed, 229 insertions(+)
 create mode 100644 applications/CANDLE/pilot2/train_ras_classifier.py
 create mode 100644 applications/CANDLE/pilot2/util.py

diff --git a/applications/CANDLE/pilot2/train_ras_classifier.py b/applications/CANDLE/pilot2/train_ras_classifier.py
new file mode 100644
index 00000000000..d13272937ed
--- /dev/null
+++ b/applications/CANDLE/pilot2/train_ras_classifier.py
@@ -0,0 +1,134 @@
+import numpy as np
+import lbann
+import lbann.modules
+from util import preprocess_data

+# Data paths, directory where patches are located
+data_dir = 'data'
+samples = preprocess_data(data_dir)
+
+dims = len(samples[0])
+
+
+num_classes = 3
+num_channels = 14
+
+# Sample access functions
+def get_sample(index):
+    sample = samples[index]
+    return sample
+
+def num_samples():
+    return samples.shape[0]
+
+def sample_dims():
+    return [dims]
+
+def str_list(l):
+    return ' '.join([str(i) for i in l])
+# ==============================================
+# Setup and launch experiment
+# ==============================================
+
+def construct_model():
+    """Model description
+
+    """
+    import lbann
+    import lbann.modules
+
+
+    fc = lbann.modules.FullyConnectedModule
+    conv = lbann.modules.Convolution2dModule
+
+    conv1 = conv(20, 3, stride=1, padding=1,name='conv1')
+    conv2 = conv(20, 3, stride=1, padding=1,name='conv2')
+    fc1 = fc(100, name='fc1')
+    fc2 = fc(20, name='fc2')
+    fc3 = fc(num_classes, name='fc3')
+    # Layer graph
+    input = lbann.Input(name='inp_tensor')
+    inp_slice = lbann.Slice(input, axis=0, slice_points=str_list([0, dims-1, dims]),name='inp_slice')
+    xdata = lbann.Identity(inp_slice)
+    ylabel = lbann.Identity(inp_slice, name='gt_y')
+    # NHWC to NCHW
+    x = lbann.Reshape(xdata, dims='14 13 13')
+    x = conv2(conv1(x))
+    x = lbann.Reshape(x, dims='3380')
+    x = lbann.Dropout(lbann.Relu(fc1(x)),keep_prob=0.5)
+    x = lbann.Dropout(fc2(x),keep_prob=0.5)
+    pred = lbann.Softmax(fc3(x))
+    gt_label = lbann.OneHot(ylabel, size=num_classes)
+    loss = lbann.CrossEntropy([pred,gt_label],name='loss')
+    acc = lbann.CategoricalAccuracy([pred, gt_label])
+
+
+    layers = list(lbann.traverse_layer_graph(input))
+    # Setup objective function
+    weights = set()
+    for l in layers:
+        weights.update(l.weights)
+    obj = lbann.ObjectiveFunction(loss)
+
+
+    callbacks = [lbann.CallbackPrint(),
+                 lbann.CallbackTimer()]
+
+    # Construct model
+    mini_batch_size = 64
+    num_epochs = 10
+    return lbann.Model(mini_batch_size,
+                       num_epochs,
+                       weights=weights,
+                       layers=layers,
+                       metrics=[lbann.Metric(acc, name='accuracy', unit='%')],
+                       objective_function=obj,
+                       callbacks=callbacks)
+
+def construct_data_reader():
+    """Construct Protobuf message for Python data reader.
+
+    The Python data reader will import this Python file to access the
+    sample access functions.
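+
+    The protocol, mirrored by the reader fields set below: LBANN calls
+    get_sample(index) for one flattened sample, num_samples() for the
+    dataset size, and sample_dims() for the flattened sample shape.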
+ + """ + import os.path + import lbann + module_file = os.path.abspath(__file__) + module_name = os.path.splitext(os.path.basename(module_file))[0] + module_dir = os.path.dirname(module_file) + + # Base data reader message + message = lbann.reader_pb2.DataReader() + + # Training set data reader + data_reader = message.reader.add() + data_reader.name = 'python' + data_reader.role = 'train' + data_reader.shuffle = True + data_reader.percent_of_data_to_use = 1.0 + data_reader.python.module = module_name + data_reader.python.module_dir = module_dir + data_reader.python.sample_function = 'get_sample' + data_reader.python.num_samples_function = 'num_samples' + data_reader.python.sample_dims_function = 'sample_dims' + + return message + +if __name__ == '__main__': + import lbann + import lbann.contrib.lc.launcher + trainer = lbann.Trainer() + model = construct_model() + opt = lbann.Adam(learn_rate=0.001,beta1=0.9,beta2=0.99,eps=1e-8) + data_reader = construct_data_reader() + status = lbann.contrib.lc.launcher.run( + trainer, model, data_reader, opt, + account='hpcdl', + scheduler='slurm', + time_limit=720, + nodes=1, + procs_per_node=1, + setup_only=False, + job_name='candle_p2_ras_classifier') + print(status) diff --git a/applications/CANDLE/pilot2/util.py b/applications/CANDLE/pilot2/util.py new file mode 100644 index 00000000000..9584a3c0941 --- /dev/null +++ b/applications/CANDLE/pilot2/util.py @@ -0,0 +1,95 @@ +import os +import sys +import random +import numpy as np + + +p0_thresh = 0.55 +p1_thresh = 0.85 +p2_thresh = 0.85 + +def preprocess_data(dirspath,channels=None): +# define a tuple of specific channels if user listed them + channels_tuple = tuple(range(14)) + if channels is not None: + channels_tuple = tuple(channels) + + files_train = [] + states = [] + cons = [] + + #for d in dirspath: + for _ in range(1): + # get list of all files in datapath and shuffle them + # sort by filename before shuffle so we could generate + # a consistent list if using the same random seed + filenames = os.listdir(dirspath) + filenames.sort() + random.shuffle(filenames) + + filenames_divide = int(1.0 * len(filenames)) + filenames_train = filenames[:filenames_divide] + + files_train.append([dirspath + "/" + f for f in filenames_train]) + + frame_start = 0 + + for f in filenames_train: + # read in the data file + d = np.load(dirspath + '/' + f) + + # extract fields + p = d['probs'][d['frames'] >= frame_start] + s = d['states'][d['frames'] >= frame_start] + #n = d['density_sig1p5'][d['frames'] >= frame_start] + n = d['density_sig1'][d['frames'] >= frame_start] + #print p.shape, s.shape + + s = s[(p[:,0] > p0_thresh) | (p[:,1] > p1_thresh) | (p[:,2] > p2_thresh)] + n = n[(p[:,0] > p0_thresh) | (p[:,1] > p1_thresh) | (p[:,2] > p2_thresh)] + + states.append(s) + + + # append concentrations, filter out by channel id(s) if given + # can we do channel first here, transpose?, move axis? 
+ n = np.array(n) + n = n.astype(np.float32) + if channels: + cons.append(n[:,:,:,channels_tuple]) + else: + cons.append(n) + + + states = np.concatenate(states,axis=0) + cons = np.concatenate(cons,axis=0) + + # print list of unique state labels and number of each + (values, cnt) = np.unique(states, return_counts=True) + + min_cnt = np.min(cnt) + idx_0 = np.where(states == 0) + idx_0 = idx_0[0][:min_cnt] + idx_1 = np.where(states == 1) + idx_1 = idx_1[0][:min_cnt] + idx_2 = np.where(states == 2) + idx_2 = idx_2[0][:min_cnt] + ids = np.concatenate([idx_0, idx_1, idx_2], axis=0) + states = states[ids] + cons = cons[ids] + + + # normalize each concentration channel independently + mins = cons.min(axis=(0,1,2), keepdims=True) + maxs = cons.max(axis=(0,1,2), keepdims=True) + + cons /= maxs + labels = states + + #transpose to NCHW + cons = cons.transpose(0,3,1,2) + + X = cons.reshape(cons.shape[0],-1) + y = labels.reshape(-1,1) + Xy_data = np.hstack((X,y)) + return Xy_data From b3055b9eba030de2ffc81ee25b2b3017d57e0be1 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Thu, 14 Nov 2019 18:34:21 -0800 Subject: [PATCH 399/634] Implement GPU embedding layer (#1345) --- .../unit_tests/test_unit_layer_embedding.py | 6 +- include/lbann/layers/learning/embedding.hpp | 78 +++++--- src/layers/learning/CMakeLists.txt | 1 + src/layers/learning/embedding.cpp | 28 +-- src/layers/learning/embedding.cu | 178 ++++++++++++++++++ src/proto/factories/layer_factory.cpp | 7 +- 6 files changed, 254 insertions(+), 44 deletions(-) create mode 100644 src/layers/learning/embedding.cu diff --git a/bamboo/unit_tests/test_unit_layer_embedding.py b/bamboo/unit_tests/test_unit_layer_embedding.py index 30c7ac991ae..52ac8409f35 100644 --- a/bamboo/unit_tests/test_unit_layer_embedding.py +++ b/bamboo/unit_tests/test_unit_layer_embedding.py @@ -85,8 +85,7 @@ def construct_model(lbann): y = lbann.Embedding(x, weights=embedding_weights, num_embeddings=_num_embeddings, - embedding_dim=embedding_dim, - device='cpu') + embedding_dim=embedding_dim) z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='no padding index')) @@ -129,8 +128,7 @@ def construct_model(lbann): weights=embedding_weights, num_embeddings=_num_embeddings, embedding_dim=embedding_dim, - padding_idx=padding_idx, - device='cpu') + padding_idx=padding_idx) z = lbann.L2Norm2(y) obj.append(z) metrics.append(lbann.Metric(z, name='padding index = 0')) diff --git a/include/lbann/layers/learning/embedding.hpp b/include/lbann/layers/learning/embedding.hpp index e5650a72e8d..b73a4b979ce 100644 --- a/include/lbann/layers/learning/embedding.hpp +++ b/include/lbann/layers/learning/embedding.hpp @@ -49,8 +49,6 @@ template class embedding_layer : public Layer { static_assert(Layout == data_layout::DATA_PARALLEL, "embedding layer only supports data parallel layout"); - static_assert(Device == El::Device::CPU, - "embedding layer only supports CPU"); public: /** @@ -65,14 +63,10 @@ class embedding_layer : public Layer { embedding_layer(lbann_comm* comm, size_t num_embeddings, size_t embedding_dim, - El::Int padding_idx=-1) - : Layer(comm), - m_num_embeddings{num_embeddings}, - m_embedding_dim{embedding_dim}, - m_padding_idx{padding_idx} {} - - embedding_layer(const embedding_layer& other) = default; - embedding_layer& operator=(const embedding_layer& other) = default; + El::Int padding_idx=-1); + + embedding_layer(const embedding_layer& other); + embedding_layer& operator=(const embedding_layer& other); ~embedding_layer() = default; embedding_layer* copy() const override { 
@@ -107,7 +101,7 @@ class embedding_layer : public Layer { El::Int m_padding_idx; /** Gradient w.r.t. embedding weights. */ - StarMat m_dictionary_gradient; + std::unique_ptr m_gradient_wrt_embeddings; }; @@ -115,6 +109,41 @@ class embedding_layer : public Layer { // Implementation // ========================================================= +template +embedding_layer::embedding_layer( + lbann_comm* comm, + size_t num_embeddings, + size_t embedding_dim, + El::Int padding_idx) + : Layer(comm), + m_num_embeddings{num_embeddings}, + m_embedding_dim{embedding_dim}, + m_padding_idx{padding_idx} {} + +template +embedding_layer::embedding_layer( + const embedding_layer& other) + : Layer(other), + m_num_embeddings{other.m_num_embeddings}, + m_embedding_dim{other.m_embedding_dim}, + m_padding_idx{other.m_padding_idx}, + m_gradient_wrt_embeddings(other.m_gradient_wrt_embeddings + ? other.m_gradient_wrt_embeddings->Copy() + : nullptr) {} + +template +embedding_layer& embedding_layer::operator=( + const embedding_layer& other) { + Layer::operator=(other); + m_num_embeddings = other.m_num_embeddings; + m_embedding_dim = other.m_embedding_dim; + m_padding_idx = other.m_padding_idx; + m_gradient_wrt_embeddings.reset(other.m_gradient_wrt_embeddings + ? other.m_gradient_wrt_embeddings->Copy() + : nullptr); + return *this; +} + template description embedding_layer::get_description() const { auto desc = Layer::get_description(); @@ -170,33 +199,38 @@ void embedding_layer::setup_data() { } // Initialize dictionary - auto& dict = *m_weights[0]; + auto& embeddings = *m_weights[0]; auto matrix_dist = get_prev_activations().DistData(); matrix_dist.colDist = El::STAR; matrix_dist.rowDist = El::STAR; - dict.set_dims({static_cast(m_embedding_dim)}, - {static_cast(m_num_embeddings)}); - dict.set_matrix_distribution(matrix_dist); - dict.setup(); + embeddings.set_dims({static_cast(m_embedding_dim)}, + {static_cast(m_num_embeddings)}); + embeddings.set_matrix_distribution(matrix_dist); + embeddings.setup(); // Zero out embedding vector for padding index if (0 <= m_padding_idx && m_padding_idx < static_cast(m_embedding_dim)) { - auto& dict_values = dict.get_values(); - std::unique_ptr pad_embedding(dict_values.Construct(dict_values.Grid(), - dict_values.Root())); - El::View(*pad_embedding, dict_values, El::ALL, El::IR(m_padding_idx)); + auto& embedding_values = embeddings.get_values(); + std::unique_ptr pad_embedding( + embedding_values.Construct(embedding_values.Grid(), + embedding_values.Root())); + El::View(*pad_embedding, embedding_values, El::ALL, El::IR(m_padding_idx)); El::Zero(*pad_embedding); } - // Initialize gradient w.r.t. dictionary - m_dictionary_gradient.Resize(m_embedding_dim, m_num_embeddings); + // Initialize gradient w.r.t. 
embeddings + m_gradient_wrt_embeddings->Resize(m_embedding_dim, m_num_embeddings); } #ifndef LBANN_EMBEDDING_LAYER_INSTANTIATE extern template class embedding_layer< data_layout::DATA_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class embedding_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU #endif // LBANN_EMBEDDING_LAYER_INSTANTIATE } // namespace lbann diff --git a/src/layers/learning/CMakeLists.txt b/src/layers/learning/CMakeLists.txt index e5b1cb0f4a8..69ba7d2e3fe 100644 --- a/src/layers/learning/CMakeLists.txt +++ b/src/layers/learning/CMakeLists.txt @@ -12,6 +12,7 @@ if (LBANN_HAS_CUDA) # Add the CUDA source files for this directory set_full_path(THIS_DIR_CU_SOURCES channelwise_scale_bias.cu + embedding.cu entrywise_scale_bias.cu ) endif () diff --git a/src/layers/learning/embedding.cpp b/src/layers/learning/embedding.cpp index 103fdec0172..1c84b5c3a10 100644 --- a/src/layers/learning/embedding.cpp +++ b/src/layers/learning/embedding.cpp @@ -32,26 +32,26 @@ namespace lbann { template <> void embedding_layer::setup_matrices(const El::Grid& grid) { Layer::setup_matrices(grid); - m_dictionary_gradient = StarMat(grid); + m_gradient_wrt_embeddings.reset(new StarMat(grid)); } template <> void embedding_layer::fp_compute() { // Local data - const auto& local_dict = dynamic_cast(m_weights[0]->get_values().LockedMatrix()); + const auto& local_embeddings = dynamic_cast(m_weights[0]->get_values().LockedMatrix()); const auto& local_input = dynamic_cast(get_local_prev_activations()); auto& local_output = dynamic_cast(get_local_activations()); const auto& local_width = local_input.Width(); - // Populate output matrix with appropriate columns of dictionary - CPUMat dict_v, output_v; + // Populate output matrix with columns of embedding matrix + CPUMat embedding_v, output_v; for (El::Int col = 0; col < local_width; ++ col) { El::View(output_v, local_output, El::ALL, El::IR(col)); const El::Int ind = static_cast(std::floor(local_input(0, col))); if (0 <= ind && ind < static_cast(m_num_embeddings)) { - El::LockedView(dict_v, local_dict, El::ALL, El::IR(ind)); - El::Copy(dict_v, output_v); + El::LockedView(embedding_v, local_embeddings, El::ALL, El::IR(ind)); + El::Copy(embedding_v, output_v); } else { El::Zero(output_v); } @@ -65,31 +65,31 @@ void embedding_layer::bp_compute() { // Embedding layer is not differentiable w.r.t. inputs El::Zero(get_error_signals()); - // Nothing to be done if dictionary is not being optimized + // Nothing to be done if embeddings are not being optimized if (m_weights[0]->get_optimizer() == nullptr) { return; } auto& opt = *m_weights[0]->get_optimizer(); // Local data const auto& local_input = dynamic_cast(get_local_prev_activations()); - auto& local_dict_grad = dynamic_cast(m_dictionary_gradient.Matrix()); + auto& local_embedding_grad = dynamic_cast(m_gradient_wrt_embeddings->Matrix()); const auto& local_output_grad = dynamic_cast(get_local_prev_error_signals()); const auto& local_width = local_input.Width(); - // Update appropriate columns of gradient w.r.t. dictionary + // Update appropriate columns of gradient w.r.t. 
embeddings // Note: Don't update gradient for padding index - El::Zero(local_dict_grad); - CPUMat dict_grad_v, output_grad_v; + El::Zero(local_embedding_grad); + CPUMat embedding_grad_v, output_grad_v; for (El::Int col = 0; col < local_width; ++ col) { const El::Int ind = static_cast(std::floor(local_input(0, col))); if (0 <= ind && ind < static_cast(m_num_embeddings) && ind != m_padding_idx) { - El::View(dict_grad_v, local_dict_grad, El::ALL, El::IR(ind)); + El::View(embedding_grad_v, local_embedding_grad, El::ALL, El::IR(ind)); El::LockedView(output_grad_v, local_output_grad, El::ALL, El::IR(col)); - El::Axpy(DataType{1}, output_grad_v, dict_grad_v); + El::Axpy(DataType{1}, output_grad_v, embedding_grad_v); } } - opt.add_to_gradient(m_dictionary_gradient, DataType{1}, true); + opt.add_to_gradient(*m_gradient_wrt_embeddings, DataType{1}, true); } diff --git a/src/layers/learning/embedding.cu b/src/layers/learning/embedding.cu new file mode 100644 index 00000000000..4b3e5a3976e --- /dev/null +++ b/src/layers/learning/embedding.cu @@ -0,0 +1,178 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. 
+//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_EMBEDDING_LAYER_INSTANTIATE +#include "lbann/layers/learning/embedding.hpp" + +namespace lbann { + +namespace { + +/** @brief Kernel for forward prop + * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (embedding_dim / bsize) x mini_batch_size x 1 + */ +__global__ void fp_kernel(El::Int num_embeddings, + El::Int embedding_dim, + El::Int mini_batch_size, + const DataType* __restrict__ indices, + El::Int indices_stride, + const DataType* __restrict__ embeddings, + El::Int embeddings_ldim, + DataType* __restrict__ output, + El::Int output_ldim) { + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int gidy = threadIdx.y + blockIdx.y * blockDim.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nthreadsy = blockDim.y * gridDim.y; + for (El::Int j = gidy; j < mini_batch_size; j += nthreadsy) { + const El::Int ind = static_cast(indices[j*indices_stride]); + for (El::Int i = gidx; i < embedding_dim; i += nthreadsx) { + auto& y = output[i+j*output_ldim]; + if (0 <= ind && ind < num_embeddings) { + y = embeddings[i+ind*embeddings_ldim]; + } + else { + y = DataType{0}; + } + } + } +} + +/** @brief Kernel for backprop + * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (embedding_dim / bsize) x mini_batch_size x 1 + */ +__global__ void bp_kernel(El::Int num_embeddings, + El::Int embedding_dim, + El::Int mini_batch_size, + El::Int padding_idx, + const DataType* __restrict__ indices, + El::Int indices_stride, + const DataType* __restrict__ gradient_wrt_output, + El::Int gradient_wrt_output_ldim, + DataType* __restrict__ gradient_wrt_embeddings, + El::Int gradient_wrt_embeddings_ldim) { + const El::Int gidx = threadIdx.x + blockIdx.x * blockDim.x; + const El::Int gidy = threadIdx.y + blockIdx.y * blockDim.y; + const El::Int nthreadsx = blockDim.x * gridDim.x; + const El::Int nthreadsy = blockDim.y * gridDim.y; + for (El::Int j = gidy; j < mini_batch_size; j += nthreadsy) { + const El::Int ind = static_cast(indices[j*indices_stride]); + if (0 <= ind && ind < num_embeddings && ind != padding_idx) { + for (El::Int i = gidx; i < embedding_dim; i += nthreadsx) { + const auto& dy = gradient_wrt_output[i+j*gradient_wrt_output_ldim]; + auto& dw = gradient_wrt_embeddings[i+ind*gradient_wrt_embeddings_ldim]; + cuda::atomic_add(&dw, dy); + } + } + } +} + +} // namespace + +template <> +void embedding_layer::setup_matrices(const El::Grid& grid) { + Layer::setup_matrices(grid); + m_gradient_wrt_embeddings.reset(new StarMat(grid)); +} + +template <> +void embedding_layer::fp_compute() { + + // Local data + const auto& local_embeddings = dynamic_cast(m_weights[0]->get_values().LockedMatrix()); + const auto& local_input = dynamic_cast(get_local_prev_activations()); + auto& local_output = dynamic_cast(get_local_activations()); + + // Launch CUDA kernel + if (!local_input.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_output.Height() + block_size - 1) / block_size; + grid_dims.y = local_output.Width(); + fp_kernel<<>>( + m_num_embeddings, + m_embedding_dim, + local_input.Width(), + local_input.LockedBuffer(), + local_input.LDim(), + local_embeddings.LockedBuffer(), + local_embeddings.LDim(), + local_output.Buffer(), + local_output.LDim()); + } + +} + +template <> +void embedding_layer::bp_compute() { + + // Embedding layer is not differentiable w.r.t. 
inputs + El::Zero(get_error_signals()); + + // Nothing to be done if embeddings are not being optimized + if (m_weights[0]->get_optimizer() == nullptr) { return; } + auto& opt = *m_weights[0]->get_optimizer(); + + // Local data + const auto& local_input = dynamic_cast(get_local_prev_activations()); + auto& local_embedding_grad = dynamic_cast(m_gradient_wrt_embeddings->Matrix()); + const auto& local_output_grad = dynamic_cast(get_local_prev_error_signals()); + + // Launch CUDA kernel + El::Zero(local_embedding_grad); + if (!local_input.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_output_grad.Height() + block_size - 1) / block_size; + grid_dims.y = local_output_grad.Width(); + bp_kernel<<>>( + m_num_embeddings, + m_embedding_dim, + local_input.Width(), + m_padding_idx, + local_input.LockedBuffer(), + local_input.LDim(), + local_output_grad.LockedBuffer(), + local_output_grad.LDim(), + local_embedding_grad.Buffer(), + local_embedding_grad.LDim()); + } + opt.add_to_gradient(*m_gradient_wrt_embeddings, DataType{1}, true); + +} + +// Explicit instantiation +template class embedding_layer; + +} // namespace lbann diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 39eb0d7799d..1c7549b98f9 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -248,13 +248,12 @@ std::unique_ptr construct_layer( const size_t embedding_dim = params.embedding_dim(); const El::Int padding_idx = (params.has_padding_idx() ? params.padding_idx().value() : -1); - if (Layout == data_layout::DATA_PARALLEL - && Device == El::Device::CPU) { - return lbann::make_unique>( + if (Layout == data_layout::DATA_PARALLEL) { + return lbann::make_unique>( comm, num_embeddings, embedding_dim, padding_idx); } else { LBANN_ERROR("embedding layer is only supported with " - "data-parallel data layout and on CPU"); + "data-parallel data layout"); } } if (proto_layer.has_channelwise_scale_bias()) { From 0ba2acd45be3b67c69970f8b144bc517b7c5b284 Mon Sep 17 00:00:00 2001 From: Katie Graham <50850420+graham63@users.noreply.github.com> Date: Fri, 15 Nov 2019 18:28:58 -0600 Subject: [PATCH 400/634] updated queue names for corona (#1351) --- bamboo/allocate_and_run.sh | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/bamboo/allocate_and_run.sh b/bamboo/allocate_and_run.sh index f46baa69687..81c433c1908 100755 --- a/bamboo/allocate_and_run.sh +++ b/bamboo/allocate_and_run.sh @@ -46,7 +46,22 @@ elif [ "${CLUSTER}" = 'ray' ]; then else timeout -k 5 24h bsub -Is -q pbatch -nnodes 2 -W ${ALLOCATION_TIME_LIMIT} ./run.sh fi -elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTER}" = 'pascal' ]; then +elif [ "${CLUSTER}" = 'corona' ]; then + ALLOCATION_TIME_LIMIT=960 + if [ ${WEEKLY} -ne 0 ]; then + timeout -k 5 24h salloc -N4 --partition=mi60 -t ${ALLOCATION_TIME_LIMIT} ./run.sh --weekly + else + ALLOCATION_TIME_LIMIT=90 # Start with 1.5 hrs; may adjust for CPU clusters + if [[ $(mjstat -c | awk 'match($1, "mi60") && NF < 7 { print $5 }') -ne "0" ]]; + then + timeout -k 5 24h salloc -N2 --partition=mi60 -t ${ALLOCATION_TIME_LIMIT} ./run.sh + else + echo "Partition \"mi60\" on cluster \"${CLUSTER}\" appears to be down." + echo "Trying \"mi25\"." 
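+        # Fall back to the mi25 partition when the mi60 check above fails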
+ timeout -k 5 24h salloc -N2 --partition=mi25 -t ${ALLOCATION_TIME_LIMIT} ./run.sh + fi + fi +elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'pascal' ]; then ALLOCATION_TIME_LIMIT=960 if [ ${WEEKLY} -ne 0 ]; then timeout -k 5 24h salloc -N4 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT} ./run.sh --weekly @@ -57,11 +72,6 @@ elif [ "${CLUSTER}" = 'catalyst' ] || [ "${CLUSTER}" = 'corona' ] || [ "${CLUSTE timeout -k 5 24h salloc -N2 --partition=pbatch -t ${ALLOCATION_TIME_LIMIT} ./run.sh else echo "Partition \"pbatch\" on cluster \"${CLUSTER}\" appears to be down." - if [[ "${CLUSTER}" =~ ^corona$ ]]; - then - echo "Trying \"pgpu\"." - timeout -k 5 24h salloc -N2 --partition=pgpu -t ${ALLOCATION_TIME_LIMIT} ./run.sh - fi fi fi else From f180a7147e4a77f9a39d6746aa22055333273e06 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Sat, 16 Nov 2019 09:06:21 +0100 Subject: [PATCH 401/634] Bump Aluminum version --- superbuild/aluminum/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbuild/aluminum/CMakeLists.txt b/superbuild/aluminum/CMakeLists.txt index 7a637eeabf9..75ac7faf73a 100644 --- a/superbuild/aluminum/CMakeLists.txt +++ b/superbuild/aluminum/CMakeLists.txt @@ -11,7 +11,7 @@ else () CACHE STRING "The URL from which to clone Aluminum") endif () -set(ALUMINUM_TAG "v0.3.2" +set(ALUMINUM_TAG "v0.3.3" CACHE STRING "The git tag to checkout for Aluminum") set(ALUMINUM_CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" From 7cb436a1d97d76f55646636353fed5fb2f720b4f Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Mon, 18 Nov 2019 17:11:30 -0800 Subject: [PATCH 402/634] Layer norm (#1352) * Implement layer norm layer on CPU * Implement layer norm layer on GPU * Rename "entry-wise layer normalization" to "layer norm" * Document layer norm layer --- .../unit_tests/test_unit_layer_layer_norm.py | 203 ++++++++ .../lbann/layers/regularizers/CMakeLists.txt | 1 + .../lbann/layers/regularizers/layer_norm.hpp | 213 ++++++++ src/layers/regularizers/CMakeLists.txt | 2 + src/layers/regularizers/layer_norm.cpp | 226 ++++++++ src/layers/regularizers/layer_norm.cu | 481 ++++++++++++++++++ src/proto/factories/layer_factory.cpp | 8 + src/proto/layers.proto | 20 + 8 files changed, 1154 insertions(+) create mode 100644 bamboo/unit_tests/test_unit_layer_layer_norm.py create mode 100644 include/lbann/layers/regularizers/layer_norm.hpp create mode 100644 src/layers/regularizers/layer_norm.cpp create mode 100644 src/layers/regularizers/layer_norm.cu diff --git a/bamboo/unit_tests/test_unit_layer_layer_norm.py b/bamboo/unit_tests/test_unit_layer_layer_norm.py new file mode 100644 index 00000000000..4ab7617c16a --- /dev/null +++ b/bamboo/unit_tests/test_unit_layer_layer_norm.py @@ -0,0 +1,203 @@ +import functools +import operator +import os +import os.path +import sys +import numpy as np + +# Bamboo utilities +current_file = os.path.realpath(__file__) +current_dir = os.path.dirname(current_file) +sys.path.insert(0, os.path.join(os.path.dirname(current_dir), 'common_python')) +import tools + +# ============================================== +# Objects for Python data reader +# ============================================== +# Note: The Python data reader imports this file as a module and calls +# the functions below to ingest data. 
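+# Rough sketch of how a reader consumes these hooks (illustrative only; the
+# real protocol is defined by the Python data reader):
+#
+#   dims = sample_dims()            # (31,) for this test
+#   for i in range(num_samples()):
+#       sample = get_sample(i)      # NumPy array with shape == dims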
+
+# Data
+np.random.seed(20191114)
+_num_samples = 31
+_sample_size = 31
+_samples = np.random.normal(size=(_num_samples,_sample_size)).astype(np.float32)
+
+# Sample access functions
+def get_sample(index):
+    return _samples[index,:]
+def num_samples():
+    return _num_samples
+def sample_dims():
+    return (_sample_size,)
+
+# ==============================================
+# NumPy layer norm
+# ==============================================
+
+def numpy_layer_norm(x, epsilon=1e-5):
+    if x.dtype != np.float64:
+        x = x.astype(np.float64)
+    mean = np.mean(x)
+    var = np.var(x, ddof=1)
+    return (x - mean) / np.sqrt(var + epsilon)
+
+# ==============================================
+# Setup LBANN experiment
+# ==============================================
+
+def setup_experiment(lbann):
+    """Construct LBANN experiment.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+    trainer = lbann.Trainer()
+    model = construct_model(lbann)
+    data_reader = construct_data_reader(lbann)
+    optimizer = lbann.NoOptimizer()
+    return trainer, model, data_reader, optimizer
+
+def construct_model(lbann):
+    """Construct LBANN model.
+
+    Args:
+        lbann (module): Module for LBANN Python frontend
+
+    """
+
+    # Input data
+    # Note: Sum with a weights layer so that gradient checking will
+    # verify that error signals are correct.
+    x_weights = lbann.Weights(optimizer=lbann.SGD(),
+                              initializer=lbann.ConstantInitializer(value=0.0),
+                              name='input_weights')
+    x = lbann.Sum(lbann.Reshape(lbann.Input(),
+                                dims=tools.str_list(_sample_size)),
+                  lbann.WeightsLayer(weights=x_weights,
+                                     dims=tools.str_list(_sample_size)))
+    x_lbann = x
+
+    # Objects for LBANN model
+    obj = []
+    metrics = []
+    callbacks = []
+
+    # ------------------------------------------
+    # Data-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    x = x_lbann
+    y = lbann.LayerNorm(x, data_layout='data_parallel')
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='data-parallel layout'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).astype(np.float64)
+        y = numpy_layer_norm(x)
+        z = tools.numpy_l2norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Model-parallel layout
+    # ------------------------------------------
+
+    # LBANN implementation
+    epsilon = 0.0123
+    x = x_lbann
+    y = lbann.LayerNorm(x, data_layout='model_parallel', epsilon=epsilon)
+    z = lbann.L2Norm2(y)
+    obj.append(z)
+    metrics.append(lbann.Metric(z, name='model-parallel layout'))
+
+    # NumPy implementation
+    vals = []
+    for i in range(num_samples()):
+        x = get_sample(i).astype(np.float64)
+        y = numpy_layer_norm(x, epsilon)
+        z = tools.numpy_l2norm2(y)
+        vals.append(z)
+    val = np.mean(vals)
+    tol = 8 * val * np.finfo(np.float32).eps
+    callbacks.append(lbann.CallbackCheckMetric(
+        metric=metrics[-1].name,
+        lower_bound=val-tol,
+        upper_bound=val+tol,
+        error_on_failure=True,
+        execution_modes='test'))
+
+    # ------------------------------------------
+    # Gradient checking
+    # ------------------------------------------
+
+    callbacks.append(lbann.CallbackCheckGradients(error_on_failure=True))
+
+    # ------------------------------------------
+    # Construct model
+    # ------------------------------------------
+
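+    # Assumption: splitting the data into two mini-batches exercises more
+    # than one step per evaluation; num_epochs can be 0 because the checks
+    # above run in 'test' mode.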
mini_batch_size = num_samples() // 2 + num_epochs = 0 + return lbann.Model(mini_batch_size, + num_epochs, + layers=lbann.traverse_layer_graph(x_lbann), + objective_function=obj, + metrics=metrics, + callbacks=callbacks) + +def construct_data_reader(lbann): + """Construct Protobuf message for Python data reader. + + The Python data reader will import the current Python file to + access the sample access functions. + + Args: + lbann (module): Module for LBANN Python frontend + + """ + + # Note: The training data reader should be removed when + # https://github.com/LLNL/lbann/issues/1098 is resolved. + message = lbann.reader_pb2.DataReader() + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'train' + ) + ]) + message.reader.extend([ + tools.create_python_data_reader( + lbann, + current_file, + 'get_sample', + 'num_samples', + 'sample_dims', + 'test' + ) + ]) + return message + +# ============================================== +# Setup PyTest +# ============================================== + +# Create test functions that can interact with PyTest +for test in tools.create_tests(setup_experiment, __file__): + globals()[test.__name__] = test diff --git a/include/lbann/layers/regularizers/CMakeLists.txt b/include/lbann/layers/regularizers/CMakeLists.txt index 726676d7426..15384770bc0 100644 --- a/include/lbann/layers/regularizers/CMakeLists.txt +++ b/include/lbann/layers/regularizers/CMakeLists.txt @@ -3,6 +3,7 @@ set_full_path(THIS_DIR_HEADERS batch_normalization.hpp dropout.hpp entrywise_batch_normalization.hpp + layer_norm.hpp local_response_normalization.hpp regularizer.hpp selu_dropout.hpp diff --git a/include/lbann/layers/regularizers/layer_norm.hpp b/include/lbann/layers/regularizers/layer_norm.hpp new file mode 100644 index 00000000000..b2c85b873be --- /dev/null +++ b/include/lbann/layers/regularizers/layer_norm.hpp @@ -0,0 +1,213 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED +#define LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" + +#include + +namespace lbann { + +/** @brief + * + * Each data sample is normalized to have zero mean and unit standard + * deviation. See: + * + * Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E. Hinton. "Layer + * normalization." 
arXiv preprint arXiv:1607.06450 (2016). + * + * Note that this layer does not apply an entry-wise scale and bias + * like in the paper. Use the entry-wise scale/bias layer to + * reproduce that functionality. + * + */ +template +class layer_norm_layer : public Layer { +public: + + /** + * @param comm LBANN communicator + * @param epsilon Small number to avoid division by zero + */ + layer_norm_layer(lbann_comm* comm, DataType epsilon=1e-5); + + layer_norm_layer(const layer_norm_layer& other); + layer_norm_layer& operator=(const layer_norm_layer& other); + layer_norm_layer* copy() const override; + + std::string get_type() const override; + data_layout get_data_layout() const override; + El::Device get_device_allocation() const override; + description get_description() const override; + +protected: + + void setup_dims() override; + void setup_matrices(const El::Grid& grid) override; + void fp_setup_outputs(El::Int mini_batch_size) override; + void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override; + + void fp_compute() override; + void bp_compute() override; + +private: + + /** Small number to avoid division by zero. */ + DataType m_epsilon; + + /** @brief Per-sample statistics. + * + * The means and variances are fused for performance. + */ + std::unique_ptr m_statistics; + /** @brief Gradients w.r.t. per-sample statistics. + * + * The means and variances are fused for performance. + */ + std::unique_ptr m_statistics_gradient; + +}; + +// ========================================================= +// Implementation +// ========================================================= + +template +layer_norm_layer::layer_norm_layer( + lbann_comm* comm, + DataType epsilon) + : Layer(comm), m_epsilon(epsilon) +{} + +template +layer_norm_layer::layer_norm_layer( + const layer_norm_layer& other) + : Layer(other), + m_epsilon(other.m_epsilon), + m_statistics(other.m_statistics + ? other.m_statistics->Copy() + : nullptr), + m_statistics_gradient(other.m_statistics_gradient + ? other.m_statistics_gradient->Copy() + : nullptr) +{} + +template +layer_norm_layer& layer_norm_layer::operator=( + const layer_norm_layer& other) { + Layer::operator=(other); + m_epsilon = other.m_epsilon; + m_statistics.reset(other.m_statistics + ? other.m_statistics->Copy() + : nullptr); + m_statistics_gradient.reset(other.m_statistics_gradient + ? 
other.m_statistics_gradient->Copy() + : nullptr); + return *this; +} + +template +layer_norm_layer* layer_norm_layer::copy() const { + return new layer_norm_layer(*this); +} + +template +std::string layer_norm_layer::get_type() const { + return "layer norm"; +} + +template +data_layout layer_norm_layer::get_data_layout() const { + return Layout; +} + +template +El::Device layer_norm_layer::get_device_allocation() const { + return Device; +} + +template +description layer_norm_layer::get_description() const { + auto desc = Layer::get_description(); + desc.add("Epsilon", m_epsilon); + return desc; +} + +template +void layer_norm_layer::setup_dims() { + Layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); +} + +template +void layer_norm_layer::setup_matrices(const El::Grid& grid) { + Layer::setup_matrices(grid); + auto dist = get_prev_activations().DistData(); + dist.colDist = El::STAR; + m_statistics.reset(AbsDistMat::Instantiate(dist)); + m_statistics_gradient.reset(AbsDistMat::Instantiate(dist)); +} + +template +void layer_norm_layer::fp_setup_outputs(El::Int mini_batch_size) { + Layer::fp_setup_outputs(mini_batch_size); + const auto& input = get_prev_activations(); + m_statistics->Empty(false); + m_statistics->AlignWith(input); + m_statistics->Resize(2, input.Width()); +} + +template +void layer_norm_layer::bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + Layer::bp_setup_gradient_wrt_inputs(mini_batch_size); + const auto& input = get_prev_activations(); + m_statistics_gradient->Empty(false); + m_statistics_gradient->AlignWith(input); + m_statistics_gradient->Resize(2, input.Width()); +} + +// ========================================================= +// Explicit template instantiation +// ========================================================= + +#ifndef LBANN_LAYER_NORM_LAYER_INSTANTIATE +extern template class layer_norm_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +extern template class layer_norm_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; +#ifdef LBANN_HAS_GPU +extern template class layer_norm_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +extern template class layer_norm_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; +#endif // LBANN_HAS_GPU +#endif // LBANN_LAYER_NORM_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED diff --git a/src/layers/regularizers/CMakeLists.txt b/src/layers/regularizers/CMakeLists.txt index 198de11ce86..7cd102426a7 100644 --- a/src/layers/regularizers/CMakeLists.txt +++ b/src/layers/regularizers/CMakeLists.txt @@ -3,6 +3,7 @@ set_full_path(THIS_DIR_SOURCES batch_normalization.cpp dropout.cpp entrywise_batch_normalization.cpp + layer_norm.cpp local_response_normalization.cpp selu_dropout.cpp ) @@ -12,6 +13,7 @@ if (LBANN_HAS_CUDA) set_full_path(THIS_DIR_CU_SOURCES batch_normalization.cu entrywise_batch_normalization.cu + layer_norm.cu ) endif () diff --git a/src/layers/regularizers/layer_norm.cpp b/src/layers/regularizers/layer_norm.cpp new file mode 100644 index 00000000000..8b31d5441fd --- /dev/null +++ b/src/layers/regularizers/layer_norm.cpp @@ -0,0 +1,226 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. 
+// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_LAYER_NORM_LAYER_INSTANTIATE +#include "lbann/layers/regularizers/layer_norm.hpp" + +namespace lbann { + +namespace { + +/** @brief Forward prop */ +void fp_impl(lbann_comm& comm, + DataType epsilon, + const AbsDistMat& input, + AbsDistMat& output, + AbsDistMat& statistics) { + + // Local matrices + const auto& local_input = dynamic_cast(input.LockedMatrix()); + auto& local_output = dynamic_cast(output.Matrix()); + auto& local_statistics = dynamic_cast(statistics.Matrix()); + auto local_means = El::LockedView(local_statistics, El::IR(0), El::ALL); + auto local_vars = El::LockedView(local_statistics, El::IR(1), El::ALL); + + // Dimensions + const El::Int sample_size = input.Height(); + const El::Int local_num_samples = local_input.Width(); + const El::Int local_sample_size = local_input.Height(); + + // Compute sums + El::Zero(statistics); + LBANN_OMP_PARALLEL_FOR + for (El::Int i = 0; i < local_num_samples; ++i) { + auto& sum = local_means(0,i); + auto& sqsum = local_vars(0,i); + for (El::Int j = 0; j < local_sample_size; ++j) { + const auto& x = local_input(j,i); + sum += x; + sqsum += x * x; + } + } + comm.allreduce(statistics, statistics.RedundantComm(), El::mpi::SUM); + + // Compute statistics from sums + // mean = sum(x_i) / n + // var = ( sum(x_i^2)/n - mean^2 ) * n/(n-1) + if (sample_size <= 1) { + // local_means already has correct values + El::Fill(local_vars, DataType{1}); + } + else { + LBANN_OMP_PARALLEL_FOR + for (El::Int i = 0; i < local_num_samples; ++i) { + const auto sum = local_means(0,i); + const auto sqsum = local_vars(0,i); + const auto& mean = sum / sample_size; + const auto& sqmean = sqsum / sample_size; + const auto& var = (sqmean - mean*mean) * sample_size / (sample_size-1); + local_means(0,i) = mean; + local_vars(0,i) = std::max(var, DataType{0}); + } + } + + // Apply layer norm + // y_i = (x_i - mean) / sqrt(var + epsilon) + for (El::Int i = 0; i < local_num_samples; ++i) { + const auto& mean = local_means(0,i); + const auto& var = local_vars(0,i); + const DataType inv_stdev = 1 / std::sqrt(var + epsilon); + for (El::Int j = 0; j < local_sample_size; ++j) { + const auto& x = local_input(j,i); + auto& y = local_output(j,i); + y = (x - mean) * inv_stdev; + } + } + +} + +/** @brief Backprop */ +void bp_impl(lbann_comm& comm, + DataType epsilon, + const AbsDistMat& input, + const AbsDistMat& output_grad, + AbsDistMat& input_grad, + const AbsDistMat& statistics, + AbsDistMat& statistics_grad) { + + // Local matrices + const auto& local_input = dynamic_cast(input.LockedMatrix()); + const auto& local_output_grad = dynamic_cast(output_grad.LockedMatrix()); + auto& local_input_grad = dynamic_cast(input_grad.Matrix()); + const auto& 
local_statistics = dynamic_cast(statistics.LockedMatrix()); + const auto local_means = El::LockedView(local_statistics, El::IR(0), El::ALL); + const auto local_vars = El::LockedView(local_statistics, El::IR(1), El::ALL); + auto& local_statistics_grad = dynamic_cast(statistics_grad.Matrix()); + auto local_means_grad = El::View(local_statistics_grad, El::IR(0), El::ALL); + auto local_vars_grad = El::View(local_statistics_grad, El::IR(1), El::ALL); + + // Dimensions + const El::Int sample_size = input.Height(); + const El::Int local_num_samples = local_input.Width(); + const El::Int local_sample_size = local_input.Height(); + + // Trivial case if sample size <= 1 + // Note: Output is constant, so error signal is zero. + if (sample_size <= 1) { + El::Zero(input_grad); + return; + } + + // Compute gradient w.r.t. statistics + // dL/dmean = - sum(dL/dy_i) / sqrt(var+epsilon) + // dL/dvar = - sum(dL/dy_i * (x_i-mean)) * (var+epsilon)^(-3/2) / 2 + El::Zero(statistics_grad); + LBANN_OMP_PARALLEL_FOR + for (El::Int i = 0; i < local_num_samples; ++i) { + const auto& mean = local_means(0,i); + const auto& var = local_vars(0,i); + const DataType inv_stdev = 1 / std::sqrt(var + epsilon); + auto& dmean = local_means_grad(0,i); + auto& dvar = local_vars_grad(0,i); + for (El::Int j = 0; j < local_sample_size; ++j) { + const auto& x = local_input(j,i); + const auto& dy = local_output_grad(j,i); + dmean += dy; + dvar += dy * (x - mean); + } + dmean *= -inv_stdev; + dvar *= -inv_stdev*inv_stdev*inv_stdev / 2; + } + comm.allreduce(statistics_grad, + statistics_grad.RedundantComm(), + El::mpi::SUM); + + // Compute gradient w.r.t. input + // dL/dx_i = ( dL/dy_i / sqrt(var+epsilon) + // + dL/dmean / n + // + dL/dvar * (x_i - mean) * 2/(n-1) ) + LBANN_OMP_PARALLEL_FOR + for (El::Int i = 0; i < local_num_samples; ++i) { + const auto& mean = local_means(0,i); + const auto& var = local_vars(0,i); + const DataType inv_stdev = 1 / std::sqrt(var + epsilon); + const auto& dmean = local_means_grad(0,i); + const auto& dvar = local_vars_grad(0,i); + for (El::Int j = 0; j < local_sample_size; ++j) { + const auto& x = local_input(j,i); + const auto& dy = local_output_grad(j,i); + auto& dx = local_input_grad(j,i); + dx = (dy * inv_stdev + + dmean / sample_size + + dvar * (x - mean) * 2 / (sample_size - 1)); + } + } + +} + +} // namespace + +// Template instantiation +template <> +void layer_norm_layer::fp_compute() { + fp_impl(*get_comm(), + m_epsilon, + get_prev_activations(), + get_activations(), + *m_statistics); +} +template <> +void layer_norm_layer::fp_compute() { + fp_impl(*get_comm(), + m_epsilon, + get_prev_activations(), + get_activations(), + *m_statistics); +} +template <> +void layer_norm_layer::bp_compute() { + bp_impl(*get_comm(), + m_epsilon, + get_prev_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_statistics, + *m_statistics_gradient); +} +template <> +void layer_norm_layer::bp_compute() { + bp_impl(*get_comm(), + m_epsilon, + get_prev_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_statistics, + *m_statistics_gradient); +} + +template class layer_norm_layer< + data_layout::DATA_PARALLEL, El::Device::CPU>; +template class layer_norm_layer< + data_layout::MODEL_PARALLEL, El::Device::CPU>; + +} // namespace lbann diff --git a/src/layers/regularizers/layer_norm.cu b/src/layers/regularizers/layer_norm.cu new file mode 100644 index 00000000000..21dd0093063 --- /dev/null +++ b/src/layers/regularizers/layer_norm.cu @@ -0,0 +1,481 @@ 
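+// Overview: the CUDA implementation mirrors layer_norm.cpp. fp_sums_kernel
+// accumulates per-sample sums, fp_statistics_kernel converts them to
+// mean/variance, fp_output_kernel normalizes each entry, and the bp_*
+// kernels apply the corresponding chain-rule terms.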
+//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#define LBANN_LAYER_NORM_LAYER_INSTANTIATE +#include "lbann/layers/regularizers/layer_norm.hpp" +#include "lbann/utils/cuda.hpp" + +#include + +namespace lbann { + +namespace { + +/** Functor for adding @c thrust::pair objects. */ +template +struct pair_sum { + __device__ __forceinline__ + Pair operator()(const Pair& x, const Pair& y) { + return Pair(x.first+y.first, x.second+y.second); + } +}; + +/** Accumulate sums and sums of squares for each data sample. + * + * On input, sums and sqsums are filled with zeros. + * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (local_sample_size / bsize) x local_num_samples x 1 + */ +template +__global__ void fp_sums_kernel( + size_t local_num_samples, + size_t local_sample_size, + const DataType* __restrict__ vals, + size_t vals_ldim, + DataType* sums, + size_t sums_stride, + DataType* sqsums, + size_t sqsums_stride) { + + // Indices and dimensions + constexpr size_t bdimy = 1; + constexpr size_t bdimz = 1; + const size_t tid = threadIdx.x + blockDim.x * threadIdx.y; + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nthreadsy = blockDim.y * gridDim.y; + + for (size_t i = gidy; i < local_num_samples; i += nthreadsy) { + + // Accumulate sums and perform block-wide reduction + using pair_t = thrust::pair; + using pair_sum_t = pair_sum; + pair_t sum_sqsum(0,0); + for (size_t j = gidx; j < local_sample_size; j += nthreadsx) { + const auto& x = vals[i*vals_ldim + j]; + sum_sqsum.first += x; + sum_sqsum.second += x * x; + } + sum_sqsum = cuda::block_reduce(sum_sqsum); + + // Output result to global memory + if (tid == 0) { + cuda::atomic_add(&sums[i*sums_stride], sum_sqsum.first); + cuda::atomic_add(&sqsums[i*sqsums_stride], sum_sqsum.second); + } + + } + +} + +/** Compute per-sample statistics. + * + * mean = sum(x_i) / n + * + * var = ( sum(x_i^2)/n - mean^2 ) * n/(n-1) + * + * On input, means contains per-sample sums and vars contains + * per-sample sums of squares. 
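+ *
+ * For example, a sample {1, 2, 3, 4} has sum = 10 and sqsum = 30, so
+ * mean = 2.5, sqmean = 7.5, and var = (7.5 - 2.5^2) * 4/3 = 5/3, which
+ * matches the unbiased sample variance of those four values.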
+ *
+ * Block dimensions: bsize x 1 x 1
+ *
+ * Grid dimensions: (local_num_samples / bsize) x 1 x 1
+ */
+__global__ void fp_statistics_kernel(
+  size_t sample_size,
+  size_t local_num_samples,
+  DataType* means,
+  size_t means_stride,
+  DataType* vars,
+  size_t vars_stride) {
+
+  const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
+  const size_t nthreads = blockDim.x * gridDim.x;
+  for (size_t i = gid; i < local_num_samples; i += nthreads) {
+    const auto sum = means[i*means_stride];
+    const auto sqsum = vars[i*vars_stride];
+    const auto& mean = sum / sample_size;
+    const auto& sqmean = sqsum / sample_size;
+    const auto& var = (sqmean - mean*mean) * sample_size / (sample_size-1);
+    means[i*means_stride] = mean;
+    vars[i*vars_stride] = cuda::max(var, DataType{0});
+  }
+
+}
+
+/** Compute outputs.
+ *
+ * y_i = (x_i - mean) / sqrt(var + epsilon)
+ *
+ * Block dimensions: bdimx x bdimy x 1
+ *
+ * Grid dimensions: (local_sample_size / bdimx) x (local_num_samples / bdimy) x 1
+ */
+__global__ void fp_output_kernel(
+  size_t local_num_samples,
+  size_t local_sample_size,
+  DataType epsilon,
+  const DataType* __restrict__ input,
+  size_t input_ldim,
+  DataType* __restrict__ output,
+  size_t output_ldim,
+  const DataType* means,
+  size_t means_stride,
+  const DataType* vars,
+  size_t vars_stride) {
+  const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
+  const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y;
+  const size_t nthreadsx = blockDim.x * gridDim.x;
+  const size_t nthreadsy = blockDim.y * gridDim.y;
+  for (size_t i = gidy; i < local_num_samples; i += nthreadsy) {
+    const auto& mean = means[i*means_stride];
+    const auto& var = vars[i*vars_stride];
+    const auto& inv_stdev = cuda::rsqrt(var + epsilon);
+    for (size_t j = gidx; j < local_sample_size; j += nthreadsx) {
+      const auto& x = input[i*input_ldim + j];
+      auto& y = output[i*output_ldim + j];
+      y = (x - mean) * inv_stdev;
+    }
+  }
+
+}
+
+/** @brief Forward prop */
+void fp_impl(lbann_comm& comm,
+             DataType epsilon,
+             const AbsDistMat& input,
+             AbsDistMat& output,
+             AbsDistMat& statistics) {
+
+  // Local matrices
+  const auto& local_input = dynamic_cast<const GPUMat&>(input.LockedMatrix());
+  auto& local_output = dynamic_cast<GPUMat&>(output.Matrix());
+  auto& local_statistics = dynamic_cast<GPUMat&>(statistics.Matrix());
+  auto local_means = El::View(local_statistics, El::IR(0), El::ALL);
+  auto local_vars = El::View(local_statistics, El::IR(1), El::ALL);
+
+  // Dimensions
+  const size_t sample_size = input.Height();
+  const size_t local_num_samples = local_input.Width();
+  const size_t local_sample_size = local_input.Height();
+
+  // Trivial cases
+  if (local_num_samples < 1) { return; }
+
+  // Compute sums
+  El::Zero(statistics);
+  if (!local_input.IsEmpty()) {
+    constexpr size_t block_size = 256;
+    dim3 block_dims, grid_dims;
+    block_dims.x = block_size;
+    grid_dims.x = (local_sample_size + block_size - 1) / block_size;
+    grid_dims.y = local_num_samples;
+    fp_sums_kernel<block_size>
+      <<<grid_dims, block_dims, 0, El::GPUManager::Stream()>>>(
+        local_num_samples, local_sample_size,
+        local_input.LockedBuffer(), local_input.LDim(),
+        local_means.Buffer(), local_means.LDim(),
+        local_vars.Buffer(), local_vars.LDim());
+  }
+  comm.allreduce(statistics, statistics.RedundantComm(), El::mpi::SUM);
+
+  // Compute statistics from sums
+  if (sample_size <= 1) {
+    // local_means already has correct values
+    El::Fill(local_vars, DataType{1});
+  }
+  else if (!local_statistics.IsEmpty()) {
+    constexpr size_t block_size = 256;
+    dim3 block_dims, grid_dims;
+    block_dims.x = block_size;
+    grid_dims.x =
(local_num_samples + block_size - 1) / block_size; + fp_statistics_kernel<<>>( + sample_size, local_num_samples, + local_means.Buffer(), local_means.LDim(), + local_vars.Buffer(), local_vars.LDim()); + } + + // Apply layer norm + if (!local_output.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_sample_size + block_size - 1) / block_size; + grid_dims.y = local_num_samples; + fp_output_kernel<<>>( + local_num_samples, local_sample_size, epsilon, + local_input.LockedBuffer(), local_input.LDim(), + local_output.Buffer(), local_output.LDim(), + local_means.LockedBuffer(), local_means.LDim(), + local_vars.LockedBuffer(), local_vars.LDim()); + } + +} + +/** Compute gradients w.r.t. per-sample statistics. + * + * dL/dmean = - sum(dL/dy_i) / sqrt(var+epsilon) + * + * dL/dvar = - sum(dL/dy_i * (x_i-mean)) * (var+epsilon)^(-3/2) / 2 + * + * On input, means_grad and vars_grad are filled with zeros. + * + * Block dimensions: bsize x 1 x 1 + * + * Grid dimensions: (local_sample_size / bsize) x local_num_samples x 1 + */ +template +__global__ void bp_statistics_grad_kernel( + size_t local_num_samples, + size_t local_sample_size, + DataType epsilon, + const DataType* __restrict__ input, + size_t input_ldim, + const DataType* __restrict__ output_grad, + size_t output_grad_ldim, + const DataType* means, + size_t means_stride, + const DataType* vars, + size_t vars_stride, + DataType* means_grad, + size_t means_grad_stride, + DataType* vars_grad, + size_t vars_grad_stride) { + + // Indices and dimensions + constexpr size_t bdimy = 1; + constexpr size_t bdimz = 1; + const size_t tid = threadIdx.x + blockDim.x * threadIdx.y; + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nthreadsy = blockDim.y * gridDim.y; + + for (size_t i = gidy; i < local_num_samples; i += nthreadsy) { + + // Accumulate sums and perform block-wide reduction + using pair_t = thrust::pair; + using pair_sum_t = pair_sum; + pair_t sums(0,0); + const auto& mean = means[i*means_stride]; + for (size_t j = gidx; j < local_sample_size; j += nthreadsx) { + const auto& x = input[i*input_ldim + j]; + const auto& dy = output_grad[i*output_grad_ldim + j]; + sums.first += dy; + sums.second += dy * (x - mean); + } + sums = cuda::block_reduce(sums); + + // Output result to global memory + if (tid == 0) { + const auto& var = vars[i*vars_stride]; + const auto& inv_stdev = cuda::rsqrt(var + epsilon); + const DataType dmean = -sums.first * inv_stdev; + const DataType dvar = -sums.second * inv_stdev*inv_stdev*inv_stdev / 2; + cuda::atomic_add(&means_grad[i*means_grad_stride], dmean); + cuda::atomic_add(&vars_grad[i*vars_grad_stride], dvar); + } + + } + +} + +/** Compute gradients w.r.t. input. 
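+ *
+ * (Derivation: with y_i = (x_i - mean) / sqrt(var + epsilon), the chain
+ * rule gives dmean/dx_i = 1/n and dvar/dx_i = 2 (x_i - mean) / (n-1),
+ * which produces the three terms below.)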
+ * + * dL/dx_i = ( dL/dy_i / sqrt(var+epsilon) + * + dL/dmean / n + * + dL/dvar * (x_i - mean) * 2/(n-1) ) + * + * Block dimensions: bdimx x bdimy x 1 + * + * Grid dimensions: (local_sample_size / bdimx) x (local_num_samples / bdimy) x 1 + */ +__global__ void bp_input_grad_kernel( + size_t sample_size, + size_t local_num_samples, + size_t local_sample_size, + DataType epsilon, + const DataType* __restrict__ input, + size_t input_ldim, + const DataType* __restrict__ output_grad, + size_t output_grad_ldim, + DataType* __restrict__ input_grad, + size_t input_grad_ldim, + const DataType* __restrict__ means, + size_t means_stride, + const DataType* __restrict__ vars, + size_t vars_stride, + const DataType* means_grad, + size_t means_grad_stride, + const DataType* vars_grad, + size_t vars_grad_stride) { + + const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x; + const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y; + const size_t nthreadsx = blockDim.x * gridDim.x; + const size_t nthreadsy = blockDim.y * gridDim.y; + for (size_t i = gidy; i < local_num_samples; i += nthreadsy) { + const auto& mean = means[i*means_stride]; + const auto& var = vars[i*vars_stride]; + const auto& inv_stdev = cuda::rsqrt(var + epsilon); + const auto& dmean = means_grad[i*means_grad_stride]; + const auto& dvar = vars_grad[i*vars_grad_stride]; + for (size_t j = gidx; j < local_sample_size; j += nthreadsx) { + const auto& x = input[i*input_ldim + j]; + const auto& dy = output_grad[i*output_grad_ldim + j]; + auto& dx = input_grad[i*input_grad_ldim + j]; + dx = (dy * inv_stdev + + dmean / sample_size + + dvar * (x - mean) * 2 / (sample_size - 1)); + } + } + +} + +/** @brief Backprop */ +void bp_impl(lbann_comm& comm, + DataType epsilon, + const AbsDistMat& input, + const AbsDistMat& output_grad, + AbsDistMat& input_grad, + const AbsDistMat& statistics, + AbsDistMat& statistics_grad) { + + // Local matrices + const auto& local_input = dynamic_cast(input.LockedMatrix()); + const auto& local_output_grad = dynamic_cast(output_grad.LockedMatrix()); + auto& local_input_grad = dynamic_cast(input_grad.Matrix()); + const auto& local_statistics = dynamic_cast(statistics.LockedMatrix()); + const auto local_means = El::LockedView(local_statistics, El::IR(0), El::ALL); + const auto local_vars = El::LockedView(local_statistics, El::IR(1), El::ALL); + auto& local_statistics_grad = dynamic_cast(statistics_grad.Matrix()); + auto local_means_grad = El::View(local_statistics_grad, El::IR(0), El::ALL); + auto local_vars_grad = El::View(local_statistics_grad, El::IR(1), El::ALL); + + // Dimensions + const size_t sample_size = input.Height(); + const size_t local_num_samples = local_input.Width(); + const size_t local_sample_size = local_input.Height(); + + // Trivial case if sample size <= 1 + // Note: Output is constant, so error signal is zero. + if (sample_size <= 1) { + El::Zero(input_grad); + return; + } + + // Compute gradient w.r.t. 
statistics + El::Zero(statistics_grad); + if (!local_output_grad.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_sample_size + block_size - 1) / block_size; + grid_dims.y = local_num_samples; + bp_statistics_grad_kernel + <<>>( + local_num_samples, local_sample_size, epsilon, + local_input.LockedBuffer(), local_input.LDim(), + local_output_grad.LockedBuffer(), local_output_grad.LDim(), + local_means.LockedBuffer(), local_means.LDim(), + local_vars.LockedBuffer(), local_vars.LDim(), + local_means_grad.Buffer(), local_means_grad.LDim(), + local_vars_grad.Buffer(), local_vars_grad.LDim()); + } + comm.allreduce(statistics_grad, + statistics_grad.RedundantComm(), + El::mpi::SUM); + + // Compute gradient w.r.t. input + if (!local_input_grad.IsEmpty()) { + constexpr size_t block_size = 256; + dim3 block_dims, grid_dims; + block_dims.x = block_size; + grid_dims.x = (local_sample_size + block_size - 1) / block_size; + grid_dims.y = local_num_samples; + bp_input_grad_kernel + <<>>( + sample_size, local_num_samples, local_sample_size, epsilon, + local_input.LockedBuffer(), local_input.LDim(), + local_output_grad.LockedBuffer(), local_output_grad.LDim(), + local_input_grad.Buffer(), local_input_grad.LDim(), + local_means.LockedBuffer(), local_means.LDim(), + local_vars.LockedBuffer(), local_vars.LDim(), + local_means_grad.LockedBuffer(), local_means_grad.LDim(), + local_vars_grad.LockedBuffer(), local_vars_grad.LDim()); + } + +} + +} // namespace + +// Template instantiation +template <> +void layer_norm_layer::fp_compute() { + fp_impl(*get_comm(), + m_epsilon, + get_prev_activations(), + get_activations(), + *m_statistics); +} +template <> +void layer_norm_layer::fp_compute() { + fp_impl(*get_comm(), + m_epsilon, + get_prev_activations(), + get_activations(), + *m_statistics); +} +template <> +void layer_norm_layer::bp_compute() { + bp_impl(*get_comm(), + m_epsilon, + get_prev_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_statistics, + *m_statistics_gradient); +} +template <> +void layer_norm_layer::bp_compute() { + bp_impl(*get_comm(), + m_epsilon, + get_prev_activations(), + get_prev_error_signals(), + get_error_signals(), + *m_statistics, + *m_statistics_gradient); +} + +template class layer_norm_layer< + data_layout::DATA_PARALLEL, El::Device::GPU>; +template class layer_norm_layer< + data_layout::MODEL_PARALLEL, El::Device::GPU>; + +} // namespace lbann diff --git a/src/proto/factories/layer_factory.cpp b/src/proto/factories/layer_factory.cpp index 1c7549b98f9..f351b7e1b24 100644 --- a/src/proto/factories/layer_factory.cpp +++ b/src/proto/factories/layer_factory.cpp @@ -71,6 +71,7 @@ #include "lbann/layers/regularizers/regularizer.hpp" #include "lbann/layers/regularizers/selu_dropout.hpp" #include "lbann/layers/regularizers/entrywise_batch_normalization.hpp" +#include "lbann/layers/regularizers/layer_norm.hpp" #include "lbann/layers/transform/bernoulli.hpp" #include "lbann/layers/transform/categorical_random.hpp" #include "lbann/layers/transform/concatenation.hpp" @@ -551,6 +552,13 @@ std::unique_ptr construct_layer( const auto& params = proto_layer.entrywise_batch_normalization(); return lbann::make_unique>(comm, params.decay(), params.epsilon()); } + if (proto_layer.has_layer_norm()) { + const auto& params = proto_layer.layer_norm(); + const double epsilon = (params.has_epsilon() + ? 
params.epsilon().value() + : 1e-5); + return lbann::make_unique>(comm, epsilon); + } // Math layers CONSTRUCT_LAYER(logical_not); diff --git a/src/proto/layers.proto b/src/proto/layers.proto index 9342401d11a..bb7de80503a 100644 --- a/src/proto/layers.proto +++ b/src/proto/layers.proto @@ -155,6 +155,7 @@ message Layer { Dropout dropout = 21; SeluDropout selu_dropout = 229; EntrywiseBatchNormalization entrywise_batch_normalization = 230; + LayerNorm layer_norm = 231; // Activation layers Elu elu = 200; @@ -330,6 +331,25 @@ message Layer { double keep_prob = 2; //default: 0.5 } + /** @brief + * + * Each data sample is normalized to have zero mean and unit + * standard deviation. See: + * + * Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E. Hinton. "Layer + * normalization." arXiv preprint arXiv:1607.06450 (2016). + * + * Note that this layer does not apply an entry-wise scale and bias + * like in the paper. Use the entry-wise scale/bias layer to + * reproduce that functionality. + */ + message LayerNorm { + /** @brief Small number to avoid division by zero. + * @details Default is 1e-5. + */ + google.protobuf.DoubleValue epsilon = 1; + } + ////////////////// // Input layers // ////////////////// From ef3d008b23bf6c62571c2cc004fe0b36bd4e922b Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 20 Nov 2019 15:24:02 -0800 Subject: [PATCH 403/634] Refactor core components in Python frontend (#1355) * Consolidate LBANN components in Python frontend in 'core' submodule * Move Python frontend class generator to core submodule --- python/lbann/__init__.py | 16 ++++++++-------- python/lbann/core/__init__.py | 6 ++++++ python/lbann/{ => core}/callback.py | 4 ++-- python/lbann/{ => core}/layer.py | 10 +++++----- python/lbann/{ => core}/metric.py | 1 - python/lbann/{ => core}/model.py | 9 ++++----- python/lbann/{ => core}/objective_function.py | 4 ++-- python/lbann/{ => core}/optimizer.py | 8 +++++--- python/lbann/{ => core}/trainer.py | 3 +-- .../{util/class_generator.py => core/util.py} | 7 ++++++- python/lbann/{ => core}/weights.py | 6 +++--- python/lbann/util/__init__.py | 6 +++--- 12 files changed, 45 insertions(+), 35 deletions(-) create mode 100644 python/lbann/core/__init__.py rename python/lbann/{ => core}/callback.py (84%) rename python/lbann/{ => core}/layer.py (96%) rename python/lbann/{ => core}/metric.py (98%) rename python/lbann/{ => core}/model.py (90%) rename python/lbann/{ => core}/objective_function.py (97%) rename python/lbann/{ => core}/optimizer.py (70%) rename python/lbann/{ => core}/trainer.py (94%) rename python/lbann/{util/class_generator.py => core/util.py} (98%) rename python/lbann/{ => core}/weights.py (89%) diff --git a/python/lbann/__init__.py b/python/lbann/__init__.py index 36a19e07b15..48c19698a40 100644 --- a/python/lbann/__init__.py +++ b/python/lbann/__init__.py @@ -25,12 +25,12 @@ def lbann_exe(): return _lbann_exe if _lbann_exe else 'lbann' # Import core functionality into lbann namespace -from lbann.callback import * -from lbann.layer import * -from lbann.metric import * -from lbann.trainer import * -from lbann.model import * -from lbann.objective_function import * -from lbann.optimizer import * -from lbann.weights import * +from lbann.core.callback import * +from lbann.core.layer import * +from lbann.core.metric import * +from lbann.core.model import * +from lbann.core.objective_function import * +from lbann.core.optimizer import * +from lbann.core.trainer import * +from lbann.core.weights import * from lbann.launcher import run diff --git 
a/python/lbann/core/__init__.py b/python/lbann/core/__init__.py new file mode 100644 index 00000000000..b7d8cebf6b7 --- /dev/null +++ b/python/lbann/core/__init__.py @@ -0,0 +1,6 @@ +"""Core components in LBANN. + +Most objects in this namespace correspond to C++ classes in LBANN. +Most are autogenerated from the Protobuf frontend. + +""" diff --git a/python/lbann/callback.py b/python/lbann/core/callback.py similarity index 84% rename from python/lbann/callback.py rename to python/lbann/core/callback.py index 8319a0fe4b4..18cf4dc790a 100644 --- a/python/lbann/callback.py +++ b/python/lbann/core/callback.py @@ -1,7 +1,7 @@ """Callbacks for neural network training.""" import abc from lbann import callbacks_pb2 -import lbann.util.class_generator +import lbann.core.util class Callback(abc.ABC): """Callback for neural network training.""" @@ -16,7 +16,7 @@ def export_proto(self): # Generate Callback sub-classes from lbann.proto # Note: The list of skip fields must be updated if any new fields are # added to the Callback message in lbann.proto -classes = lbann.util.class_generator.generate_classes_from_protobuf_message( +classes = lbann.core.util.generate_classes_from_protobuf_message( callbacks_pb2.Callback, base_class = Callback, base_has_export_proto = True) diff --git a/python/lbann/layer.py b/python/lbann/core/layer.py similarity index 96% rename from python/lbann/layer.py rename to python/lbann/core/layer.py index 0bb94bcfdfb..c5acbd70391 100644 --- a/python/lbann/layer.py +++ b/python/lbann/core/layer.py @@ -1,8 +1,8 @@ """Neural network tensor operations.""" import abc -import lbann +from lbann import layers_pb2 from lbann.util import make_iterable -import lbann.util.class_generator +import lbann.core.util class Layer(abc.ABC): """Neural network tensor operation. 
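 
     (Each instance is exported as a single `layers_pb2.Layer` protobuf
     message; see `export_proto` below.)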
@@ -51,7 +51,7 @@ def __init__(self, def export_proto(self): """Construct and return a protobuf message.""" - proto = lbann.layers_pb2.Layer() + proto = layers_pb2.Layer() proto.parents = ' '.join([l.name for l in self.parents]) proto.children = ' '.join([l.name for l in self.children]) proto.weights = ' '.join([w.name for w in self.weights]) @@ -91,8 +91,8 @@ def __call__(self, parent): # Generate Layer sub-classes from lbann.proto # Note: The list of skip fields must be updated if any new fields are # added to the Layer message in lbann.proto -classes = lbann.util.class_generator.generate_classes_from_protobuf_message( - lbann.layers_pb2.Layer, +classes = lbann.core.util.generate_classes_from_protobuf_message( + layers_pb2.Layer, skip_fields = set([ 'name', 'parents', 'children', 'data_layout', 'device_allocation', 'weights', 'num_neurons_from_data_reader', 'freeze', 'hint_layer', diff --git a/python/lbann/metric.py b/python/lbann/core/metric.py similarity index 98% rename from python/lbann/metric.py rename to python/lbann/core/metric.py index e9528659def..69a50afe179 100644 --- a/python/lbann/metric.py +++ b/python/lbann/core/metric.py @@ -1,5 +1,4 @@ """Neural network tensor operations.""" -import abc from lbann import metrics_pb2 class Metric: diff --git a/python/lbann/model.py b/python/lbann/core/model.py similarity index 90% rename from python/lbann/model.py rename to python/lbann/core/model.py index a6e4572e87b..ba287e773d5 100644 --- a/python/lbann/model.py +++ b/python/lbann/core/model.py @@ -1,9 +1,8 @@ """Neural network model.""" -import abc from lbann import model_pb2 from lbann.util import make_iterable -import lbann.layer -import lbann.objective_function +import lbann.core.layer +import lbann.core.objective_function class Model: """Neural network model.""" @@ -19,7 +18,7 @@ def __init__(self, mini_batch_size, epochs, self.random_seed = random_seed self.summary_dir = summary_dir # Get connected layers - self.layers = list(lbann.layer.traverse_layer_graph(layers)) + self.layers = list(lbann.core.layer.traverse_layer_graph(layers)) # Get weights associated with layers self.weights = set(make_iterable(weights)) @@ -27,7 +26,7 @@ def __init__(self, mini_batch_size, epochs, self.weights.update(l.weights) # Construct objective function if needed - obj_type = lbann.objective_function.ObjectiveFunction + obj_type = lbann.core.objective_function.ObjectiveFunction if isinstance(objective_function, obj_type): self.objective_function = objective_function elif objective_function is None: diff --git a/python/lbann/objective_function.py b/python/lbann/core/objective_function.py similarity index 97% rename from python/lbann/objective_function.py rename to python/lbann/core/objective_function.py index 473a25632a3..df58cdf54a6 100644 --- a/python/lbann/objective_function.py +++ b/python/lbann/core/objective_function.py @@ -1,7 +1,7 @@ import abc from lbann import objective_functions_pb2 from lbann.util import make_iterable -import lbann.layer +import lbann.core.layer # Note: Currently, only layer terms and L2 weight regularization terms # are supported in LBANN. If more terms are added, it may be @@ -59,7 +59,7 @@ def add_term(self, term): constructed and added to the objective function. 
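 
         A hypothetical example::
 
             obj = ObjectiveFunction()
             obj.add_term(loss_layer)  # a Layer; wrapped in a LayerTerm
             obj.add_term(L2WeightRegularization(scale_factor=1e-4))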
""" - if isinstance(term, lbann.layer.Layer): + if isinstance(term, lbann.core.layer.Layer): term = LayerTerm(term) self.terms.append(term) diff --git a/python/lbann/optimizer.py b/python/lbann/core/optimizer.py similarity index 70% rename from python/lbann/optimizer.py rename to python/lbann/core/optimizer.py index fd5a9ab402e..e844dfce188 100644 --- a/python/lbann/optimizer.py +++ b/python/lbann/core/optimizer.py @@ -1,7 +1,9 @@ +import abc from lbann import optimizers_pb2 -import lbann.util.class_generator +import lbann.core.util -class Optimizer: +class Optimizer(abc.ABC): + """Optimization algorithm for a neural network's parameters.""" def export_proto(self): """Construct and return a protobuf message.""" return optimizers_pb2.Optimizer() @@ -9,7 +11,7 @@ def export_proto(self): # Generate Optimizer sub-classes from lbann.proto # Note: The list of skip fields must be updated if any new fields are # added to the Optimizer message in lbann.proto -classes = lbann.util.class_generator.generate_classes_from_protobuf_message( +classes = lbann.core.util.generate_classes_from_protobuf_message( optimizers_pb2.Optimizer, base_class = Optimizer, base_has_export_proto = True) diff --git a/python/lbann/trainer.py b/python/lbann/core/trainer.py similarity index 94% rename from python/lbann/trainer.py rename to python/lbann/core/trainer.py index 6e917add9ab..459e11500e2 100644 --- a/python/lbann/trainer.py +++ b/python/lbann/core/trainer.py @@ -1,10 +1,9 @@ """LBANN Trainer.""" -import abc from lbann import trainer_pb2 from lbann.util import make_iterable class Trainer: - """LBANN Trainer.""" + """Manages the training of a neural network model.""" def __init__(self, name=None, diff --git a/python/lbann/util/class_generator.py b/python/lbann/core/util.py similarity index 98% rename from python/lbann/util/class_generator.py rename to python/lbann/core/util.py index 4e77d20f842..525a72c2e9e 100644 --- a/python/lbann/util/class_generator.py +++ b/python/lbann/core/util.py @@ -1,4 +1,9 @@ -"""Utility functions to generate classes from Protobuf messages.""" +"""Utility functions for core LBANN functionality. + +This submodule mostly contains helper functions to generate classes +from Protobuf messages. + +""" import google.protobuf.descriptor import google.protobuf.wrappers_pb2 from lbann import lbann_pb2, callbacks_pb2, layers_pb2, metrics_pb2, model_pb2, objective_functions_pb2, optimizers_pb2, weights_pb2 diff --git a/python/lbann/weights.py b/python/lbann/core/weights.py similarity index 89% rename from python/lbann/weights.py rename to python/lbann/core/weights.py index 3fe24af164a..da7cfc1f91e 100644 --- a/python/lbann/weights.py +++ b/python/lbann/core/weights.py @@ -1,7 +1,7 @@ """Trainable model parameters.""" import abc from lbann import weights_pb2 -import lbann.util.class_generator +import lbann.core.util class Initializer(abc.ABC): """Initialization scheme for `Weights`.""" @@ -10,7 +10,7 @@ def export_proto(self): return weights_pb2.Initializer() # Generate Initializer sub-classes from weights.proto. 
-classes = lbann.util.class_generator.generate_classes_from_protobuf_message( +classes = lbann.core.util.generate_classes_from_protobuf_message( weights_pb2.Initializer, base_class = Initializer, base_has_export_proto = True) @@ -18,7 +18,7 @@ def export_proto(self): globals()[c.__name__] = c class Weights: - """Trainable model parameters.""" + """Trainable parameters for neural network.""" global_count = 0 # Static counter, used for default names diff --git a/python/lbann/util/__init__.py b/python/lbann/util/__init__.py index 04e36dcffbb..71dda82829f 100644 --- a/python/lbann/util/__init__.py +++ b/python/lbann/util/__init__.py @@ -1,13 +1,13 @@ -from collections.abc import Iterable +import collections.abc def make_iterable(obj): """Convert to an iterable object. Simply returns `obj` if it is alredy iterable. Otherwise returns a - 1-tuple containing `obj`. + 1-tuple containing `obj`. `str`s are treated as _not_ iterable. """ - if isinstance(obj, Iterable) and not isinstance(obj, str): + if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): return obj else: return (obj,) From 8bbd4429be430c6592b815c47ea3070db7b9f09e Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 20 Nov 2019 15:24:21 -0800 Subject: [PATCH 404/634] Reorganize layer modules in Python frontend into directory (#1354) --- python/lbann/modules.py | 524 ------------------------------- python/lbann/modules/__init__.py | 10 + python/lbann/modules/base.py | 233 ++++++++++++++ python/lbann/modules/rnn.py | 300 ++++++++++++++++++ python/lbann/util/__init__.py | 4 + 5 files changed, 547 insertions(+), 524 deletions(-) delete mode 100644 python/lbann/modules.py create mode 100644 python/lbann/modules/__init__.py create mode 100644 python/lbann/modules/base.py create mode 100644 python/lbann/modules/rnn.py diff --git a/python/lbann/modules.py b/python/lbann/modules.py deleted file mode 100644 index 72404b1d745..00000000000 --- a/python/lbann/modules.py +++ /dev/null @@ -1,524 +0,0 @@ -"""Neural network modules. - -These are a convenience for common layer patterns that are often the -basic building blocks for larger models. - -""" - -from collections.abc import Iterable -import warnings -from math import sqrt -import lbann -from lbann.util import make_iterable - -def _str_list(l): - """Convert an iterable object to a space-separated string.""" - return ' '.join(str(i) for i in make_iterable(l)) - -class Module: - """Base class for neural network modules. - - A module is a pattern of layers that can be added to a layer - graph, possibly multiple times. The pattern typically takes a set - of input layers and obtains a set of output layers. - - """ - - def __init__(self): - pass - - def forward(self, *args, **kwargs): - """Apply module pattern. - - A module pattern typically takes a set of `Layer`s as input - and returns a set of `Layer`s. - - """ - # Should be overridden in all sub-classes - raise NotImplementedError - - def __call__(self, *args, **kwargs): - """Apply module mattern to `input`. - - Syntatic sugar around `forward` function. - - """ - return self.forward(*args, **kwargs) - -class FullyConnectedModule(Module): - """Basic block for fully-connected neural networks. - - Applies a dense linearity and a nonlinear activation function. - - """ - - global_count = 0 # Static counter, used for default names - - def __init__(self, size, bias=True, weights=[], activation=None, - name=None, data_layout='data_parallel'): - """Initialize fully-connected module. - - Args: - size (int): Size of output tensor. 
- activation (type): Layer class for activation function. - bias (bool): Whether to apply bias after linearity. - weights (`Weights` or iterator of `Weights`): Weights in - fully-connected layer. There are at most two: the - matrix and the bias. If weights are not provided, the - matrix will be initialized with He normal - initialization and the bias with zeros. - name (str): Default name is in the form 'fcmodule'. - data_layout (str): Data layout. - - """ - super().__init__() - FullyConnectedModule.global_count += 1 - self.instance = 0 - self.size = size - self.bias = bias - self.name = (name - if name - else 'fcmodule{0}'.format(FullyConnectedModule.global_count)) - self.data_layout = data_layout - - # Initialize weights - # Note: If weights are not provided, matrix weights are - # initialized with He normal scheme and bias weights are - # initialized with zeros. - self.weights = list(make_iterable(weights)) - if len(self.weights) > 2: - raise ValueError('`FullyConnectedModule` has ' - 'at most two weights, ' - 'but got {0}'.format(len(self.weights))) - if len(self.weights) == 0: - self.weights.append( - lbann.Weights(initializer=lbann.HeNormalInitializer(), - name=self.name+'_matrix')) - if len(self.weights) == 1: - self.weights.append( - lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0), - name=self.name+'_bias')) - - # Initialize activation layer - self.activation = None - if activation: - if isinstance(activation, type): - self.activation = activation - else: - self.activation = type(activation) - if not issubclass(self.activation, lbann.Layer): - raise ValueError('activation must be a layer') - - def forward(self, x): - self.instance += 1 - name = '{0}_instance{1}'.format(self.name, self.instance) - y = lbann.FullyConnected(x, - weights=self.weights, - name=(name+'_fc' if self.activation else name), - data_layout=self.data_layout, - num_neurons=self.size, - has_bias=self.bias) - if self.activation: - return self.activation(y, - name=name+'_activation', - data_layout=self.data_layout) - else: - return y - -class ConvolutionModule(Module): - """Basic block for convolutional neural networks. - - Applies a convolution and a nonlinear activation function. - - """ - - global_count = 0 # Static counter, used for default names - - def __init__(self, num_dims, - out_channels, kernel_size, - stride=1, padding=0, dilation=1, groups=1, bias=True, - weights=[], activation=None, name=None): - """Initialize convolution module. - - Args: - num_dims (int): Number of dimensions. - out_channels (int): Number of output channels, i.e. number - of filters. - kernel_size (int): Size of convolution kernel. - stride (int): Convolution stride. - padding (int): Convolution padding. - dilation (int): Convolution dilation. - groups (int): Number of convolution groups. - bias (bool): Whether to apply channel-wise bias after - convolution. - weights (`Weights` or iterator of `Weights`): Weights in - convolution layer. There are at most two: the kernel - and the bias. If weights are not provided, the kernel - will be initialized with He normal initialization and - the bias with zeros. - name (str): Default name is in the form 'convmodule'. 
- - """ - super().__init__() - ConvolutionModule.global_count += 1 - self.instance = 0 - self.num_dims = num_dims - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - self.dilation = dilation - self.groups = groups - self.bias = bias - self.weights = list(make_iterable(weights)) - self.name = (name - if name - else 'convmodule{0}'.format(ConvolutionModule.global_count)) - - # Initialize weights - # Note: If weights are not provided, kernel weights are - # initialized with He normal scheme and bias weights are - # initialized with zeros. - self.weights = list(make_iterable(weights)) - if len(self.weights) > 2: - raise ValueError('`ConvolutionModule` has ' - 'at most two weights, ' - 'but got {0}'.format(len(self.weights))) - if len(self.weights) == 0: - self.weights.append( - lbann.Weights(initializer=lbann.HeNormalInitializer(), - name=self.name+'_kernel')) - if len(self.weights) == 1: - self.weights.append( - lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0), - name=self.name+'_bias')) - - # Initialize activation layer - self.activation = None - if activation: - if isinstance(activation, type): - self.activation = activation - else: - self.activation = type(activation) - if not issubclass(self.activation, lbann.Layer): - raise ValueError('activation must be a layer') - - def forward(self, x): - self.instance += 1 - name = '{0}_instance{1}'.format(self.name, self.instance) - y = lbann.Convolution(x, - weights=self.weights, - name=(name+'_conv' if self.activation else name), - num_dims=self.num_dims, - num_output_channels=self.out_channels, - has_vectors=False, - conv_dims_i=self.kernel_size, - conv_pads_i=self.padding, - conv_strides_i=self.stride, - conv_dilations_i=self.dilation, - num_groups=self.groups, - has_bias=self.bias) - if self.activation: - return self.activation(y, name=name+'_activation') - else: - return y - -class Convolution2dModule(ConvolutionModule): - """Basic block for 2D convolutional neural networks. - - Applies a convolution and a nonlinear activation function. - This is a wrapper class for ConvolutionModule. - """ - - def __init__(self, *args, **kwargs): - super().__init__(2, *args, **kwargs) - -class Convolution3dModule(ConvolutionModule): - """Basic block for 3D convolutional neural networks. - - Applies a convolution and a nonlinear activation function. - This is a wrapper class for ConvolutionModule. - """ - - def __init__(self, *args, **kwargs): - super().__init__(3, *args, **kwargs) - -class LSTMCell(Module): - """Long short-term memory cell.""" - - global_count = 0 # Static counter, used for default names - - def __init__(self, size, bias = True, - weights=[], name=None, data_layout='data_parallel'): - """Initialize LSTM cell. - - Args: - size (int): Size of output tensor. - bias (bool): Whether to apply biases after linearity. - weights (`Weights` or iterator of `Weights`): Weights in - fully-connected layer. There are at most two - a - matrix ((4*size) x (input_size+size) dimensions) and a - bias (4*size entries). If weights are not provided, - the matrix and bias will be initialized in a similar - manner as PyTorch (uniform random values from - [-1/sqrt(size), 1/sqrt(size)]). - name (str): Default name is in the form 'lstmcell'. - data_layout (str): Data layout. 
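A usage sketch for stepping the cell over a sequence (assuming `xs` is a list of per-step input layers and `h0`/`c0` are zero-valued layers with `size` entries; these names are illustrative, not defined above):

    cell = LSTMCell(size=128)
    state = (h0, c0)
    outputs = []
    for x_t in xs:
        y_t, state = cell(x_t, state)   # weights are shared across steps
        outputs.append(y_t)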
- - """ - super().__init__() - LSTMCell.global_count += 1 - self.step = 0 - self.size = size - self.name = (name - if name - else 'lstmcell{0}'.format(LSTMCell.global_count)) - self.data_layout = data_layout - - # Weights - self.weights = list(make_iterable(weights)) - if len(self.weights) > 2: - raise ValueError('`LSTMCell` has at most two weights, ' - 'but got {0}'.format(len(self.weights))) - if len(self.weights) == 0: - self.weights.append( - lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), - max=1/sqrt(self.size)), - name=self.name+'_matrix')) - if len(self.weights) == 1: - self.weights.append( - lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), - max=1/sqrt(self.size)), - name=self.name+'_bias')) - - # Linearity - self.fc = FullyConnectedModule(4*size, bias=bias, - weights=self.weights, - name=self.name + '_fc', - data_layout=self.data_layout) - - def forward(self, x, prev_state): - """Apply LSTM step. - - Args: - x (Layer): Input. - prev_state (tuple with two `Layer`s): State from previous - LSTM step. Comprised of LSTM output and cell state. - - Returns: - (Layer, (Layer, Layer)): The output and state (the output - and cell state). The state can be passed directly into - the next LSTM step. - - """ - self.step += 1 - name = '{0}_step{1}'.format(self.name, self.step) - - # Get output and cell state from previous step - prev_output, prev_cell = prev_state - - # Apply linearity - input_concat = lbann.Concatenation(x, prev_output, - name=name + '_input', - data_layout=self.data_layout) - fc = self.fc(input_concat) - - # Get gates and cell update - slice = lbann.Slice(fc, - slice_points=_str_list([0, self.size, 4*self.size]), - name=name + '_fc_slice', - data_layout=self.data_layout) - cell_update = lbann.Tanh(slice, - name=name + '_cell_update', - data_layout=self.data_layout) - sigmoid = lbann.Sigmoid(slice, - name=name + '_sigmoid', - data_layout=self.data_layout) - slice = lbann.Slice(sigmoid, - slice_points=_str_list([0, self.size, 2*self.size, 3*self.size]), - name=name + '_sigmoid_slice', - data_layout=self.data_layout) - f = lbann.Identity(slice, name=name + '_forget_gate', - data_layout=self.data_layout) - i = lbann.Identity(slice, name=name + '_input_gate', - data_layout=self.data_layout) - o = lbann.Identity(slice, name=name + '_output_gate', - data_layout=self.data_layout) - - # Cell state - cell_forget = lbann.Multiply(f, prev_cell, - name=name + '_cell_forget', - data_layout=self.data_layout) - cell_input = lbann.Multiply(i, cell_update, - name=name + '_cell_input', - data_layout=self.data_layout) - cell = lbann.Add(cell_forget, cell_input, name=name + '_cell', - data_layout=self.data_layout) - - # Output - cell_act = lbann.Tanh(cell, name=name + '_cell_activation', - data_layout=self.data_layout) - output = lbann.Multiply(o, cell_act, name=name, - data_layout=self.data_layout) - - # Return output and state - return output, (output, cell) - -class GRU(Module): - """Gated-recurrent unit. - Implementation mostly taken from: - https://pytorch.org/docs/stable/nn.html#gru""" - - global_count = 0 # Static counter, used for default names - - def __init__(self, size, bias = True, - weights=[], name=None, data_layout='data_parallel'): - """Initialize GRU cell. - - Args: - size (int): Size of output tensor. - bias (bool): Whether to apply biases after linearity. - weights (`Weights` or iterator of `Weights`): Weights in - fully-connected layer. 
There are at most four - two - matrices ((3*size) x (input_size) and (3*size) x (size) dimensions) each and two - biases (3*size entries) each. If weights are not provided, - the matrix and bias will be initialized in a similar - manner as PyTorch (uniform random values from - [-1/sqrt(size), 1/sqrt(size)]). - name (str): Default name is in the form 'gru'. - data_layout (str): Data layout. - - """ - super().__init__() - GRU.global_count += 1 - self.step = 0 - self.size = size - self.name = (name - if name - else 'gru{0}'.format(GRU.global_count)) - self.data_layout = data_layout - - # Weights - self.weights = list(make_iterable(weights)) - if len(self.weights) > 4: - raise ValueError('`GRU` has at most 4 weights, ' - 'but got {0}'.format(len(self.weights))) - ##@todo: use loop - if len(self.weights) == 0: - self.weights.append( - lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), - max=1/sqrt(self.size)), - name=self.name+'_ih_matrix')) - if len(self.weights) == 1: - self.weights.append( - lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), - max=1/sqrt(self.size)), - name=self.name+'_ih_bias')) - if len(self.weights) == 2: - self.weights.append( - lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), - max=1/sqrt(self.size)), - name=self.name+'_hh_matrix')) - if len(self.weights) == 3: - self.weights.append( - lbann.Weights(initializer=lbann.UniformInitializer(min=-1/sqrt(self.size), - max=1/sqrt(self.size)), - name=self.name+'_hh_bias')) - - # Linearity - ####Learnable input-hidden weights - self.ih_fc = lbann.modules.FullyConnectedModule(3*size, bias=bias, - weights=self.weights[:2], - name=self.name + '_ih_fc', - data_layout=self.data_layout) - ###Learnable hidden-hidden weights - self.hh_fc = lbann.modules.FullyConnectedModule(3*size, bias=bias, - weights=self.weights[2:], - name=self.name + '_hh_fc', - data_layout=self.data_layout) - - def forward(self, x, prev_state): - """Apply GRU step. - - Args: - x (Layer): Input. - prev_state: State from previous GRU step. - - Returns: - (Layer, Layer): The output (out) and state (hn). - The state can be passed directly into - the next GRU step. 
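A minimal unrolling sketch (assuming `xs` is a list of per-step input layers and `h0` is a zero-valued layer with `size` entries; both are placeholder names):

    gru = GRU(size=256)
    h = h0
    for x_t in xs:
        y_t, h = gru(x_t, h)   # the output and the next state coincide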
- - """ - self.step += 1 - name = '{0}_step{1}'.format(self.name, self.step) - - - fc1 = self.ih_fc(x) #input_fc - fc2 = self.hh_fc(prev_state) #hidden_fc - - - # Get gates and cell update - fc1_slice = lbann.Slice(fc1, - slice_points=_str_list([0, self.size, 2*self.size, 3*self.size]), - name=name + '_fc1_slice', - data_layout=self.data_layout) - Wir_x = lbann.Identity(fc1_slice, name=name + '_Wrx', - data_layout=self.data_layout) - Wiz_x = lbann.Identity(fc1_slice, name=name + '_Wzx', - data_layout=self.data_layout) - Win_x = lbann.Identity(fc1_slice, name=name + '_Wnx', - data_layout=self.data_layout) - - fc2_slice = lbann.Slice(fc2, - slice_points=_str_list([0, self.size, 2*self.size, 3*self.size]), - name=name + '_fc2_slice', - data_layout=self.data_layout) - Whr_prev = lbann.Identity(fc2_slice, name=name + '_Wrh', - data_layout=self.data_layout) - Whz_prev = lbann.Identity(fc2_slice, name=name + '_Wzh', - data_layout=self.data_layout) - Whn_prev = lbann.Identity(fc2_slice, name=name + '_Wnh', - data_layout=self.data_layout) - - rt = \ - lbann.Sigmoid( - lbann.Add(Wir_x, Whr_prev, data_layout=self.data_layout), - name=name + '_reset_gate', - data_layout=self.data_layout - ) - - zt = \ - lbann.Sigmoid( - lbann.Add(Wiz_x, Whz_prev, data_layout=self.data_layout), - name=name + '_update_gate', - data_layout=self.data_layout, - ) - - nt = \ - lbann.Tanh( - lbann.Add( - Win_x, - lbann.Multiply(rt, Whn_prev, data_layout=self.data_layout), - data_layout=self.data_layout, - ), - name=name + '_new_gate', data_layout=self.data_layout, - ) - - ht = \ - lbann.Add( - lbann.Multiply( - lbann.WeightedSum( - lbann.Constant(value=1.0, hint_layer=zt, data_layout=self.data_layout), - zt, - scaling_factors='1 -1', data_layout=self.data_layout - ), - nt, - data_layout=self.data_layout - ), - lbann.Multiply(zt, prev_state, data_layout=self.data_layout), - name=name+ '_output', data_layout=self.data_layout, - ) - - # Return output - return ht, ht diff --git a/python/lbann/modules/__init__.py b/python/lbann/modules/__init__.py new file mode 100644 index 00000000000..183f965c6a7 --- /dev/null +++ b/python/lbann/modules/__init__.py @@ -0,0 +1,10 @@ +"""Neural network modules. + +These are a convenience for common layer patterns that are often the +basic building blocks for larger models. + +""" + +# Import from submodules +from lbann.modules.base import Module, FullyConnectedModule, ConvolutionModule, Convolution2dModule, Convolution3dModule +from lbann.modules.rnn import LSTMCell, GRU diff --git a/python/lbann/modules/base.py b/python/lbann/modules/base.py new file mode 100644 index 00000000000..76ef2989af7 --- /dev/null +++ b/python/lbann/modules/base.py @@ -0,0 +1,233 @@ +"""Base class for neural network modules. + +This also contains modules for fully-connected and convolution layers. + +""" +import abc +import lbann +from lbann.util import make_iterable + +class Module(abc.ABC): + """Base class for neural network modules. + + A module is a pattern of layers that can be added to a layer + graph, possibly multiple times. The pattern typically takes a set + of input layers and obtains a set of output layers. + + """ + + def forward(self, *args, **kwargs): + """Apply module pattern. + + A module pattern typically takes a set of `Layer`s as input + and returns a set of `Layer`s. + + """ + # Should be overridden in all sub-classes + raise NotImplementedError + + def __call__(self, *args, **kwargs): + """Apply module mattern to `input`. + + Syntatic sugar around `forward` function. 
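For illustration, a small module built on this pattern might look as follows (a sketch only; `MLP` is a hypothetical subclass, and `lbann` must be importable):

    import lbann
    from lbann.modules import Module, FullyConnectedModule

    class MLP(Module):
        # Two-layer perceptron; submodules own their weights and share
        # them across repeated calls.
        def __init__(self, hidden_size, output_size):
            super().__init__()
            self.fc1 = FullyConnectedModule(hidden_size,
                                            activation=lbann.Relu)
            self.fc2 = FullyConnectedModule(output_size)
        def forward(self, x):
            return self.fc2(self.fc1(x))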
+ + """ + return self.forward(*args, **kwargs) + +class FullyConnectedModule(Module): + """Basic block for fully-connected neural networks. + + Applies a dense linearity and a nonlinear activation function. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, size, bias=True, weights=[], activation=None, + name=None, data_layout='data_parallel'): + """Initialize fully-connected module. + + Args: + size (int): Size of output tensor. + activation (type): Layer class for activation function. + bias (bool): Whether to apply bias after linearity. + weights (`Weights` or iterator of `Weights`): Weights in + fully-connected layer. There are at most two: the + matrix and the bias. If weights are not provided, the + matrix will be initialized with He normal + initialization and the bias with zeros. + name (str): Default name is in the form 'fcmodule'. + data_layout (str): Data layout. + + """ + super().__init__() + FullyConnectedModule.global_count += 1 + self.instance = 0 + self.size = size + self.bias = bias + self.name = (name + if name + else 'fcmodule{0}'.format(FullyConnectedModule.global_count)) + self.data_layout = data_layout + + # Initialize weights + # Note: If weights are not provided, matrix weights are + # initialized with He normal scheme and bias weights are + # initialized with zeros. + self.weights = list(make_iterable(weights)) + if len(self.weights) > 2: + raise ValueError('`FullyConnectedModule` has ' + 'at most two weights, ' + 'but got {0}'.format(len(self.weights))) + if len(self.weights) == 0: + self.weights.append( + lbann.Weights(initializer=lbann.HeNormalInitializer(), + name=self.name+'_matrix')) + if len(self.weights) == 1: + self.weights.append( + lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0), + name=self.name+'_bias')) + + # Initialize activation layer + self.activation = None + if activation: + if isinstance(activation, type): + self.activation = activation + else: + self.activation = type(activation) + if not issubclass(self.activation, lbann.Layer): + raise ValueError('activation must be a layer') + + def forward(self, x): + self.instance += 1 + name = '{0}_instance{1}'.format(self.name, self.instance) + y = lbann.FullyConnected(x, + weights=self.weights, + name=(name+'_fc' if self.activation else name), + data_layout=self.data_layout, + num_neurons=self.size, + has_bias=self.bias) + if self.activation: + return self.activation(y, + name=name+'_activation', + data_layout=self.data_layout) + else: + return y + +class ConvolutionModule(Module): + """Basic block for convolutional neural networks. + + Applies a convolution and a nonlinear activation function. + + """ + + global_count = 0 # Static counter, used for default names + + def __init__(self, num_dims, + out_channels, kernel_size, + stride=1, padding=0, dilation=1, groups=1, bias=True, + weights=[], activation=None, name=None): + """Initialize convolution module. + + Args: + num_dims (int): Number of dimensions. + out_channels (int): Number of output channels, i.e. number + of filters. + kernel_size (int): Size of convolution kernel. + stride (int): Convolution stride. + padding (int): Convolution padding. + dilation (int): Convolution dilation. + groups (int): Number of convolution groups. + bias (bool): Whether to apply channel-wise bias after + convolution. + weights (`Weights` or iterator of `Weights`): Weights in + convolution layer. There are at most two: the kernel + and the bias. 
If weights are not provided, the kernel + will be initialized with He normal initialization and + the bias with zeros. + name (str): Default name is in the form 'convmodule'. + + """ + super().__init__() + ConvolutionModule.global_count += 1 + self.instance = 0 + self.num_dims = num_dims + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.bias = bias + self.weights = list(make_iterable(weights)) + self.name = (name + if name + else 'convmodule{0}'.format(ConvolutionModule.global_count)) + + # Initialize weights + # Note: If weights are not provided, kernel weights are + # initialized with He normal scheme and bias weights are + # initialized with zeros. + self.weights = list(make_iterable(weights)) + if len(self.weights) > 2: + raise ValueError('`ConvolutionModule` has ' + 'at most two weights, ' + 'but got {0}'.format(len(self.weights))) + if len(self.weights) == 0: + self.weights.append( + lbann.Weights(initializer=lbann.HeNormalInitializer(), + name=self.name+'_kernel')) + if len(self.weights) == 1: + self.weights.append( + lbann.Weights(initializer=lbann.ConstantInitializer(value=0.0), + name=self.name+'_bias')) + + # Initialize activation layer + self.activation = None + if activation: + if isinstance(activation, type): + self.activation = activation + else: + self.activation = type(activation) + if not issubclass(self.activation, lbann.Layer): + raise ValueError('activation must be a layer') + + def forward(self, x): + self.instance += 1 + name = '{0}_instance{1}'.format(self.name, self.instance) + y = lbann.Convolution(x, + weights=self.weights, + name=(name+'_conv' if self.activation else name), + num_dims=self.num_dims, + num_output_channels=self.out_channels, + has_vectors=False, + conv_dims_i=self.kernel_size, + conv_pads_i=self.padding, + conv_strides_i=self.stride, + conv_dilations_i=self.dilation, + num_groups=self.groups, + has_bias=self.bias) + if self.activation: + return self.activation(y, name=name+'_activation') + else: + return y + +class Convolution2dModule(ConvolutionModule): + """Basic block for 2D convolutional neural networks. + + Applies a convolution and a nonlinear activation function. + This is a wrapper class for ConvolutionModule. + """ + + def __init__(self, *args, **kwargs): + super().__init__(2, *args, **kwargs) + +class Convolution3dModule(ConvolutionModule): + """Basic block for 3D convolutional neural networks. + + Applies a convolution and a nonlinear activation function. + This is a wrapper class for ConvolutionModule. + """ + + def __init__(self, *args, **kwargs): + super().__init__(3, *args, **kwargs) diff --git a/python/lbann/modules/rnn.py b/python/lbann/modules/rnn.py new file mode 100644 index 00000000000..4d3ee1a5982 --- /dev/null +++ b/python/lbann/modules/rnn.py @@ -0,0 +1,300 @@ +"""Neural network modules for recurrent models.""" + +import math +import lbann +from .base import Module, FullyConnectedModule +from lbann.util import make_iterable, str_list + +class LSTMCell(Module): + """Long short-term memory cell.""" + + global_count = 0 # Static counter, used for default names + + def __init__(self, size, bias = True, + weights=[], name=None, data_layout='data_parallel'): + """Initialize LSTM cell. + + Args: + size (int): Size of output tensor. + bias (bool): Whether to apply biases after linearity. + weights (`Weights` or iterator of `Weights`): Weights in + fully-connected layer. 
There are at most two - a + matrix ((4*size) x (input_size+size) dimensions) and a + bias (4*size entries). If weights are not provided, + the matrix and bias will be initialized in a similar + manner as PyTorch (uniform random values from + [-1/sqrt(size), 1/sqrt(size)]). + name (str): Default name is in the form 'lstmcell'. + data_layout (str): Data layout. + + """ + super().__init__() + LSTMCell.global_count += 1 + self.step = 0 + self.size = size + self.name = (name + if name + else 'lstmcell{0}'.format(LSTMCell.global_count)) + self.data_layout = data_layout + + # Weights + self.weights = list(make_iterable(weights)) + if len(self.weights) > 2: + raise ValueError('`LSTMCell` has at most two weights, ' + 'but got {0}'.format(len(self.weights))) + scale = 1 / math.sqrt(self.size) + if len(self.weights) == 0: + self.weights.append( + lbann.Weights(initializer=lbann.UniformInitializer(min=-scale, + max=scale), + name=self.name+'_matrix') + ) + if len(self.weights) == 1: + self.weights.append( + lbann.Weights(initializer=lbann.UniformInitializer(min=-scale, + max=scale), + name=self.name+'_bias') + ) + + # Linearity + self.fc = FullyConnectedModule( + 4*size, bias=bias, + weights=self.weights, + name=self.name + '_fc', + data_layout=self.data_layout + ) + + def forward(self, x, prev_state): + """Apply LSTM step. + + Args: + x (Layer): Input. + prev_state (tuple with two `Layer`s): State from previous + LSTM step. Comprised of LSTM output and cell state. + + Returns: + (Layer, (Layer, Layer)): The output and state (the output + and cell state). The state can be passed directly into + the next LSTM step. + + """ + self.step += 1 + name = '{0}_step{1}'.format(self.name, self.step) + + # Get output and cell state from previous step + prev_output, prev_cell = prev_state + + # Apply linearity + input_concat = lbann.Concatenation(x, prev_output, + name=name + '_input', + data_layout=self.data_layout) + fc = self.fc(input_concat) + + # Get gates and cell update + slice = lbann.Slice(fc, + slice_points=str_list([0, self.size, 4*self.size]), + name=name + '_fc_slice', + data_layout=self.data_layout) + cell_update = lbann.Tanh(slice, + name=name + '_cell_update', + data_layout=self.data_layout) + sigmoid = lbann.Sigmoid(slice, + name=name + '_sigmoid', + data_layout=self.data_layout) + slice = lbann.Slice(sigmoid, + slice_points=str_list([0, self.size, 2*self.size, 3*self.size]), + name=name + '_sigmoid_slice', + data_layout=self.data_layout) + f = lbann.Identity(slice, name=name + '_forget_gate', + data_layout=self.data_layout) + i = lbann.Identity(slice, name=name + '_input_gate', + data_layout=self.data_layout) + o = lbann.Identity(slice, name=name + '_output_gate', + data_layout=self.data_layout) + + # Cell state + cell_forget = lbann.Multiply(f, prev_cell, + name=name + '_cell_forget', + data_layout=self.data_layout) + cell_input = lbann.Multiply(i, cell_update, + name=name + '_cell_input', + data_layout=self.data_layout) + cell = lbann.Add(cell_forget, cell_input, name=name + '_cell', + data_layout=self.data_layout) + + # Output + cell_act = lbann.Tanh(cell, name=name + '_cell_activation', + data_layout=self.data_layout) + output = lbann.Multiply(o, cell_act, name=name, + data_layout=self.data_layout) + + # Return output and state + return output, (output, cell) + +class GRU(Module): + """Gated-recurrent unit. 
+ Implementation mostly taken from: + https://pytorch.org/docs/stable/nn.html#gru""" + + global_count = 0 # Static counter, used for default names + + def __init__(self, size, bias = True, + weights=[], name=None, data_layout='data_parallel'): + """Initialize GRU cell. + + Args: + size (int): Size of output tensor. + bias (bool): Whether to apply biases after linearity. + weights (`Weights` or iterator of `Weights`): Weights in + fully-connected layer. There are at most four - two + matrices ((3*size) x (input_size) and (3*size) x (size) dimensions) each and two + biases (3*size entries) each. If weights are not provided, + the matrix and bias will be initialized in a similar + manner as PyTorch (uniform random values from + [-1/sqrt(size), 1/sqrt(size)]). + name (str): Default name is in the form 'gru'. + data_layout (str): Data layout. + + """ + super().__init__() + GRU.global_count += 1 + self.step = 0 + self.size = size + self.name = (name + if name + else 'gru{0}'.format(GRU.global_count)) + self.data_layout = data_layout + + # Weights + self.weights = list(make_iterable(weights)) + if len(self.weights) > 4: + raise ValueError('`GRU` has at most 4 weights, ' + 'but got {0}'.format(len(self.weights))) + ##@todo: use loop + scale = 1 / math.sqrt(self.size) + if len(self.weights) == 0: + self.weights.append( + lbann.Weights(initializer=lbann.UniformInitializer(min=-scale, + max=scale), + name=self.name+'_ih_matrix') + ) + if len(self.weights) == 1: + self.weights.append( + lbann.Weights(initializer=lbann.UniformInitializer(min=-scale, + max=scale), + name=self.name+'_ih_bias') + ) + if len(self.weights) == 2: + self.weights.append( + lbann.Weights(initializer=lbann.UniformInitializer(min=-scale, + max=scale), + name=self.name+'_hh_matrix') + ) + if len(self.weights) == 3: + self.weights.append( + lbann.Weights(initializer=lbann.UniformInitializer(min=-scale, + max=scale), + name=self.name+'_hh_bias') + ) + + # Linearity + ####Learnable input-hidden weights + self.ih_fc = FullyConnectedModule( + 3*size, bias=bias, + weights=self.weights[:2], + name=self.name + '_ih_fc', + data_layout=self.data_layout + ) + ###Learnable hidden-hidden weights + self.hh_fc = FullyConnectedModule( + 3*size, bias=bias, + weights=self.weights[2:], + name=self.name + '_hh_fc', + data_layout=self.data_layout + ) + + def forward(self, x, prev_state): + """Apply GRU step. + + Args: + x (Layer): Input. + prev_state: State from previous GRU step. + + Returns: + (Layer, Layer): The output (out) and state (hn). + The state can be passed directly into + the next GRU step. 
+ + """ + self.step += 1 + name = '{0}_step{1}'.format(self.name, self.step) + + + fc1 = self.ih_fc(x) #input_fc + fc2 = self.hh_fc(prev_state) #hidden_fc + + + # Get gates and cell update + fc1_slice = lbann.Slice(fc1, + slice_points=str_list([0, self.size, 2*self.size, 3*self.size]), + name=name + '_fc1_slice', + data_layout=self.data_layout) + Wir_x = lbann.Identity(fc1_slice, name=name + '_Wrx', + data_layout=self.data_layout) + Wiz_x = lbann.Identity(fc1_slice, name=name + '_Wzx', + data_layout=self.data_layout) + Win_x = lbann.Identity(fc1_slice, name=name + '_Wnx', + data_layout=self.data_layout) + + fc2_slice = lbann.Slice(fc2, + slice_points=str_list([0, self.size, 2*self.size, 3*self.size]), + name=name + '_fc2_slice', + data_layout=self.data_layout) + Whr_prev = lbann.Identity(fc2_slice, name=name + '_Wrh', + data_layout=self.data_layout) + Whz_prev = lbann.Identity(fc2_slice, name=name + '_Wzh', + data_layout=self.data_layout) + Whn_prev = lbann.Identity(fc2_slice, name=name + '_Wnh', + data_layout=self.data_layout) + + rt = \ + lbann.Sigmoid( + lbann.Add(Wir_x, Whr_prev, data_layout=self.data_layout), + name=name + '_reset_gate', + data_layout=self.data_layout + ) + + zt = \ + lbann.Sigmoid( + lbann.Add(Wiz_x, Whz_prev, data_layout=self.data_layout), + name=name + '_update_gate', + data_layout=self.data_layout, + ) + + nt = \ + lbann.Tanh( + lbann.Add( + Win_x, + lbann.Multiply(rt, Whn_prev, data_layout=self.data_layout), + data_layout=self.data_layout, + ), + name=name + '_new_gate', data_layout=self.data_layout, + ) + + ht = \ + lbann.Add( + lbann.Multiply( + lbann.WeightedSum( + lbann.Constant(value=1.0, hint_layer=zt, data_layout=self.data_layout), + zt, + scaling_factors='1 -1', data_layout=self.data_layout + ), + nt, + data_layout=self.data_layout + ), + lbann.Multiply(zt, prev_state, data_layout=self.data_layout), + name=name+ '_output', data_layout=self.data_layout, + ) + + # Return output + return ht, ht diff --git a/python/lbann/util/__init__.py b/python/lbann/util/__init__.py index 71dda82829f..1f0af211f0c 100644 --- a/python/lbann/util/__init__.py +++ b/python/lbann/util/__init__.py @@ -11,3 +11,7 @@ def make_iterable(obj): return obj else: return (obj,) + +def str_list(it, sep=' '): + """Convert an iterable object to a string.""" + return sep.join(str(i) for i in make_iterable(it)) From 9f118b826cb137b47095e39f0c4aaecf43c42c1b Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Wed, 20 Nov 2019 16:28:04 -0700 Subject: [PATCH 405/634] Feature improve spack build env (#1284) * Updated and simplified the spack developers installation. * Updated the spack environment to reflect the new spack architecture names that are more architecture specific. 
* Adding new requirements: catch2 and clara
---
 docs/building_lbann.rst                       |  4 +-
 ...yaml => developer_release_cuda_spack.yaml} | 14 ++--
 .../developer_release_ppc64le_cuda_spack.yaml | 53 --------------
 .../externals_power9le_llnl_lc_cz.yaml        | 69 +++++++++++++++++++
 .../externals_ppc64le_llnl_lc_cz.yaml         | 68 ------------------
 ...xternals_x86_64_broadwell_llnl_lc_cz.yaml} | 20 +++---
 .../std_versions_and_variants_llnl_lc_cz.yaml | 16 +++--
 7 files changed, 103 insertions(+), 141 deletions(-)
 rename spack_environments/{developer_release_x86_64_cuda_spack.yaml => developer_release_cuda_spack.yaml} (75%)
 delete mode 100644 spack_environments/developer_release_ppc64le_cuda_spack.yaml
 create mode 100644 spack_environments/externals_power9le_llnl_lc_cz.yaml
 delete mode 100644 spack_environments/externals_ppc64le_llnl_lc_cz.yaml
 rename spack_environments/{externals_x86_64_llnl_lc_cz.yaml => externals_x86_64_broadwell_llnl_lc_cz.yaml} (52%)

diff --git a/docs/building_lbann.rst b/docs/building_lbann.rst
index 9866d31b94e..ba84db1b7fa 100644
--- a/docs/building_lbann.rst
+++ b/docs/building_lbann.rst
@@ -145,9 +145,9 @@ Hydrogen, and LBANN separately, by whatever means they choose.
    export LBANN_BUILD_DIR=/path/to/a/build/directory
    export LBANN_INSTALL_DIR=/path/to/an/install/directory
    cd ${LBANN_BUILD_DIR}
-   spack env create -d . ${LBANN_HOME}/spack_environments/developer_release_<arch>_cuda_spack.yaml # where <arch> = x86_64 | ppc64le
+   spack env create -d . ${LBANN_HOME}/spack_environments/developer_release_cuda_spack.yaml
    cp ${LBANN_HOME}/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml .
-   cp ${LBANN_HOME}/spack_environments/externals_<arch>_llnl_lc_cz.yaml . # where <arch> = x86_64 | ppc64le
+   cp ${LBANN_HOME}/spack_environments/externals_<arch>_llnl_lc_cz.yaml externals_llnl_lc_cz.yaml # where <arch> = x86_64_broadwell | power9le
    spack install
    spack env loads # Spack creates a file named loads that has all of the correct modules
    source ${SPACK_ROOT}/share/spack/setup-env.sh # Rerun setup since spack doesn't modify MODULEPATH unless there are module files defined
diff --git a/spack_environments/developer_release_x86_64_cuda_spack.yaml b/spack_environments/developer_release_cuda_spack.yaml
similarity index 75%
rename from spack_environments/developer_release_x86_64_cuda_spack.yaml
rename to spack_environments/developer_release_cuda_spack.yaml
index 801ff296d06..b604dad425c 100644
--- a/spack_environments/developer_release_x86_64_cuda_spack.yaml
+++ b/spack_environments/developer_release_cuda_spack.yaml
@@ -14,16 +14,19 @@
 ################################################################################

 spack:
+  concretization: together
   # add package specs to the `specs` list
   specs:
-  - conduit@master~doc~doxygen+hdf5~hdf5_compat+mpi+python+shared~silo
-  - cnpy@master build_type=RelWithDebInfo
-  - opencv build_type=RelWithDebInfo ~calib3d+core~cuda~dnn~eigen+fast-math~features2d~flann~gtk+highgui+imgproc~ipp~ipp_iw~jasper~java+jpeg~lapack~ml~opencl~opencl_svm~openclamdblas~openclamdfft~openmp+png~powerpc~pthreads_pf~python~qt+shared~stitching~superres+tiff~ts~video~videoio~videostab~vsx~vtk+zlib
+  - catch2
+  - clara
+  - conduit
+  - cnpy
+  - opencv
   - cereal
   - ninja
   - zlib
   - cmake
-  - cudnn@7.5.1-10.0-x86_64
+  - cudnn
   - cub
   - nccl
   - hwloc
   - py-argparse
   - py-configparser
   - py-cython
   - py-graphviz
   - py-matplotlib
   - py-onnx
   - py-pandas
   - protobuf
   - py-protobuf+cpp
+  - py-pytest
   - py-setuptools
   - py-texttable
  mirrors: {}
  modules:
    enable: []
  repos: []
  config: {}
################################################################################
# Include paths to standard compilers and packages on LLNL LC systems
# Remove and/or replace these with your site specific packages and paths
# Note that the include files are expected to be local to this yaml file
################################################################################
 include:
 - std_versions_and_variants_llnl_lc_cz.yaml
 - 
externals_x86_64_llnl_lc_cz.yaml + - externals_llnl_lc_cz.yaml diff --git a/spack_environments/developer_release_ppc64le_cuda_spack.yaml b/spack_environments/developer_release_ppc64le_cuda_spack.yaml deleted file mode 100644 index 3462e2eb90c..00000000000 --- a/spack_environments/developer_release_ppc64le_cuda_spack.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# This is a Spack Environment file. -# -# It describes a set of packages to be installed, along with -# configuration settings. - -################################################################################ -# Notes on building a environment file: -# 1) Packages that should be explicitly installed to satisfy dependencies -# for LBANN, Hydrogen, or Aluminum are in the specs list. -# 2) Packages and their variants that are secondary dependencies of -# the primary dependencies should be in the packages list. This -# ensures that all primary dependenies build with a consistent set -# of secondary (and tertiary, ...) dependencies. -################################################################################ - -spack: - # add package specs to the `specs` list - specs: - - conduit@master~doc~doxygen+hdf5~hdf5_compat+mpi+python+shared~silo - - cnpy@master build_type=RelWithDebInfo - - opencv build_type=RelWithDebInfo ~calib3d+core~cuda~dnn~eigen+fast-math~features2d~flann~gtk+highgui+imgproc~ipp~ipp_iw~jasper~java+jpeg~lapack~ml~opencl~opencl_svm~openclamdblas~openclamdfft~openmp+png+powerpc~pthreads_pf~python~qt+shared~stitching~superres+tiff~ts~video~videoio~videostab+vsx~vtk+zlib - - cereal - - ninja - - zlib - - cmake - - cudnn@7.5.1-10.1-ppc64le - - cub - - nccl - - hwloc - - py-argparse - - py-configparser - - py-cython - - py-graphviz - - py-matplotlib - - py-onnx - - py-pandas - - protobuf - - py-protobuf+cpp - - py-setuptools - - py-texttable - mirrors: {} - modules: - enable: [] - repos: [] - config: {} -################################################################################ -# Include paths to standard compilers and packages on LLNL LC systems -# Remove and/or replace these with your site specific packages and paths -# Note that the include files are expected to be local to this yaml file -################################################################################ - include: - - std_versions_and_variants_llnl_lc_cz.yaml - - externals_ppc64le_llnl_lc_cz.yaml diff --git a/spack_environments/externals_power9le_llnl_lc_cz.yaml b/spack_environments/externals_power9le_llnl_lc_cz.yaml new file mode 100644 index 00000000000..ba05f34c61c --- /dev/null +++ b/spack_environments/externals_power9le_llnl_lc_cz.yaml @@ -0,0 +1,69 @@ + packages: + all: + providers: + mpi: [spectrum-mpi@rolling-release arch=linux-rhel7-power9le] + lapack: [openblas threads=openmp] + blas: [openblas threasd=openmp] + buildable: true + version: [] + paths: {} + modules: {} + compiler: [gcc@7.3.1 arch=linux-rhel7-power9le] + + cmake:: + variants: ~openssl ~ncurses + version: [3.12.1] + paths: + cmake@3.12.1 arch=linux-rhel7-power9le: /usr/tce/packages/cmake/cmake-3.12.1 + + cuda:: + buildable: False + version: [9.2.88, 10.1.105, 10.1.168] + paths: + cuda@9.2.88 arch=linux-rhel7-power9le: /usr/tce/packages/cuda/cuda-9.2.88/ + cuda@10.1.105 arch=linux-rhel7-power9le: /usr/tce/packages/cuda/cuda-10.1.105 + cuda@10.1.168 arch=linux-rhel7-power9le: /usr/tce/packages/cuda/cuda-10.1.168 + + cudnn:: + buildable: true + version: [7.4.2. 
7.5.1, 7.5.1-10.1-power9le, 7.6.3-10.1-power9le] + paths: + cudnn@7.5.1 arch=linux-rhel7-power9le: /usr/workspace/wsb/brain/cudnn/cudnn-7.5.1/cuda-10.1_ppc64le/ + cudnn@7.4.2 arch=linux-rhel7-power9le: /usr/workspace/wsb/brain/cudnn/cudnn-7.4.2/cuda-9.2_ppc64le/ + + hwloc:: + buildable: False + version: [2.0.2] + paths: + hwloc@2.0.2 arch=linux-rhel7-power9le: /usr/lib64/libhwloc.so + + openblas:: + buildable: True + variants: threads=openmp ~avx2 ~avx512 + version: [0.3.6] + + opencv:: + buildable: true + variants: build_type=RelWithDebInfo ~calib3d+core~cuda~dnn~eigen+fast-math~features2d~flann~gtk+highgui+imgproc~ipp~ipp_iw~jasper~java+jpeg~lapack~ml~opencl~opencl_svm~openclamdblas~openclamdfft~openmp+png+powerpc~pthreads_pf~python~qt+shared~stitching~superres+tiff~ts~video~videoio~videostab+vsx~vtk+zlib + version: [4.1.0] + + spectrum-mpi:: + buildable: False + version: [rolling-release] + paths: + spectrum-mpi@rolling-release %gcc@7.3.1 arch=linux-rhel7-power9le: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1 + + compilers: + - compiler: + environment: {} + extra_rpaths: [] + flags: {} + modules: [] + operating_system: rhel7 + paths: + cc: /usr/tce/packages/gcc/gcc-7.3.1/bin/gcc + cxx: /usr/tce/packages/gcc/gcc-7.3.1/bin/g++ + f77: /usr/tce/packages/gcc/gcc-7.3.1/bin/gfortran + fc: /usr/tce/packages/gcc/gcc-7.3.1/bin/gfortran + spec: gcc@7.3.1 + target: power9le diff --git a/spack_environments/externals_ppc64le_llnl_lc_cz.yaml b/spack_environments/externals_ppc64le_llnl_lc_cz.yaml deleted file mode 100644 index c3b297e0062..00000000000 --- a/spack_environments/externals_ppc64le_llnl_lc_cz.yaml +++ /dev/null @@ -1,68 +0,0 @@ - packages: - all: - providers: - mpi: [spectrum-mpi@rolling-release arch=linux-rhel7-ppc64le] - lapack: [openblas threads=openmp] - blas: [openblas threasd=openmp] - buildable: true - version: [] - paths: {} - modules: {} - compiler: [gcc@7.3.1 arch=linux-rhel7-ppc64le] - - cmake:: - variants: ~openssl ~ncurses - version: [3.12.1] - paths: - cmake@3.12.1 arch=linux-rhel7-ppc64le: /usr/tce/packages/cmake/cmake-3.12.1 - - cuda:: - buildable: False - version: [9.2.88, 10.1.105] - paths: - cuda@9.2.88 arch=linux-rhel7-ppc64le: /usr/tce/packages/cuda/cuda-9.2.88/ - cuda@10.1.105 arch=linux-rhel7-ppc64le: /usr/tce/packages/cuda/cuda-10.1.105 - - cudnn:: - buildable: true - version: [7.4.2. 
7.5.1, 7.5.1-10.1-ppc64le] - paths: - cudnn@7.5.1 arch=linux-rhel7-ppc64le: /usr/workspace/wsb/brain/cudnn/cudnn-7.5.1/cuda-10.1_ppc64le/ - cudnn@7.4.2 arch=linux-rhel7-ppc64le: /usr/workspace/wsb/brain/cudnn/cudnn-7.4.2/cuda-9.2_ppc64le - - hwloc:: - buildable: False - version: [2.0.2] - paths: - hwloc@2.0.2 arch=linux-rhel7-ppc64le: /usr/lib64/libhwloc.so - - openblas:: - buildable: True - variants: threads=openmp ~avx2 ~avx512 - version: [0.3.6] - - opencv:: - buildable: true - variants: +powerpc +vsx - version: [4.1.0] - - spectrum-mpi:: - buildable: False - version: [rolling-release] - paths: - spectrum-mpi@rolling-release %gcc@7.3.1 arch=linux-rhel7-ppc64le: /usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1 - - compilers: - - compiler: - environment: {} - extra_rpaths: [] - flags: {} - modules: [] - operating_system: rhel7 - paths: - cc: /usr/tce/packages/gcc/gcc-7.3.1/bin/gcc - cxx: /usr/tce/packages/gcc/gcc-7.3.1/bin/g++ - f77: /usr/tce/packages/gcc/gcc-7.3.1/bin/gfortran - fc: /usr/tce/packages/gcc/gcc-7.3.1/bin/gfortran - spec: gcc@7.3.1 - target: ppc64le diff --git a/spack_environments/externals_x86_64_llnl_lc_cz.yaml b/spack_environments/externals_x86_64_broadwell_llnl_lc_cz.yaml similarity index 52% rename from spack_environments/externals_x86_64_llnl_lc_cz.yaml rename to spack_environments/externals_x86_64_broadwell_llnl_lc_cz.yaml index 00cb62e1bf1..4bb2cc923e8 100644 --- a/spack_environments/externals_x86_64_llnl_lc_cz.yaml +++ b/spack_environments/externals_x86_64_broadwell_llnl_lc_cz.yaml @@ -1,42 +1,43 @@ packages: all: providers: - mpi: [mvapich2@2.3 arch=linux-rhel7-x86_64] + mpi: [mvapich2@2.3 arch=linux-rhel7-broadwell] lapack: [openblas threads=openmp] blas: [openblas threasd=openmp] buildable: true version: [] paths: {} modules: {} - compiler: [gcc@7.3.0 arch=linux-rhel7-x86_64] + compiler: [gcc@7.3.0 arch=linux-rhel7-broadwell] cmake:: variants: ~openssl ~ncurses version: [3.12.1] paths: - cmake@3.12.1 arch=linux-rhel7-x86_64: /usr/tce/packages/cmake/cmake-3.12.1 + cmake@3.12.1 arch=linux-rhel7-broadwell: /usr/tce/packages/cmake/cmake-3.12.1 cuda:: buildable: False - version: [10.0.130] + version: [10.0.130, 10.1.168] paths: - cuda@10.0.130 arch=linux-rhel7-x86_64: /usr/tce/packages/cuda/cuda-10.0.130 + cuda@10.0.130 arch=linux-rhel7-broadwell: /usr/tce/packages/cuda/cuda-10.0.130 + cuda@10.1.168 arch=linux-rhel7-broadwell: /usr/tce/packages/cuda/cuda-10.1.168 cudnn:: buildable: true - version: [7.5.1-10.0-x86_64] + version: [7.6.3-10.1-broadwell] hwloc:: buildable: False version: [2.0.2] paths: - hwloc@2.0.2 arch=linux-rhel7-x86_64: /usr/lib64/libhwloc.so + hwloc@2.0.2 arch=linux-rhel7-broadwell: /usr/lib64/libhwloc.so mvapich2:: buildable: True version: [2.3] paths: - mvapich2@2.3%gcc@7.3.0 arch=linux-rhel7-x86_64: /usr/tce/packages/mvapich2/mvapich2-2.3-gcc-7.3.0/ + mvapich2@2.3%gcc@7.3.0 arch=linux-rhel7-broadwell: /usr/tce/packages/mvapich2/mvapich2-2.3-gcc-7.3.0/ openblas:: buildable: True @@ -45,6 +46,7 @@ opencv:: buildable: true + variants: build_type=RelWithDebInfo ~calib3d+core~cuda~dnn~eigen+fast-math~features2d~flann~gtk+highgui+imgproc~ipp~ipp_iw~jasper~java+jpeg~lapack~ml~opencl~opencl_svm~openclamdblas~openclamdfft~openmp+png~powerpc~pthreads_pf~python~qt+shared~stitching~superres+tiff~ts~video~videoio~videostab~vsx~vtk+zlib version: [4.1.0] compilers: @@ -60,4 +62,4 @@ f77: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran fc: /usr/tce/packages/gcc/gcc-7.3.0/bin/gfortran spec: gcc@7.3.0 - target: x86_64 + target: broadwell diff 
--git a/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml b/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml index 362c5d5927f..c05929efcc0 100644 --- a/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml +++ b/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml @@ -11,13 +11,23 @@ buildable: true version: [1.2.2] + conduit:: + buildable: true + variants: ~doc~doxygen+hdf5~hdf5_compat+mpi+python+shared~silo + version: [0.4.0] + + cnpy:: + buildable: true + variants: build_type=RelWithDebInfo + version: [master] + cub:: buildable: true version: [1.7.1] nccl:: buildable: true - version: [2.4.6-1] + version: [2.4.8-1] protobuf:: buildable: True @@ -44,7 +54,7 @@ buildable: True version: [1.2.11] - modules:: + modules: enable:: - tcl - lmod @@ -65,7 +75,6 @@ suffixes: '^openblas': openblas '^netlib-lapack': netlib - '^python@3.7.2': python-3.7.2 filter: # Exclude changes to any of these variables environment_blacklist: ['CPATH', 'LIBRARY_PATH'] @@ -87,7 +96,6 @@ suffixes: '^openblas': openblas '^netlib-lapack': netlib - '^python@3.7.2': python-3.7.2 filter: # Exclude changes to any of these variables environment_blacklist: ['CPATH', 'LIBRARY_PATH'] From 261c9cd5a86e359ce6d8c1f97948ae24db7a8910 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 22 Nov 2019 17:22:21 -0800 Subject: [PATCH 406/634] Progress with node2vec implementation (#1356) * Add @KIwabuchi's evaluation scripts to node2vec app * Tweak parameters in node2vec app to match SNAP --- .gitmodules | 4 +++ applications/graph/.gitignore | 2 ++ applications/graph/dataset.py | 16 +++++---- applications/graph/evaluate.py | 49 ++++++++++++++++++++++++++ applications/graph/largescale_node2vec | 1 + applications/graph/main.py | 43 ++++++++++++---------- applications/graph/utils/snap.py | 5 +++ 7 files changed, 95 insertions(+), 25 deletions(-) create mode 100644 applications/graph/.gitignore create mode 100644 applications/graph/evaluate.py create mode 160000 applications/graph/largescale_node2vec diff --git a/.gitmodules b/.gitmodules index 090b654b8e9..58b006fb3d4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,3 +2,7 @@ path = applications/graph/snap url = https://github.com/snap-stanford/snap ignore = dirty +[submodule "applications/graph/largescale_node2vec"] + path = applications/graph/largescale_node2vec + url = https://lc.llnl.gov/bitbucket/scm/havoq/largescale_node2vec.git + ignore = dirty diff --git a/applications/graph/.gitignore b/applications/graph/.gitignore new file mode 100644 index 00000000000..72e300c6440 --- /dev/null +++ b/applications/graph/.gitignore @@ -0,0 +1,2 @@ +experiments +run_scripts diff --git a/applications/graph/dataset.py b/applications/graph/dataset.py index 092413c56b2..ca8de12d001 100644 --- a/applications/graph/dataset.py +++ b/applications/graph/dataset.py @@ -5,24 +5,26 @@ """ import os.path - import numpy as np import utils.snap +root_dir = os.path.dirname(os.path.realpath(__file__)) # Options -graph_name = 'ego-Facebook' +graph_name = 'blog' +graph_file = os.path.join( + root_dir, 'largescale_node2vec', 'evaluation', 'dataset', + 'blog', 'edges_0based' +) walk_length = 80 walk_context_length = 10 -walks_per_node = 4 -return_param = 1.0 -inout_param = 1.0 +walks_per_node = 10 +return_param = 0.25 +inout_param = 0.25 directed = False weighted = False # Download graph and perform random walk, if needed -root_dir = os.path.dirname(os.path.realpath(__file__)) data_dir = os.path.join(root_dir, 'data', graph_name) -graph_file = os.path.join(data_dir, 'graph.txt') 
walk_file = os.path.join(data_dir, 'walk.txt') if not os.path.isfile(graph_file): utils.snap.download_graph(graph_name, graph_file) diff --git a/applications/graph/evaluate.py b/applications/graph/evaluate.py new file mode 100644 index 00000000000..8b67d2b9e38 --- /dev/null +++ b/applications/graph/evaluate.py @@ -0,0 +1,49 @@ +"""Helper script to evaluate quality of node embeddings. + +Converts the embedding weights computed by LBANN into a format that +can be read by Keita's evaluation script. + +""" +import argparse +import os.path +import sys + +import numpy as np + +# Command-line arguments +parser = argparse.ArgumentParser() +parser.add_argument( + 'embedding_file', type=str, + help='node embeddings computed by LBANN', metavar='EMBEDDING_FILE') +parser.add_argument( + 'label_file', type=str, + help='node labels', metavar='LABEL_FILE') +parser.add_argument( + '--snap-embedding-file', default='results.emb', type=str, + help='node embeddings in SNAP format', metavar='FILE') +args = parser.parse_args() + +# Construct embedding file in SNAP's format +embeddings = np.loadtxt(args.embedding_file) +embeddings = np.transpose(embeddings) +with open(args.snap_embedding_file, 'w') as f: + f.write(f'{embeddings.shape[0]} {embeddings.shape[1]}\n') + for index, embedding in enumerate(embeddings): + f.write(f'{index} {" ".join(str(x) for x in embedding)}\n') + +# Evaluate embeddings with Keita's evaluation script +root_dir = os.path.dirname(os.path.realpath(__file__)) +eval_script_dir = os.path.join( + root_dir, + 'largescale_node2vec', + 'evaluation', + 'multi_label_classification' +) +sys.path.append(eval_script_dir) +import multi_label_classification +multi_label_classification.main([ + '-x', args.snap_embedding_file, + '-y', args.label_file, + '-r', 0.9, + '-n', 10 +]) diff --git a/applications/graph/largescale_node2vec b/applications/graph/largescale_node2vec new file mode 160000 index 00000000000..1b0aa43fdf5 --- /dev/null +++ b/applications/graph/largescale_node2vec @@ -0,0 +1 @@ +Subproject commit 1b0aa43fdf5f8e956915926305f3e55c2c17972e diff --git a/applications/graph/main.py b/applications/graph/main.py index c6fdcc95518..420a2d2e092 100644 --- a/applications/graph/main.py +++ b/applications/graph/main.py @@ -1,3 +1,4 @@ +"""Learn embedding weights with LBANN.""" import argparse import os.path @@ -23,11 +24,14 @@ '--mini-batch-size', action='store', default=256, type=int, help='mini-batch size (default: 256)', metavar='NUM') parser.add_argument( - '--num-epochs', action='store', default=20, type=int, - help='number of epochs (default: 20)', metavar='NUM') + '--num-epochs', action='store', default=1, type=int, + help='number of epochs (default: 1)', metavar='NUM') parser.add_argument( '--latent-dim', action='store', default=128, type=int, help='latent space dimensions (default: 128)', metavar='NUM') +parser.add_argument( + '--experiment-dir', action='store', default=None, type=str, + help='directory for experiment artifacts', metavar='DIR') args = parser.parse_args() # ---------------------------------- @@ -47,33 +51,29 @@ walk_length = dataset.sample_dims()[0] # Input is a sequence of graph node IDs -input_ = lbann.Identity(lbann.Input(), device='cpu') +input_ = lbann.Identity(lbann.Input()) input_slice = lbann.Slice(input_, - slice_points=str_list(range(walk_length+1)), - device='cpu') + slice_points=str_list(range(walk_length+1))) walk = [] for _ in range(walk_length): - walk.append(lbann.Identity(input_slice, device='cpu')) + walk.append(lbann.Identity(input_slice)) # Skip-gram 
architecture latent = lbann.Embedding(walk[0], weights=embeddings, num_embeddings=num_graph_nodes, - embedding_dim=args.latent_dim, - device='cpu') + embedding_dim=args.latent_dim) pred = lbann.FullyConnected(latent, weights=embeddings, num_neurons=num_graph_nodes, has_bias=False, - transpose=True, - device='cpu') -pred = lbann.Softmax(pred, device='cpu') + transpose=True) +pred = lbann.Softmax(pred) # Objective function -ground_truth = lbann.Sum([lbann.OneHot(node, size=num_graph_nodes, device='cpu') - for node in walk[1:]], - device='cpu') -obj = lbann.CrossEntropy([pred, ground_truth], device='cpu') +ground_truth = lbann.Sum([lbann.OneHot(node, size=num_graph_nodes) + for node in walk[1:]]) +obj = lbann.CrossEntropy([pred, ground_truth]) # ---------------------------------- # Create data reader @@ -96,16 +96,23 @@ # Create LBANN objects trainer = lbann.Trainer() +callbacks = [ + lbann.CallbackPrint(), + lbann.CallbackTimer(), + lbann.CallbackDumpWeights(basename='embeddings', + epoch_interval=args.num_epochs), +] model = lbann.Model(args.mini_batch_size, args.num_epochs, layers=lbann.traverse_layer_graph(input_), objective_function=obj, - callbacks=[lbann.CallbackPrint(), - lbann.CallbackTimer()]) -opt = lbann.SGD(learn_rate=0.01, momentum=0.9) + callbacks=callbacks) +opt = lbann.SGD(learn_rate=0.025, momentum=0.9) # Run LBANN kwargs = lbann.contrib.args.get_scheduler_kwargs(args) lbann.contrib.lc.launcher.run(trainer, model, reader, opt, job_name=args.job_name, + experiment_dir=args.experiment_dir, + overwrite_script=True, **kwargs) diff --git a/applications/graph/utils/snap.py b/applications/graph/utils/snap.py index ca43ba767e9..2c65bf7231c 100644 --- a/applications/graph/utils/snap.py +++ b/applications/graph/utils/snap.py @@ -91,6 +91,11 @@ def node2vec_walk(graph_file, .format(node2vec_exe) ) + # Make sure output directory exists + output_dir = os.path.dirname(os.path.realpath(walk_file)) + if not os.path.isdir(output_dir): + os.makedirs(output_dir) + # Construct invocation command = [ node2vec_exe, From e88d2a774cbb7ce0b98c3c5092626328e7aa0fe7 Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Sun, 24 Nov 2019 20:08:07 -0800 Subject: [PATCH 407/634] Bugfix osx build env (#1353) * Updating the OSX spack build environment to match the ones used on x86_64 and ppc64le systems. * Updated the local OSX environment to use veclibfort rather than openblas * Updating for skylake CPU type --- .../developer_release_osx_spack.yaml | 97 +++++++------------ .../externals_osx_highsierra_skylake.yaml | 48 +++++++++ .../std_versions_and_variants_llnl_lc_cz.yaml | 1 + 3 files changed, 86 insertions(+), 60 deletions(-) create mode 100644 spack_environments/externals_osx_highsierra_skylake.yaml diff --git a/spack_environments/developer_release_osx_spack.yaml b/spack_environments/developer_release_osx_spack.yaml index 8b1bee0296c..eeb29c7977e 100644 --- a/spack_environments/developer_release_osx_spack.yaml +++ b/spack_environments/developer_release_osx_spack.yaml @@ -2,28 +2,42 @@ # # It describes a set of packages to be installed, along with # configuration settings. + +################################################################################ +# Notes on building a environment file: +# 1) Packages that should be explicitly installed to satisfy dependencies +# for LBANN, Hydrogen, or Aluminum are in the specs list. +# 2) Packages and their variants that are secondary dependencies of +# the primary dependencies should be in the packages list. 
This +# ensures that all primary dependenies build with a consistent set +# of secondary (and tertiary, ...) dependencies. +################################################################################ + spack: + concretization: together # add package specs to the `specs` list specs: - - protobuf@3.6.1 build_type=Release +shared - - conduit@master~doc~doxygen+hdf5~hdf5_compat+mpi+python+shared~silo - - cnpy@master build_type=RelWithDebInfo - - opencv@3.4.3 build_type=RelWithDebInfo ~calib3d+core~cuda~dnn~eigen+fast-math~features2d~flann~gtk+highgui+imgproc~ipp~ipp_iw~jasper~java+jpeg~lapack~ml~opencl~opencl_svm~openclamdblas~openclamdfft~openmp+png+powerpc~pthreads_pf~python~qt+shared~stitching~superres+tiff~ts~video~videoio~videostab+vsx~vtk+zlib - - cereal@1.2.2 build_type=RelWithDebInfo patches=2dfa0bff9816d0ebd8a1bcc70ced4483b3cda83a982ea5027f1aaadceaa15aac,720265382f29b744488d67e8df5000f2ca1b4dceb2018835fb5dc7a3a1c23f75,91f968e9ac3964e1a689a9ad379ab16f7803ac3d34d24f87ebcaecaa3f9a2f16 - - ninja@1.8.2 - - zlib@1.2.11 - - openblas@0.3.4 cpu_target=auto ~ilp64+pic+shared threads=none ~virtual_machine - - hwloc@2.0.2 - - cmake@3.12.1 - - py-cython@0.29 - - py-breathe - - py-m2r - - py-sphinx - - py-certifi - - py-urllib3 - - py-idna - - py-chardet - - doxygen + - catch2 + - clara + - conduit + - cnpy + - opencv + - cereal + - ninja + - zlib + - cmake + - hwloc + - py-argparse + - py-configparser + - py-cython + - py-graphviz + - py-matplotlib + - py-onnx + - py-pandas + - protobuf + - py-protobuf+cpp + - py-setuptools + - py-texttable mirrors: {} modules: enable: [] @@ -32,45 +46,8 @@ spack: ################################################################################ # Include paths to standard compilers and packages on LLNL LC systems # Remove and/or replace these with your site specific packages and paths +# Note that the include files are expected to be local to this yaml file ################################################################################ -# include: -# - externals_llnl_lc_cz.yaml - packages: - all: - providers: - mpi: [openmpi@4.0 arch=darwin-highsierra-x86_64] - buildable: true - version: [] - paths: {} - modules: {} - compiler: [clang@7.0.1 arch=darwin-highsierra-x86_64] - - cmake: - variants: ~openssl ~ncurses - paths: - cmake@3.14.0 arch=darwin-highsierra--x86_64: /usr/local/ - python: - buildable: True - variants: +shared - version: [3.7.2] - - openmpi: - buildable: False - version: [4.0] - paths: - openmpi@4.0 arch=darwin-highsierra-x86_64: /usr/local/ - - compilers: - - compiler: - environment: {} - extra_rpaths: [] - flags: {} - modules: [] - operating_system: highsierra - paths: - cc: /usr/local/Cellar/llvm/7.0.1/bin/clang - cxx: /usr/local/Cellar/llvm/7.0.1/bin/clang++ - f77: /usr/local/bin/gfortran - fc: /usr/local/bin/gfortran - spec: clang@7.0.1 - target: x86_64 + include: + - std_versions_and_variants_llnl_lc_cz.yaml + - externals_osx_highsierra_skylake.yaml diff --git a/spack_environments/externals_osx_highsierra_skylake.yaml b/spack_environments/externals_osx_highsierra_skylake.yaml new file mode 100644 index 00000000000..2dbe7d619f3 --- /dev/null +++ b/spack_environments/externals_osx_highsierra_skylake.yaml @@ -0,0 +1,48 @@ + packages: + all: + providers: + mpi: [openmpi@4.0 arch=darwin-highsierra-skylake] + blas: [veclibfort] + lapack: [veclibfort] + buildable: true + version: [] + paths: {} + modules: {} + compiler: [clang@9.0.0 arch=darwin-highsierra-skylake] + + cmake:: + buildable: True + variants: ~openssl ~ncurses + version: [3.15.4] + 
paths: + cmake@3.15.4 arch=darwin-highsierra-skylake: /usr/local/ + + hwloc:: + buildable: True + version: [2.0.2] + + opencv:: + buildable: true + variants: build_type=RelWithDebInfo ~calib3d+core~cuda~dnn~eigen+fast-math~features2d~flann~gtk+highgui+imgproc~ipp~ipp_iw~jasper~java+jpeg~lapack~ml~opencl~opencl_svm~openclamdblas~openclamdfft~openmp+png~powerpc~pthreads_pf~python~qt+shared~stitching~superres+tiff~ts~video~videoio~videostab~vsx~vtk+zlib + version: [4.1.0] + + openmpi: + buildable: False + version: [4.0] + paths: + openmpi@4.0 arch=darwin-highsierra-skylake: /usr/local/ + + compilers: + - compiler: + environment: {} + extra_rpaths: [] + flags: {} + modules: [] + operating_system: highsierra + paths: + cc: /usr/local/Cellar/llvm/9.0.0/bin/clang + cxx: /usr/local/Cellar/llvm/9.0.0/bin/clang++ + f77: /usr/local/bin/gfortran + fc: /usr/local/bin/gfortran + spec: clang@9.0.0 + target: x86_64 diff --git a/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml b/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml index c05929efcc0..142d24faad9 100644 --- a/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml +++ b/spack_environments/std_versions_and_variants_llnl_lc_cz.yaml @@ -36,6 +36,7 @@ python:: buildable: True + variants: +shared ~readline ~zlib ~bz2 ~lzma ~pyexpat version: [3.7.2] py-cython:: From 7311a06ad995c5ba7a7ee4915870c146cbd31ca6 Mon Sep 17 00:00:00 2001 From: Nikoli Dryden Date: Wed, 27 Nov 2019 06:58:37 -0800 Subject: [PATCH 408/634] Fix bugs in Python frontend Import was not updated for reorg Broken command line parameters --- applications/vision/resnet.py | 4 ---- python/lbann/contrib/args.py | 14 ++++++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/applications/vision/resnet.py b/applications/vision/resnet.py index ed6d7969664..b7d4667bfa8 100644 --- a/applications/vision/resnet.py +++ b/applications/vision/resnet.py @@ -51,9 +51,6 @@ '--random-seed', action='store', default=0, type=int, help='random seed for LBANN RNGs', metavar='NUM') lbann.contrib.args.add_optimizer_arguments(parser, default_learning_rate=0.1) -parser.add_argument( - '--setup_only', action='store_true', - help='setup LBANN experiment without running it') args = parser.parse_args() # Due to a data reader limitation, the actual model realization must be @@ -154,5 +151,4 @@ kwargs = lbann.contrib.args.get_scheduler_kwargs(args) lbann.contrib.lc.launcher.run(trainer, model, data_reader, opt, job_name=args.job_name, - setup_only=args.setup_only, **kwargs) diff --git a/python/lbann/contrib/args.py b/python/lbann/contrib/args.py index 86e08e0c9b5..eef2f73fa77 100644 --- a/python/lbann/contrib/args.py +++ b/python/lbann/contrib/args.py @@ -1,6 +1,6 @@ """Helper functions to add common command-line arguments.""" import argparse -import lbann.optimizer +import lbann.core.optimizer def add_scheduler_arguments(parser): """Add command-line arguments for common scheduler settings. 
@@ -118,14 +118,16 @@ def create_optimizer(args): # Create optimizer if opt == 'momentum': - return lbann.optimizer.SGD(learn_rate=lr, momentum=0.9) + return lbann.core.optimizer.SGD(learn_rate=lr, momentum=0.9) elif opt == 'sgd': - return lbann.optimizer.SGD(learn_rate=lr) + return lbann.core.optimizer.SGD(learn_rate=lr) elif opt == 'adam': - return lbann.optimizer.Adam(learn_rate=lr, beta1=0.9, beta2=0.99, eps=1e-8) + return lbann.core.optimizer.Adam(learn_rate=lr, beta1=0.9, beta2=0.99, + eps=1e-8) elif opt == 'adagrad': - return lbann.optimizer.AdaGrad(learn_rate=lr, eps=1e-8) + return lbann.core.optimizer.AdaGrad(learn_rate=lr, eps=1e-8) elif opt == 'rmsprop': - return lbann.optimizer.RMSprop(learn_rate=lr, decay_rate=0.99, eps=1e-8) + return lbann.core.optimizer.RMSprop(learn_rate=lr, decay_rate=0.99, + eps=1e-8) else: raise ValueError('invalid optimizer type ({})'.format(opt)) From 49dc3636172134c0aaf920c2b57d856475231751 Mon Sep 17 00:00:00 2001 From: "David A. Hysom" Date: Fri, 29 Nov 2019 14:02:09 -0800 Subject: [PATCH 409/634] Initial commit for pilot data normalization file. --- .../CANDLE/pilot2/data/pilot2_normalization.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 applications/CANDLE/pilot2/data/pilot2_normalization.txt diff --git a/applications/CANDLE/pilot2/data/pilot2_normalization.txt b/applications/CANDLE/pilot2/data/pilot2_normalization.txt new file mode 100644 index 00000000000..635bb276d2b --- /dev/null +++ b/applications/CANDLE/pilot2/data/pilot2_normalization.txt @@ -0,0 +1,15 @@ +max min mean std_dev: +2.40001 0 0.36777 0.248641 +1.98564 0 0.186917 0.158741 +1.95085 0 0.138673 0.127767 +2.53137 3.24127e-08 0.447338 0.240423 +2.48228 0 0.273659 0.220102 +2.3157 0 0.408046 0.198704 +2.06274 0 0.0375413 0.0741625 +2.50623 3.25748e-05 0.663633 0.218187 +2.6136 6.98301e-06 0.712461 0.265916 +2.41215 0 0.2882 0.213314 +2.80934 0 0.0411697 0.0691358 +2.4133 0 0.151854 0.150553 +2.69434 0 0.724106 0.293084 +2.71227 0.00951633 0.893146 0.222718 From 1e20c7de59f1ae5c02d96e39bdbab8c7a6d1b6e8 Mon Sep 17 00:00:00 2001 From: Sam Ade Jacobs Date: Tue, 3 Dec 2019 13:47:41 -0800 Subject: [PATCH 410/634] Add support for Deconv (transpose conv) in python frontend (#1360) * Add support for Deconv (transpose conv) in python frontend * remove transpose flag from ResNet --- python/lbann/models/resnet.py | 1 + python/lbann/modules/base.py | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/python/lbann/models/resnet.py b/python/lbann/models/resnet.py index 6277bbcde1e..319c8ec8cb4 100644 --- a/python/lbann/models/resnet.py +++ b/python/lbann/models/resnet.py @@ -41,6 +41,7 @@ def __init__(self, out_channels, kernel_size, stride, padding, stride=stride, padding=padding, bias=False, name=self.name + '_conv') + # Initialize batch normalization bn_scale_init = 0.0 if bn_zero_init else 1.0 diff --git a/python/lbann/modules/base.py b/python/lbann/modules/base.py index 76ef2989af7..d4031c06df6 100644 --- a/python/lbann/modules/base.py +++ b/python/lbann/modules/base.py @@ -126,7 +126,7 @@ class ConvolutionModule(Module): def __init__(self, num_dims, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, - weights=[], activation=None, name=None): + weights=[], activation=None, name=None, transpose=False): """Initialize convolution module. Args: @@ -146,6 +146,8 @@ def __init__(self, num_dims, will be initialized with He normal initialization and the bias with zeros. 
name (str): Default name is in the form 'convmodule'. + transpose (bool): If true call deconvolution (or convolution + transpose) """ super().__init__() @@ -163,6 +165,7 @@ def __init__(self, num_dims, self.name = (name if name else 'convmodule{0}'.format(ConvolutionModule.global_count)) + self.transpose = transpose # Initialize weights # Note: If weights are not provided, kernel weights are @@ -195,7 +198,21 @@ def __init__(self, num_dims, def forward(self, x): self.instance += 1 name = '{0}_instance{1}'.format(self.name, self.instance) - y = lbann.Convolution(x, + if(self.transpose): + y = lbann.Deconvolution(x, + weights=self.weights, + name=(name+'_deconv' if self.activation else name), + num_dims=self.num_dims, + num_output_channels=self.out_channels, + has_vectors=False, + conv_dims_i=self.kernel_size, + conv_pads_i=self.padding, + conv_strides_i=self.stride, + conv_dilations_i=self.dilation, + num_groups=self.groups, + has_bias=self.bias) + else: + y = lbann.Convolution(x, weights=self.weights, name=(name+'_conv' if self.activation else name), num_dims=self.num_dims, From 689e746cc7f809bd71d0bc70a7a5ea6e97f95118 Mon Sep 17 00:00:00 2001 From: davidHysom Date: Tue, 3 Dec 2019 17:42:35 -0800 Subject: [PATCH 411/634] Develop new data reader: ras_lipid_conduit_data_reader (#1347) * Created data reader for Pilot 2 RAS data set in NPZ format. * Added methods to data store: clear_owner_map() and add_owner(); these are needed to support ras_lipid_conduit_data_reader * cast the "states" labels from double to integers. * moved commify() from data_store_conduit to utils/commify[hpp|cpp] * Added driver to compute min-max normalization for pilot2 data * Modified data_reader_npz_ras_lipid.cpp to perform optional z-score normalization. * tweaks: removed m_cur_epoch variable from data_store, as it was misleading and not needed. Made periodically printing the number of samples loaded in data_reader_npy_ras_lipid optional; the --verbose flag enables it. * bug fix for flag for exchanging owner maps; also, exception is thrown if exchange_owner_maps is called, and maps are empty. * reworked logic for exchanging owner maps; the code no longer keeps track of the epoch. --- .../data_reader_npz_ras_lipid.hpp | 159 ++++++ .../lbann/data_store/data_store_conduit.hpp | 17 +- include/lbann/lbann.hpp | 1 + include/lbann/utils/commify.hpp | 16 + model_zoo/jag_utils/CMakeLists.txt | 7 +- .../compute_pilot2_normalization.cpp | 221 ++++++++ src/data_readers/CMakeLists.txt | 1 + src/data_readers/data_reader.cpp | 2 - .../data_reader_npz_ras_lipid.cpp | 526 ++++++++++++++++++ src/data_store/data_store_conduit.cpp | 90 ++- src/proto/proto_common.cpp | 9 + src/utils/CMakeLists.txt | 1 + src/utils/commify.cpp | 56 ++ 13 files changed, 1051 insertions(+), 55 deletions(-) create mode 100644 include/lbann/data_readers/data_reader_npz_ras_lipid.hpp create mode 100644 include/lbann/utils/commify.hpp create mode 100644 model_zoo/jag_utils/compute_pilot2_normalization.cpp create mode 100644 src/data_readers/data_reader_npz_ras_lipid.cpp create mode 100644 src/utils/commify.cpp diff --git a/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp b/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp new file mode 100644 index 00000000000..84c23a69726 --- /dev/null +++ b/include/lbann/data_readers/data_reader_npz_ras_lipid.hpp @@ -0,0 +1,159 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. 
+// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + + +#ifndef LBANN_DATA_READER_NPZ_RAS_LIPID_HPP +#define LBANN_DATA_READER_NPZ_RAS_LIPID_HPP + +#include "conduit/conduit.hpp" +#include "lbann/utils/options.hpp" +//#include "lbann/data_readers/sample_list_file_ptr.hpp" +#include "lbann/data_readers/data_reader.hpp" +#include "conduit/conduit.hpp" +#include +#include + +namespace lbann { + /** + * Data reader for data stored in numpy (.npz) files that are encapsulated + * in conduit::Nodes + */ +class ras_lipid_conduit_data_reader : public generic_data_reader { + +public: + //using sample_name_t = std::string; + //using sample_list_t = sample_list_file_ptr; + + ras_lipid_conduit_data_reader(const bool shuffle); + ras_lipid_conduit_data_reader(const ras_lipid_conduit_data_reader&); + ras_lipid_conduit_data_reader& operator=(const ras_lipid_conduit_data_reader&); + ~ras_lipid_conduit_data_reader() override {} + + ras_lipid_conduit_data_reader* copy() const override { return new ras_lipid_conduit_data_reader(*this); } + + std::string get_type() const override { + return "ras_lipid_conduit_data_reader"; + } + + void load() override; + + void set_num_labels(int n) { m_num_labels = n; } + + int get_linearized_data_size() const override { return m_num_features; } + int get_linearized_label_size() const override { return m_num_labels; } + int get_linearized_response_size() const override { return m_num_response_features; } + const std::vector get_data_dims() const override { return m_data_dims; } + int get_num_labels() const override { return m_num_labels; } + +private: + int m_num_features = 0; + int m_num_labels = 0; + int m_num_response_features = 0; + std::vector m_data_dims; + +// sample_list_t m_sample_list; + + /** @brief List of input npz filenames */ + std::vector m_filenames; + + /** @brief The global number of samples */ + int m_num_samples = 0; + + /** @brief m_samples_per_file[j] contains the number of samples in the j-th file */ + std::vector m_samples_per_file; + + /** @brief Maps a data_id to the file index (in m_filenames) that + * contains the sample, and the offset in that file's npy array */ + std::unordered_map> m_data_id_map; + + /** @brief Maps a field name to the data's shape + * + * Example: "bbs" -> {184, 3} + */ + std::unordered_map> m_datum_shapes; + + /** @brief Maps a field name to the word size */ + std::unordered_map m_datum_word_sizes; + + /** @brief Maps a field name to the number of bytes in the datum + * + * Example: "bbs" -> 184*3*word_size + */ + std::unordered_map m_datum_num_bytes; + + 
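For orientation, these shape and size maps are derived from the cnpy metadata of the first input file; the following is a sketch of what fill_in_metadata() (declared below, defined in the .cpp) does, under the assumption that dimension 0 of each array is the sample axis:

    // Sketch: derive per-field word and byte counts from one npz archive.
    std::map<std::string, cnpy::NpyArray> aa = cnpy::npz_load(m_filenames[0]);
    for (const auto &t : aa) {
      size_t num_words = 1;
      for (size_t x = 1; x < t.second.shape.size(); x++) {
        num_words *= t.second.shape[x];   // skip dim 0, the sample axis
      }
      m_datum_num_words[t.first] = num_words;
      m_datum_num_bytes[t.first] = num_words * t.second.word_size;
    }
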
/** @brief Maps a field name to the number of words in the datum */ + std::unordered_map m_datum_num_words; + + //===================================================================== + // private methods follow + //===================================================================== + + /** @brief Contains common code for operator= and copy ctor */ + void copy_members(const ras_lipid_conduit_data_reader& rhs); + + void do_preload_data_store() override; + + bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override; + bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override; + bool fetch_response(CPUMat& Y, int data_id, int mb_idx) override; + + /** @brief Populates in m_datum_shapes, m_datum_num_bytes, m_datum_word_sizes */ + void fill_in_metadata(); + + /** @brief Collect the sample_ids that belong to this rank and + * rebuild the data store's owner map + * + * my_samples maps a filename (index in m_filenames) to the pair: + * (data_id, local index of the sample wrt the samples in the file). + */ + void get_my_indices(std::unordered_map>> &my_samples); + + /** @brief Re-build the data store's owner map + * + * This one-off, wouldn't need to do this if we were using sample lists. + */ + void rebuild_data_store_owner_map(); + + /** @brief Fills in m_samples_per_file */ + void get_samples_per_file(); + + /** @brief Write file sizes to disk + * + * Each line of the output file contains: filename num_samples + */ + void write_file_sizes(); + + /** @brief Read file that contains: filename num_samples + * + * see: write_file_sizes() + */ + void read_file_sizes(); +}; + +} // namespace lbann + +#endif //LBANN_DATA_READER_NPZ_RAS_LIPID_HPP diff --git a/include/lbann/data_store/data_store_conduit.hpp b/include/lbann/data_store/data_store_conduit.hpp index 3d436475a0a..6f02a7be608 100644 --- a/include/lbann/data_store/data_store_conduit.hpp +++ b/include/lbann/data_store/data_store_conduit.hpp @@ -92,8 +92,11 @@ class data_store_conduit { /** @brief Returns the conduit Node associated with the data_id */ const conduit::Node & get_conduit_node(int data_id) const; - /// if 'already_have = true' then the passed 'node' was obtained by a call to - /// get_empty_node(). 
In some operating modes this saves us from copying the node
+ /** @brief Set a conduit node in the data store
+ *
+ * if 'already_have = true' then the passed 'node' was obtained by a call to
+ * get_empty_node(); note, we do this to prevent copying the node
+ */
 void set_conduit_node(int data_id, conduit::Node &node, bool already_have = false);

 void set_preloaded_conduit_node(int data_id, const conduit::Node &node);
@@ -176,6 +179,12 @@
 /// fills in m_owner, which maps index -> owning processor
 void build_preloaded_owner_map(const std::vector<int>& per_rank_list_sizes);

+ /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */
+ void clear_owner_map();
+
+ /** @brief Special handling for ras_lipid_conduit_data_reader; may go away in the future */
+ void add_owner(int data_id, int owner) { m_owner[data_id] = owner; }
+
 /// Recompact the nodes because they are not copied properly when instantiating
 /// using the copy constructor
 void compact_nodes();
@@ -247,6 +256,8 @@
 private :

+ bool m_owner_maps_were_exchanged = false;
+
 bool m_run_checkpoint_test = false;

 /** @brief The number of samples that this processor owns */
@@ -341,8 +352,6 @@ private :
 // END: timers for profiling exchange_data
 //===========================================================

- int m_cur_epoch = 0;
-
 bool m_is_setup = false;

 /// set to true if data_store is preloaded
diff --git a/include/lbann/lbann.hpp b/include/lbann/lbann.hpp
index d4cc34730a0..31fb82917c0 100644
--- a/include/lbann/lbann.hpp
+++ b/include/lbann/lbann.hpp
@@ -117,6 +117,7 @@
 #include "lbann/layers/misc/one_hot.hpp"

 /// Data readers
+#include "lbann/data_readers/data_reader_npz_ras_lipid.hpp"
 #include "lbann/data_readers/data_reader_imagenet.hpp"
 #include "lbann/data_readers/data_reader_cifar10.hpp"
 #include "lbann/data_readers/data_reader_mnist.hpp"
diff --git a/include/lbann/utils/commify.hpp b/include/lbann/utils/commify.hpp
new file mode 100644
index 00000000000..d5c43ab1956
--- /dev/null
+++ b/include/lbann/utils/commify.hpp
@@ -0,0 +1,16 @@
+#ifndef LBANN_UTILS_COMMIFY_INCLUDED
+#define LBANN_UTILS_COMMIFY_INCLUDED
+
+#include <string>
+
+namespace lbann
+{
+namespace utils
+{
+
+/** @brief Inserts commas into large integers for pretty-printing */
+std::string commify(size_t n);
+
+}// namespace utils
+}// namespace lbann
+#endif // LBANN_UTILS_COMMIFY_INCLUDED
diff --git a/model_zoo/jag_utils/CMakeLists.txt b/model_zoo/jag_utils/CMakeLists.txt
index 98b95212085..8307c0421e5 100644
--- a/model_zoo/jag_utils/CMakeLists.txt
+++ b/model_zoo/jag_utils/CMakeLists.txt
@@ -1,6 +1,11 @@
 # Add a target to control building all the utilities
 add_custom_target(jag-utils)

+add_executable(compute_pilot2_normalization
+ EXCLUDE_FROM_ALL compute_pilot2_normalization.cpp)
+target_link_libraries(compute_pilot2_normalization lbann)
+add_dependencies(jag-utils compute_pilot2_normalization)
+
 add_executable(build_index
 EXCLUDE_FROM_ALL build_index.cpp)
 target_link_libraries(build_index lbann)
@@ -88,7 +93,7 @@ add_dependencies(jag-utils convert)

 # Install the binaries
 install(
- TARGETS select_samples build_sample_id_mapping build_index
+ TARGETS select_samples build_sample_id_mapping build_index compute_pilot2_normalization
 OPTIONAL
 EXPORT LBANNTargets
 RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
diff --git a/model_zoo/jag_utils/compute_pilot2_normalization.cpp b/model_zoo/jag_utils/compute_pilot2_normalization.cpp
new file mode 100644
index 00000000000..4bcd94d7038
--- /dev/null
+++
b/model_zoo/jag_utils/compute_pilot2_normalization.cpp @@ -0,0 +1,221 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include "lbann/comm.hpp" +#include "lbann/utils/options.hpp" +#include "lbann/utils/exception.hpp" +#include "lbann/utils/jag_utils.hpp" +#include "lbann/utils/commify.hpp" +#include +#include + +using namespace lbann; + +int main(int argc, char *argv[]) { + int random_seed = 0; + world_comm_ptr comm = initialize(argc, argv, random_seed); + bool master = comm->am_world_master(); + + try { + // Initialize options db (this parses the command line) + options *opts = options::get(); + opts->init(argc, argv); + + if (argc == 1) { + if (master) { + std::cerr << "usage: " << argv[0] << " --filelist= --output_fn=" << std::endl; + } + return EXIT_FAILURE; + } + + if (! 
(opts->has_string("filelist") && opts->has_string("output_fn"))) { + throw lbann_exception(std::string{} + __FILE__ + " " + std::to_string(__LINE__) + " :: improper invocation; run with no cmd line args for proper invocation"); + } + + const std::string input_fn = opts->get_string("filelist"); + const std::string output_fn = opts->get_string("output_fn"); + + //sanity check that we can write to the output file + if (master) { + std::ofstream out(output_fn.c_str()); + if (!out) { + LBANN_ERROR("failed to open ", output_fn, " for writing"); + } + out.close(); + } + + int rank = comm->get_rank_in_world(); + int np = comm->get_procs_in_world(); + + // get list of input filenames + std::vector filenames; + read_filelist(comm.get(), input_fn, filenames); + + size_t total_elts_per_channel = 0; + std::vector v_max(14, 0.); + std::vector v_min(14, std::numeric_limits::max()); + std::vector v_mean(14, 0); + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + size_t n_elts = a["density_sig1"].num_vals; + double *data = reinterpret_cast(a["density_sig1"].data_holder->data()); + + int s = 0; + for (size_t i=0; i v_max[s]) v_max[s] = vv; + if (vv < v_min[s]) v_min[s] = vv; + ++s; + if (s == 14) { + s = 0; + } + } + if (master) { + std::cerr << "approx " << utils::commify(total_elts_per_channel*np) << " samples processed" << std::endl; + } + } + // ==================== finished processing all files ======================== + + std::vector f_max(14, 0.); + std::vector f_min(14, 0.); + std::vector f_mean(14, 0.); + + comm->trainer_allreduce(v_max.data(), v_max.size(), f_max.data(), El::mpi::MAX); + comm->trainer_allreduce(v_min.data(), v_min.size(), f_min.data(), El::mpi::MIN); + comm->trainer_allreduce(v_mean.data(), v_mean.size(), f_mean.data(), El::mpi::SUM); + size_t n3 = comm->trainer_allreduce(total_elts_per_channel); + for (size_t j=0; j v_minus_mean_squared(14, 0.); + std::vector stdev(14, 0.); + for (size_t j=rank; j a = cnpy::npz_load(filenames[j]); + size_t n_elts = a["density_sig1"].num_vals; + double *data = reinterpret_cast(a["density_sig1"].data_holder->data()); + + int s = 0; + for (size_t i=0; i f_minus_mean_squared(14, 0.); + std::vector f_std_dev(14, 0.); + comm->trainer_allreduce(v_minus_mean_squared.data(), v_minus_mean_squared.size(), f_minus_mean_squared.data(), El::mpi::SUM); + if (master) std::cout << "n3: " << n3 << std::endl; + for (size_t j=0; jset_profile_msg(s.str()); - size_t n = m_data_store->get_num_global_indices(); if (n != m_shuffled_indices.size()) { LBANN_ERROR("num samples loaded: ", n, " != shuffled-indices.size(): ", m_shuffled_indices.size()); @@ -833,5 +832,4 @@ void generic_data_reader::preload_data_store() { } - } // namespace lbann diff --git a/src/data_readers/data_reader_npz_ras_lipid.cpp b/src/data_readers/data_reader_npz_ras_lipid.cpp new file mode 100644 index 00000000000..ff19b43aece --- /dev/null +++ b/src/data_readers/data_reader_npz_ras_lipid.cpp @@ -0,0 +1,526 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/data_readers/data_reader_npz_ras_lipid.hpp" +#include "lbann/data_store/data_store_conduit.hpp" +#include +#include "lbann/utils/file_utils.hpp" // pad() +#include "lbann/utils/jag_utils.hpp" // read_filelist(..) TODO should be move to file_utils +#include "lbann/utils/timer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/utils/commify.hpp" +#include "lbann/utils/lbann_library.hpp" + +namespace lbann { + +ras_lipid_conduit_data_reader::ras_lipid_conduit_data_reader(const bool shuffle) + : generic_data_reader(shuffle) {} + +ras_lipid_conduit_data_reader::ras_lipid_conduit_data_reader(const ras_lipid_conduit_data_reader& rhs) : generic_data_reader(rhs) { + copy_members(rhs); +} + +ras_lipid_conduit_data_reader& ras_lipid_conduit_data_reader::operator=(const ras_lipid_conduit_data_reader& rhs) { + // check for self-assignment + if (this == &rhs) { + return (*this); + } + generic_data_reader::operator=(rhs); + copy_members(rhs); + return (*this); +} + + +void ras_lipid_conduit_data_reader::copy_members(const ras_lipid_conduit_data_reader &rhs) { + if (is_master()) { + std::cout << "Starting ras_lipid_conduit_data_reader::copy_members\n"; + } + if(rhs.m_data_store != nullptr) { + m_data_store = new data_store_conduit(rhs.get_data_store()); + } + m_data_store->set_data_reader_ptr(this); + m_filenames = rhs.m_filenames; + m_samples_per_file = rhs.m_samples_per_file; + m_data_id_map = rhs.m_data_id_map; + m_datum_shapes = rhs.m_datum_shapes; + m_datum_word_sizes = rhs.m_datum_word_sizes; + m_datum_num_bytes = rhs.m_datum_num_bytes; + m_datum_num_words = rhs.m_datum_num_words; + m_num_features = rhs.m_num_features; + m_num_labels = rhs.m_num_labels; + m_num_response_features = rhs.m_num_response_features; + m_data_dims = rhs.m_data_dims; +} + +void ras_lipid_conduit_data_reader::load() { + if(is_master()) { + std::cout << "starting load for role: " << get_role() << std::endl; + } + + options *opts = options::get(); + + if (! opts->get_bool("preload_data_store")) { + LBANN_ERROR("ras_lipid_conduit_data_reader requires data_store; please pass either --preload_data_store on the cmd line"); + } + + //dah - for now, I assume the input file contains, on each line, the complete + // pathname of an npz file. + std::string infile = get_data_filename(); + read_filelist(m_comm, infile, m_filenames); + + fill_in_metadata(); + + if (opts->has_string("pilot2_read_file_sizes")) { + read_file_sizes(); + } else { + double tm3 = get_time(); + get_samples_per_file(); + if (is_master()) std::cout << "time to compute samples_per_file: " << get_time() - tm3 << std::endl; + } + if (opts->has_string("pilot2_save_file_sizes")) { + write_file_sizes(); + } + + //Note: we really need the sample list here, but to get this working + //I'm doing something clunky ... 
+ //XX + /* + int data_id = 0; + for (size_t j=0; j>> my_samples; + get_my_indices(my_samples); + + bool verbose = options::get()->get_bool("verbose"); + + std::ofstream out; + if (is_master() && options::get()->has_string("pilot2_profile")) { + out.open(options::get()->get_string("pilot2_profile").c_str()); + if (!out) { + LBANN_ERROR("failed to open ", options::get()->get_string("pilot2_profile"), " for writing"); + } + } + + std::vector min; + std::vector max_min; + std::vector mean; + std::vector std_dev; + bool min_max = false; + bool z_score = false; + if (options::get()->has_string("normalization")) { + min_max = true; + z_score = options::get()->get_bool("z_score"); + if (is_master()) { + if (z_score) { + std::cout << "Normalizing data using z-score" << std::endl; + } else { + std::cout << "Normalizing data using min-max" << std::endl; + } + } + + std::string fn = options::get()->get_string("normalization"); + std::ifstream in(fn.c_str()); + if (!in) { + LBANN_ERROR("failed to open ", fn, " for reading"); + } + std::string line; + getline(in, line); + max_min.reserve(14); + min.reserve(14); + mean.reserve(14); + std_dev.reserve(14); + double v_max, v_min, v_mean, v_std_dev; + while (in >> v_max >> v_min >> v_mean >> v_std_dev) { + min.push_back(v_min); + max_min.push_back(v_max - v_min); + mean.push_back(v_mean); + std_dev.push_back(v_std_dev); + } + in.close(); + if (min.size() != 14) { + LBANN_ERROR("normalization.size() = ", min.size(), "; should be 14"); + } + } else { + if (is_master()) { + std::cout << "NOT Normalizing data!" << std::endl; + } + } + + // construct a conduit::Node for each sample that this rank owns, + // and set it in the data_store + size_t nn = 0; + std::vector dist(3, 0); + for (const auto &t : my_samples) { + std::map a = cnpy::npz_load(m_filenames[t.first]); + for (const auto &t4 : t.second) { + int data_id = t4.first; + int sample_index = t4.second; + conduit::Node &node = m_data_store->get_empty_node(data_id); + + size_t offset; + for (const auto &t5 : m_datum_shapes) { + const std::string &name = t5.first; + // this could be done better ... read the choices of fields + // to use from file, as is done in data_reader_jag_conduit? 
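The two normalization modes selected above reduce to the standard per-channel transforms applied in the loop below; in isolation they look roughly like this (an illustrative helper, not part of the reader; s cycles through the 14 channels, and the vectors are the ones just read from the normalization file):

    // Sketch of the per-element transforms applied to density_sig1 below.
    // z-score: zero mean, unit variance; min-max: rescale to [0,1].
    double normalize_elt(double x, int s, bool z_score,
                         const std::vector<double> &min,
                         const std::vector<double> &max_min,
                         const std::vector<double> &mean,
                         const std::vector<double> &std_dev) {
      return z_score ? (x - mean[s]) / std_dev[s]
                     : (x - min[s]) / max_min[s];
    }
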
+
+ if (name == "frames") {
+ conduit::int64 *data = reinterpret_cast<conduit::int64*>(a[name].data_holder->data());
+ offset = sample_index*m_datum_num_words["frames"];
+ node[LBANN_DATA_ID_STR(data_id) + "/" + name].set(data + offset, m_datum_num_words[name]);
+ }
+
+ else if (name == "bbs") {
+ conduit::float32 *data = reinterpret_cast<conduit::float32*>(a[name].data_holder->data());
+ offset = sample_index*m_datum_num_words["bbs"];
+ node[LBANN_DATA_ID_STR(data_id) + "/" + name].set(data + offset, m_datum_num_words[name]);
+ }
+
+ else { // rots, states, tilts, density_sig1, probs
+ offset = sample_index*m_datum_num_words[name];
+ conduit::float64 *data = reinterpret_cast<conduit::float64*>(a[name].data_holder->data());
+
+ if (name == "states") {
+ int label = (data + offset)[0];
+ if (label < 0 || label > 2) {
+ LBANN_ERROR("bad label; should be 0, 1, or 2 but it's: ", label);
+ }
+ dist[label] += 1;
+ node[LBANN_DATA_ID_STR(data_id) + "/" + name].set(label);
+
+ } else if (name == "density_sig1") {
+ int s = 0;
+ if (z_score) {
+ for (size_t j=offset; j<offset+m_datum_num_words[name]; j++) {
+ data[j] = (data[j] - mean[s]) / std_dev[s];
+ ++s;
+ if (s == 14) { s = 0; }
+ }
+ } else if (min_max) {
+ for (size_t j=offset; j<offset+m_datum_num_words[name]; j++) {
+ data[j] = (data[j] - min[s]) / max_min[s];
+ ++s;
+ if (s == 14) { s = 0; }
+ }
+ }
+ node[LBANN_DATA_ID_STR(data_id) + "/" + name].set(data + offset, m_datum_num_words[name]);
+
+ } else {
+ node[LBANN_DATA_ID_STR(data_id) + "/" + name].set(data + offset, m_datum_num_words[name]);
+ }
+ }
+ }
+ m_data_store->set_preloaded_conduit_node(data_id, node);
+
+ //user feedback
+ ++nn;
+ if (verbose && is_master() && nn % 1000 == 0) {
+ int np = m_comm->get_procs_per_trainer();
+ std::cout << "estimated number of samples loaded: " << utils::commify(nn/1000*np) << "K" << std::endl;
+ }
+ }
+ }
+
+ if (out) {
+ out.close();
+ }
+
+ //user feedback
+ if (is_master()) {
+ std::vector<int> r(3);
+ m_comm->trainer_reduce(dist.data(), 3, r.data());
+ std::cout << "\nLabel distribution:\n";
+ for (size_t h=0; h<3; h++) {
+ std::cout << " " << h << " " << r[h] << std::endl;
+ }
+ std::cout << "\nData Shapes:\n";
+ for (auto t : m_datum_shapes) {
+ std::cout << " " << t.first << " ";
+ for (auto t2 : t.second) {
+ std::cout << t2 << " ";
+ }
+ std::cout << std::endl;
+ }
+ std::cout << std::endl;
+ } else {
+ m_comm->trainer_reduce(dist.data(), 3, 0);
+ }
+}
+
+bool ras_lipid_conduit_data_reader::fetch_datum(Mat& X, int data_id, int mb_idx) {
+ const conduit::Node& node = m_data_store->get_conduit_node(data_id);
+ double scaling_factor = 1.0;
+ const double *data = node[LBANN_DATA_ID_STR(data_id) + "/density_sig1"].value();
+ size_t n = m_datum_num_words["density_sig1"];
+ for (size_t j = 0; j < n; ++j) {
+ X(j, mb_idx) = data[j] * scaling_factor;
+ }
+
+#if 0
+Notes from Adam:
+The keras model that I gave you only looks at the density_sig1 data as input data and it uses the states data as labels. We'll want to also extract bbs to merge that with density_sig1 in various ways as input data in future models that we're putting together.
+
+ The probs field can be useful as an alternate label if building a regression model instead of a classification model. I've also been using the probs field as a filter on the training data to only consider those input data whose state probability exceeds some threshold.
+ + So that works out to: + + bb, density_sig1 - datum + states - label + probs - used as a filter to include/exclude certain samples + +#endif + return true; +} + +std::map m2; + +bool ras_lipid_conduit_data_reader::fetch_label(Mat& Y, int data_id, int mb_idx) { + const conduit::Node node = m_data_store->get_conduit_node(data_id); + int label = node[LBANN_DATA_ID_STR(data_id) + "/states"].value(); + Y.Set(label, mb_idx, 1); + return true; +} + +bool ras_lipid_conduit_data_reader::fetch_response(Mat& Y, int data_id, int mb_idx) { + LBANN_ERROR("ras_lipid_conduit_data_reader: do not have responses"); + return true; +} + +void ras_lipid_conduit_data_reader::fill_in_metadata() { + std::map aa = cnpy::npz_load(m_filenames[0]); + for (const auto &t : aa) { + const std::string &name = t.first; + size_t word_size = t.second.word_size; + const std::vector &shape = t.second.shape; + size_t num_words = 1; + if (shape.size() == 1) { + m_datum_shapes[name].push_back(1); + } else { + for (size_t x=1; x>> &my_samples) { + std::unordered_set indices; + for (const auto &t : m_shuffled_indices) { + indices.insert(t); + } + int my_rank = m_comm->get_rank_in_trainer(); + int np = m_comm->get_procs_per_trainer(); + size_t data_id = 0; + for (size_t j=0; jclear_owner_map(); + int np = m_comm->get_procs_per_trainer(); + size_t data_id = 0; + for (size_t j=0; jadd_owner(data_id, file_owner); + ++data_id; + } + } +} + +void ras_lipid_conduit_data_reader::get_samples_per_file() { + int me = m_comm->get_rank_in_trainer(); + int np = m_comm->get_procs_per_trainer(); + std::vector work; + int x = 0; + for (size_t j=me; j a = cnpy::npz_load(m_filenames[j]); + size_t n = 0; + for (const auto &t2 : a) { + size_t n2 = t2.second.shape[0]; + if (n == 0) { + n = n2; + } else { + if (n2 != n) { + LBANN_ERROR("n2 != n; ", n2, n); + } + } + } + work.push_back(j); + work.push_back(n); + } + + std::vector num_files(np, 0); + for (size_t j=0; j work_2; + std::vector *work_ptr; + for (int j=0; jtrainer_broadcast(j, work_ptr->data(), work_ptr->size()); + for (size_t h=0; hsize(); h+= 2) { + m_samples_per_file[(*work_ptr)[h]] = (*work_ptr)[h+1]; + } + } +} + +void ras_lipid_conduit_data_reader::write_file_sizes() { + if (! 
is_master()) { + return; + } + std::string fn = options::get()->get_string("pilot2_save_file_sizes"); + std::ofstream out(fn.c_str()); + if (!out) { + LBANN_ERROR("failed to open ", fn, " for writing"); + } + for (size_t j=0; jget_string("pilot2_read_file_sizes"); + std::ifstream in(fn.c_str()); + if (!in) { + LBANN_ERROR("failed to open ", fn, " for reading"); + } + std::unordered_map mp; + std::string filename; + int num_samples; + while (in >> filename >> num_samples) { + mp[filename] = num_samples; + } + in.close(); + + m_samples_per_file.resize(m_filenames.size()); + for (size_t h=0; h #include #include @@ -52,8 +53,6 @@ namespace lbann { -std::string commify(size_t n); - data_store_conduit::data_store_conduit( generic_data_reader *reader) : m_reader(reader) { @@ -174,7 +173,6 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs) { m_is_local_cache = rhs.m_is_local_cache; m_node_sizes_vary = rhs.m_node_sizes_vary; m_have_sample_sizes = rhs.m_have_sample_sizes; - //m_reader = rhs.m_reader; m_comm = rhs.m_comm; m_world_master = rhs.m_world_master; m_trainer_master = rhs.m_trainer_master; @@ -217,7 +215,7 @@ void data_store_conduit::copy_members(const data_store_conduit& rhs) { } void data_store_conduit::setup(int mini_batch_size) { - PROFILE("starting setup()"); + PROFILE("starting setup(); m_owner.size(): ", m_owner.size()); m_owner_map_mb_size = mini_batch_size; m_is_setup = true; } @@ -317,6 +315,10 @@ void data_store_conduit::error_check_compacted_node(const conduit::Node &nd, int //n.b. Do not put any PROFILE or DEBUG statements in this method, // since the threading from the data_reader will cause you grief void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool already_have) { + +DEBUG("starting set_conduit_node; m_data.size: ", m_data.size(), " data_id: ", data_id, " already_have: ", already_have); + + std::lock_guard lock(m_mutex); // TODO: test whether having multiple mutexes below is better (faster) than // locking this entire call with a single mutex. 
For now I'm @@ -330,6 +332,7 @@ void data_store_conduit::set_conduit_node(int data_id, conduit::Node &node, bool { //std::lock_guard lock(m_mutex); if (already_have == false && m_data.find(data_id) != m_data.end()) { + DEBUG("m_data.size: ", m_data.size(), " ERROR: duplicate data_id: ", data_id); LBANN_ERROR("duplicate data_id: ", data_id, " in data_store_conduit::set_conduit_node; role: ", m_reader->get_role()); } } @@ -638,6 +641,7 @@ void data_store_conduit::build_preloaded_owner_map(const std::vector& per_r } m_owner[(*m_shuffled_indices)[i]] = owning_rank; } + m_owner_maps_were_exchanged = true; } const conduit::Node & data_store_conduit::get_random_node() const { @@ -660,6 +664,7 @@ const conduit::Node & data_store_conduit::get_random_node(const std::string &fie } conduit::Node & data_store_conduit::get_empty_node(int data_id) { + std::lock_guard lock(m_mutex); if (m_data.find(data_id) != m_data.end()) { LBANN_ERROR("we already have a node with data_id= ", data_id); } @@ -1023,8 +1028,8 @@ void data_store_conduit::allocate_shared_segment(map_is_t &sizes, std::vector= avail_mem) { @@ -1097,7 +1102,6 @@ void data_store_conduit::preload_local_cache() { void data_store_conduit::exchange_local_caches() { PROFILE("Starting exchange_local_caches"); - PROFILE(" At new epoch; m_cur_epoch: ", m_cur_epoch); PROFILE(" is_explicitly_loading(): ", is_explicitly_loading()); PROFILE(" is_preloading(): ", is_preloading()); PROFILE(" is_local_cache(): ", is_local_cache()); @@ -1191,7 +1195,7 @@ void data_store_conduit::build_conduit_nodes(map_is_t &sizes) { } void data_store_conduit::fillin_shared_images(char* images, size_t size, size_t offset) { - PROFILE(" fillin_shared_images; size: ", commify(size), " offset: ", commify(offset)); + PROFILE(" fillin_shared_images; size: ", utils::commify(size), " offset: ", utils::commify(offset)); memcpy(reinterpret_cast(m_mem_seg+offset), reinterpret_cast(images), size); } @@ -1238,7 +1242,7 @@ void data_store_conduit::exchange_images(std::vector &work, map_is_t &imag for (auto idx : indices[p]) { bytes += image_sizes[idx]; } - //PROFILE(" \nP_", p, " has ", commify(bytes), " bytes to bcast"); + //PROFILE(" \nP_", p, " has ", utils::commify(bytes), " bytes to bcast"); // Set up the rounds; due to MPI yuckiness, can bcast at most INT_MAX bytes // in a single broadcast @@ -1264,7 +1268,7 @@ void data_store_conduit::exchange_images(std::vector &work, map_is_t &imag int work_vector_offset = 0; for (size_t i=0; itrainer_broadcast(p, work.data()+work_vector_offset, sz); if (node_rank == 0) { @@ -1331,10 +1335,15 @@ void data_store_conduit::exchange_owner_maps() { } PROFILE("leaving data_store_conduit::exchange_owner_maps\n", "my owner map size: ", m_owner.size()); + m_owner_maps_were_exchanged = true; + set_loading_is_complete(); } void data_store_conduit::profile_timing() { - if (m_cur_epoch > 0) { + if (m_exchange_time == 0) { + return; + } + if (m_exchange_time > 0.) 
{ PROFILE( "\n", "Exchange Data Timing:\n", @@ -1391,10 +1400,13 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ return; } + if (m_reader->at_new_epoch() && is_local_cache() && is_explicitly_loading()) { + exchange_local_caches(); + return; + } + if (m_reader->at_new_epoch()) { - ++m_cur_epoch; - PROFILE("Starting exchange_mini_batch_data"); - PROFILE(" At new epoch; m_cur_epoch: ", m_cur_epoch); + PROFILE("\nExchange_mini_batch_data"); PROFILE(" is_explicitly_loading(): ", is_explicitly_loading()); PROFILE(" is_local_cache(): ", is_local_cache()); PROFILE(" is_fully_loaded: ", is_fully_loaded()); @@ -1403,17 +1415,15 @@ void data_store_conduit::exchange_mini_batch_data(size_t current_pos, size_t mb_ } } - if (m_reader->at_new_epoch() && is_local_cache() && is_explicitly_loading()) { - exchange_local_caches(); - return; - } - double tm1 = get_time(); // when not running in preload mode, exchange owner maps after the 1st epoch - if (m_reader->at_new_epoch() && ! is_preloading() && !is_local_cache() && m_cur_epoch == 1) { + if (m_reader->at_new_epoch() && ! is_preloading() && !is_local_cache()) { PROFILE("calling exchange_owner_maps"); - exchange_owner_maps(); + if (!m_owner_maps_were_exchanged) { + exchange_owner_maps(); + } + m_owner_maps_were_exchanged = true; /* * TODO if (m_spill) { @@ -1445,7 +1455,8 @@ void data_store_conduit::flush_profile_file() const { } size_t data_store_conduit::get_num_global_indices() const { - return m_comm->trainer_allreduce(m_my_num_indices); + size_t n = m_comm->trainer_allreduce(m_my_num_indices); + return n; } void data_store_conduit::test_checkpoint(const std::string &checkpoint_dir) { @@ -1464,7 +1475,6 @@ void data_store_conduit::test_checkpoint(const std::string &checkpoint_dir) { m_owner.clear(); m_sample_sizes.clear(); m_data.clear(); - m_cur_epoch = -1; m_is_setup = false; m_preloading = false; @@ -1576,7 +1586,7 @@ void data_store_conduit::save_state() { { cereal::XMLOutputArchive archive(os); archive(CEREAL_NVP(m_my_num_indices), - CEREAL_NVP(m_cur_epoch), + CEREAL_NVP(m_owner_maps_were_exchanged), CEREAL_NVP(m_is_setup), CEREAL_NVP(m_preloading), CEREAL_NVP(m_loading_is_complete), @@ -1616,7 +1626,7 @@ void data_store_conduit::load_checkpoint(std::string dir_name, generic_data_read } cereal::XMLInputArchive iarchive(in); iarchive(CEREAL_NVP(m_my_num_indices), - m_cur_epoch, m_is_setup, + m_owner_maps_were_exchanged, m_is_setup, m_preloading, m_loading_is_complete, m_explicitly_loading, m_owner_map_mb_size, m_compacted_sample_size, m_is_local_cache, @@ -1669,8 +1679,7 @@ void data_store_conduit::print_variables() { if (!m_world_master) { return; } - std::cerr << "m_cur_epoch: " << m_cur_epoch << std::endl - << "m_is_setup: " << m_is_setup << std::endl + std::cerr << "m_is_setup: " << m_is_setup << std::endl << "m_preloading: " << m_preloading << std::endl << "m_explicitly_loading: " << m_explicitly_loading << std::endl << "m_owner_map_mb_size: " << m_owner_map_mb_size << std::endl @@ -1804,12 +1813,12 @@ void data_store_conduit::test_imagenet_node(int index, bool dereference) { LBANN_ERROR("failed to find data_id ", data_id, " in the image_sizes map"); } size_t szz = m_sample_sizes[data_id]; - PROFILE("test_imagenet_node() for data_id: ", commify(data_id), " at offset: ", commify(m_image_offsets[data_id]), " image size: ", commify(szz)); + PROFILE("test_imagenet_node() for data_id: ", utils::commify(data_id), " at offset: ", utils::commify(m_image_offsets[data_id]), " image size: ", utils::commify(szz)); if 
(m_image_offsets[data_id] >= INT_MAX) { PROFILE(" WARNING: offset is >= INT_MAX!"); } - std::cerr << "testing sample_id: "<< commify(data_id)<< " stored at offset: "<< commify(m_image_offsets[data_id]); + std::cerr << "testing sample_id: "<< utils::commify(data_id)<< " stored at offset: "<< utils::commify(m_image_offsets[data_id]); if (m_image_offsets[data_id] >= INT_MAX) { std::cerr << "; (>= INT_MAX)\n"; } else { @@ -1891,25 +1900,6 @@ bool data_store_conduit::test_local_cache_imagenet(int n) { return true; } -std::string commify(size_t n) { - std::string s = std::to_string(n); - std::stringstream s2; - int c = 0; - for (int j = (int)s.size()-1; j>=0; j--) { - s2 << s[j]; - ++c; - if (c == 3) { - if (j > 0) { - s2 << ","; - c = 0; - } - } - } - std::string r = s2.str(); - std::reverse(r.begin(), r.end()); - return r; -} - void data_store_conduit::check_query_flags() const { if (m_explicitly_loading && m_preloading) { LBANN_ERROR("is_explicitly_loading() && is_preloading() are both true, but should not be"); @@ -1922,4 +1912,8 @@ void data_store_conduit::check_query_flags() const { } } +void data_store_conduit::clear_owner_map() { + m_owner_maps_were_exchanged = false; + m_owner.clear(); + } } // namespace lbann diff --git a/src/proto/proto_common.cpp b/src/proto/proto_common.cpp index 0738485fe69..bd38f58597f 100644 --- a/src/proto/proto_common.cpp +++ b/src/proto/proto_common.cpp @@ -179,6 +179,10 @@ void init_data_readers( set_transform_pipeline = false; } else if (name == "nci") { reader = new data_reader_nci(shuffle); + } else if (name == "ras_lipid") { + auto *ras_lipid = new ras_lipid_conduit_data_reader(shuffle); + ras_lipid->set_num_labels(readme.num_labels()); + reader = ras_lipid; } else if (name == "csv") { auto* reader_csv = new csv_reader(shuffle); reader_csv->set_label_col(readme.label_col()); @@ -457,6 +461,11 @@ void init_data_readers( reader_jag_conduit->set_role(role); leading_reader_jag_conduit[role] = reader_jag_conduit; } + } else if (name == "ras_lipid") { + auto *ras_lipid = new ras_lipid_conduit_data_reader(shuffle); + ras_lipid->set_num_labels(readme.num_labels()); + reader_validation = ras_lipid; + (*(ras_lipid_conduit_data_reader *)reader_validation) = (*(ras_lipid_conduit_data_reader *)reader); } else if (name == "nci") { reader_validation = new data_reader_nci(shuffle); (*(data_reader_nci *)reader_validation) = (*(data_reader_nci *)reader); diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt index c4d863c3ede..989282a290d 100644 --- a/src/utils/CMakeLists.txt +++ b/src/utils/CMakeLists.txt @@ -25,6 +25,7 @@ set_full_path(THIS_DIR_SOURCES system_info.cpp lbann_library.cpp jag_common.cpp + commify.cpp ) if (LBANN_HAS_CUDA) diff --git a/src/utils/commify.cpp b/src/utils/commify.cpp new file mode 100644 index 00000000000..40eab16dbf7 --- /dev/null +++ b/src/utils/commify.cpp @@ -0,0 +1,56 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/utils/commify.hpp" +#include +#include + +namespace lbann +{ +namespace utils +{ + +std::string commify(size_t n) { + std::string s = std::to_string(n); + std::stringstream s2; + int c = 0; + for (int j = (int)s.size()-1; j>=0; j--) { + s2 << s[j]; + ++c; + if (c == 3) { + if (j > 0) { + s2 << ","; + c = 0; + } + } + } + std::string r = s2.str(); + std::reverse(r.begin(), r.end()); + return r; +} + +}// namespace utils +}// namespace lbann From 21d63e356dc51ecb2a600042a4b732dbef783bde Mon Sep 17 00:00:00 2001 From: Brian Van Essen Date: Wed, 4 Dec 2019 09:02:05 -0800 Subject: [PATCH 412/634] Refactor add data type layer (#1261) * Adds a TensorDataType template parameter to the layer, weights, and optimizer classes as appropriate. This minimizes the dependency of the code on the global macro definition of type DataType. This is the first step in a sequence to enable LBANN to have per-layer data type support. * Refactoring the layer class hierarchy to be templated by a TensorDataType. Added a data_type_layer in-between the Layer class and all children. The data_type_layer holds the activations, error, and weight matrices. It is the first class in the hierarchy that is specialized by TensorDataType. * Moved all of the TensorDataType dependencies in the weights class into a data_type_weights subclass. This keeps weighs a non-templated class again and should make it possible for the model to maintain a list of weights without know which data type each one is. * Create a data_type_weights_initializer subclass * Moved all of the TensorDataType dependencies in the optimizers class into a data_type_optimizers subclass. * Minor cleanup and clang -g fix for all files on os x. * Builds with DataType equal to float or double. 
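In outline, the split this refactoring introduces looks as follows; this is a schematic sketch of the idea only, not LBANN's literal declarations (El::AbstractDistMatrix is Hydrogen's distributed matrix type, and the layer/member names here are illustrative):

    // Non-templated base: layer-graph wiring and bookkeeping only.
    class Layer { /* ... */ };

    // First templated level: owns the tensors of one concrete type.
    template <typename TensorDataType>
    class data_type_layer : public Layer {
    protected:
      El::AbstractDistMatrix<TensorDataType>* m_activations;
      El::AbstractDistMatrix<TensorDataType>* m_error_signals;
    };

    // Concrete layers specialize on the tensor type.
    template <typename TensorDataType>
    class relu_layer : public data_type_layer<TensorDataType> { /* ... */ };

    // weights and optimizer follow the same pattern:
    //   weights   -> data_type_weights<TensorDataType>
    //   optimizer -> data_type_optimizer<TensorDataType>
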
--- CMakeLists.txt | 6 +- include/lbann/callbacks/check_init.hpp | 10 +- include/lbann/callbacks/check_nan.hpp | 2 +- include/lbann/callbacks/check_small.hpp | 10 +- include/lbann/callbacks/confusion_matrix.hpp | 19 +- include/lbann/callbacks/debug.hpp | 2 - include/lbann/callbacks/debug_io.hpp | 2 +- include/lbann/callbacks/imcomm.hpp | 50 +- include/lbann/callbacks/learning_rate.hpp | 12 +- include/lbann/callbacks/perturb_adam.hpp | 16 +- include/lbann/callbacks/perturb_dropout.hpp | 4 +- include/lbann/callbacks/save_images.hpp | 4 +- .../io/data_buffers/partitioned_io_buffer.hpp | 32 +- include/lbann/layers/CMakeLists.txt | 1 + .../lbann/layers/activations/activations.hpp | 6 +- include/lbann/layers/activations/elu.hpp | 26 +- include/lbann/layers/activations/identity.hpp | 24 +- .../lbann/layers/activations/leaky_relu.hpp | 26 +- .../lbann/layers/activations/log_softmax.hpp | 52 +- include/lbann/layers/activations/softmax.hpp | 60 +- include/lbann/layers/data_type_layer.hpp | 304 +++++++++ .../lbann/layers/image/bilinear_resize.hpp | 24 +- .../layers/io/input/generic_input_layer.hpp | 89 ++- include/lbann/layers/io/input/input_layer.hpp | 54 +- include/lbann/layers/io/io_layer.hpp | 7 +- include/lbann/layers/layer.hpp | 109 +-- .../layers/learning/base_convolution.hpp | 336 ++++----- .../learning/channelwise_scale_bias.hpp | 68 +- include/lbann/layers/learning/convolution.hpp | 29 +- .../lbann/layers/learning/deconvolution.hpp | 28 +- include/lbann/layers/learning/embedding.hpp | 86 ++- .../layers/learning/entrywise_scale_bias.hpp | 91 ++- .../lbann/layers/learning/fully_connected.hpp | 130 ++-- include/lbann/layers/learning/learning.hpp | 8 +- .../layers/loss/categorical_accuracy.hpp | 30 +- include/lbann/layers/loss/cross_entropy.hpp | 81 ++- include/lbann/layers/loss/entrywise.hpp | 4 +- include/lbann/layers/loss/l1_norm.hpp | 67 +- include/lbann/layers/loss/l2_norm2.hpp | 67 +- .../lbann/layers/loss/mean_absolute_error.hpp | 87 ++- .../lbann/layers/loss/mean_squared_error.hpp | 87 ++- .../loss/top_k_categorical_accuracy.hpp | 32 +- include/lbann/layers/math/binary.hpp | 88 +-- include/lbann/layers/math/clamp.hpp | 28 +- include/lbann/layers/math/matmul.hpp | 39 +- include/lbann/layers/math/unary.hpp | 53 +- include/lbann/layers/misc/argmax.hpp | 20 +- include/lbann/layers/misc/argmin.hpp | 20 +- .../lbann/layers/misc/channelwise_mean.hpp | 19 +- include/lbann/layers/misc/covariance.hpp | 57 +- .../lbann/layers/misc/mini_batch_index.hpp | 28 +- include/lbann/layers/misc/mini_batch_size.hpp | 25 +- include/lbann/layers/misc/one_hot.hpp | 22 +- include/lbann/layers/misc/variance.hpp | 55 +- .../regularizers/batch_normalization.hpp | 139 ++-- include/lbann/layers/regularizers/dropout.hpp | 87 +-- .../entrywise_batch_normalization.hpp | 110 +-- .../lbann/layers/regularizers/layer_norm.hpp | 100 +-- .../local_response_normalization.hpp | 119 ++-- .../lbann/layers/regularizers/regularizer.hpp | 7 +- .../layers/regularizers/selu_dropout.hpp | 69 +- include/lbann/layers/transform/bernoulli.hpp | 25 +- .../layers/transform/categorical_random.hpp | 19 +- .../lbann/layers/transform/concatenation.hpp | 88 +-- include/lbann/layers/transform/constant.hpp | 27 +- include/lbann/layers/transform/crop.hpp | 84 ++- .../layers/transform/discrete_random.hpp | 21 +- include/lbann/layers/transform/dummy.hpp | 15 +- include/lbann/layers/transform/evaluation.hpp | 25 +- include/lbann/layers/transform/gaussian.hpp | 29 +- include/lbann/layers/transform/hadamard.hpp | 65 +- 
include/lbann/layers/transform/in_top_k.hpp | 21 +- include/lbann/layers/transform/pooling.hpp | 102 +-- include/lbann/layers/transform/reduction.hpp | 41 +- include/lbann/layers/transform/reshape.hpp | 32 +- include/lbann/layers/transform/slice.hpp | 82 ++- include/lbann/layers/transform/sort.hpp | 29 +- include/lbann/layers/transform/split.hpp | 37 +- .../lbann/layers/transform/stop_gradient.hpp | 20 +- include/lbann/layers/transform/sum.hpp | 53 +- include/lbann/layers/transform/tessellate.hpp | 87 ++- include/lbann/layers/transform/transform.hpp | 7 +- include/lbann/layers/transform/uniform.hpp | 29 +- include/lbann/layers/transform/unpooling.hpp | 67 +- .../lbann/layers/transform/weighted_sum.hpp | 59 +- include/lbann/layers/transform/weights.hpp | 126 ++-- include/lbann/metrics/layer_metric.hpp | 2 +- .../lbann/objective_functions/layer_term.hpp | 2 +- .../weight_regularization/l2.hpp | 20 +- include/lbann/optimizers/CMakeLists.txt | 1 + include/lbann/optimizers/adagrad.hpp | 36 +- include/lbann/optimizers/adam.hpp | 92 ++- .../lbann/optimizers/data_type_optimizer.hpp | 217 ++++++ .../lbann/optimizers/hypergradient_adam.hpp | 66 +- include/lbann/optimizers/optimizer.hpp | 144 +--- include/lbann/optimizers/rmsprop.hpp | 46 +- include/lbann/optimizers/sgd.hpp | 48 +- include/lbann/proto/factories.hpp | 2 +- include/lbann/utils/cuda.hpp | 32 +- include/lbann/utils/cudnn.hpp | 26 +- include/lbann/utils/entrywise_operator.hpp | 80 ++- include/lbann/utils/impl/cuda.hpp | 98 ++- include/lbann/utils/memory.hpp | 2 +- include/lbann/utils/numerical_traits.hpp | 39 ++ include/lbann/weights/CMakeLists.txt | 1 + include/lbann/weights/data_type_weights.hpp | 198 ++++++ include/lbann/weights/initializer.hpp | 107 ++- .../weights/variance_scaling_initializers.hpp | 37 +- include/lbann/weights/weights.hpp | 96 +-- src/callbacks/check_dataset.cpp | 6 +- src/callbacks/check_gradients.cpp | 28 +- src/callbacks/check_init.cpp | 51 +- src/callbacks/check_nan.cpp | 151 +++-- src/callbacks/check_small.cpp | 81 +-- src/callbacks/confusion_matrix.cpp | 66 +- src/callbacks/debug.cpp | 15 +- src/callbacks/debug_io.cpp | 8 +- src/callbacks/dump_error_signals.cpp | 4 +- src/callbacks/dump_gradients.cpp | 4 +- .../dump_minibatch_sample_indices.cpp | 2 +- src/callbacks/dump_outputs.cpp | 8 +- src/callbacks/dump_weights.cpp | 6 +- src/callbacks/imcomm.cpp | 72 +- src/callbacks/learning_rate.cpp | 52 +- src/callbacks/ltfb.cpp | 92 +-- src/callbacks/mixup.cpp | 11 +- src/callbacks/monitor_io.cpp | 4 +- src/callbacks/perturb_adam.cpp | 39 +- src/callbacks/perturb_dropout.cpp | 14 +- src/callbacks/print_statistics.cpp | 4 +- src/callbacks/replace_weights.cpp | 4 +- src/callbacks/save_images.cpp | 22 +- src/callbacks/summary.cpp | 16 +- src/callbacks/sync_layers.cpp | 2 +- src/callbacks/variable_minibatch.cpp | 14 +- src/data_readers/data_reader_jag_conduit.cpp | 10 +- src/io/data_buffers/partitioned_io_buffer.cpp | 99 ++- src/layers/CMakeLists.txt | 1 + src/layers/activations/activations.cpp | 105 ++- src/layers/activations/activations.cu | 101 ++- src/layers/activations/elu.cpp | 67 +- src/layers/activations/elu.cu | 79 +-- src/layers/activations/identity.cpp | 8 +- src/layers/activations/leaky_relu.cpp | 67 +- src/layers/activations/leaky_relu.cu | 79 +-- src/layers/activations/log_softmax.cpp | 67 +- src/layers/activations/log_softmax.cu | 142 ++-- src/layers/activations/softmax.cpp | 75 +- src/layers/activations/softmax.cu | 148 ++-- src/layers/data_type_layer.cpp | 638 ++++++++++++++++++ 
src/layers/image/bilinear_resize.cpp | 24 +- src/layers/image/bilinear_resize.cu | 22 +- src/layers/io/input/input_layer.cpp | 8 +- src/layers/layer.cpp | 572 +--------------- .../learning/channelwise_scale_bias.cpp | 38 +- src/layers/learning/channelwise_scale_bias.cu | 67 +- src/layers/learning/convolution.cpp | 8 +- src/layers/learning/deconvolution.cpp | 4 +- src/layers/learning/embedding.cpp | 51 +- src/layers/learning/embedding.cu | 67 +- src/layers/learning/entrywise_scale_bias.cpp | 78 +-- src/layers/learning/entrywise_scale_bias.cu | 120 ++-- src/layers/learning/fully_connected.cpp | 339 +++++----- src/layers/loss/categorical_accuracy.cpp | 50 +- src/layers/loss/categorical_accuracy.cu | 74 +- src/layers/loss/cross_entropy.cpp | 80 +-- src/layers/loss/cross_entropy.cu | 102 ++- src/layers/loss/entrywise.cpp | 164 +++-- src/layers/loss/entrywise.cu | 197 +++--- src/layers/loss/l1_norm.cpp | 58 +- src/layers/loss/l1_norm.cu | 72 +- src/layers/loss/l2_norm2.cpp | 56 +- src/layers/loss/l2_norm2.cu | 71 +- src/layers/loss/mean_absolute_error.cpp | 92 +-- src/layers/loss/mean_absolute_error.cu | 112 ++- src/layers/loss/mean_squared_error.cpp | 88 +-- src/layers/loss/mean_squared_error.cu | 108 ++- .../loss/top_k_categorical_accuracy.cpp | 64 +- src/layers/loss/top_k_categorical_accuracy.cu | 92 ++- src/layers/math/binary.cpp | 447 ++++++------ src/layers/math/binary.cu | 458 ++++++------- src/layers/math/clamp.cpp | 65 +- src/layers/math/clamp.cu | 85 +-- src/layers/math/matmul.cpp | 77 ++- src/layers/math/unary.cpp | 237 +++---- src/layers/math/unary.cu | 215 +++--- src/layers/misc/argmax.cpp | 15 +- src/layers/misc/argmin.cpp | 15 +- src/layers/misc/channelwise_mean.cpp | 26 +- src/layers/misc/channelwise_mean.cu | 45 +- src/layers/misc/covariance.cpp | 120 ++-- src/layers/misc/covariance.cu | 173 ++--- src/layers/misc/mini_batch_index.cpp | 8 +- src/layers/misc/mini_batch_size.cpp | 8 +- src/layers/misc/one_hot.cpp | 17 +- src/layers/misc/one_hot.cu | 26 +- src/layers/misc/variance.cpp | 99 ++- src/layers/misc/variance.cu | 132 ++-- .../regularizers/batch_normalization.cpp | 142 ++-- .../regularizers/batch_normalization.cu | 225 +++--- src/layers/regularizers/dropout.cpp | 8 +- .../entrywise_batch_normalization.cpp | 247 ++++--- .../entrywise_batch_normalization.cu | 315 +++++---- src/layers/regularizers/layer_norm.cpp | 95 ++- src/layers/regularizers/layer_norm.cu | 164 +++-- .../local_response_normalization.cpp | 4 +- src/layers/regularizers/selu_dropout.cpp | 8 +- src/layers/transform/bernoulli.cpp | 8 +- src/layers/transform/categorical_random.cpp | 2 +- src/layers/transform/concatenation.cpp | 8 +- src/layers/transform/constant.cpp | 8 +- src/layers/transform/crop.cpp | 14 +- src/layers/transform/crop.cu | 52 +- src/layers/transform/discrete_random.cpp | 2 +- src/layers/transform/dummy.cpp | 8 +- src/layers/transform/evaluation.cpp | 91 +-- src/layers/transform/gaussian.cpp | 8 +- src/layers/transform/hadamard.cpp | 8 +- src/layers/transform/in_top_k.cpp | 49 +- src/layers/transform/in_top_k.cu | 77 ++- src/layers/transform/pooling.cpp | 4 +- src/layers/transform/reduction.cpp | 4 +- src/layers/transform/reshape.cpp | 8 +- src/layers/transform/slice.cpp | 8 +- src/layers/transform/sort.cpp | 26 +- src/layers/transform/sort.cu | 38 +- src/layers/transform/split.cpp | 8 +- src/layers/transform/stop_gradient.cpp | 8 +- src/layers/transform/sum.cpp | 8 +- src/layers/transform/tessellate.cpp | 148 ++-- src/layers/transform/tessellate.cu | 110 ++- src/layers/transform/uniform.cpp | 8 
+- src/layers/transform/unpooling.cpp | 2 +- src/layers/transform/weighted_sum.cpp | 8 +- src/layers/transform/weights.cpp | 8 +- src/metrics/layer_metric.cpp | 6 +- src/models/model.cpp | 47 +- src/objective_functions/layer_term.cpp | 14 +- .../weight_regularization/l2.cpp | 26 +- .../weight_regularization/l2.cu | 14 +- src/optimizers/CMakeLists.txt | 1 + src/optimizers/adagrad.cpp | 63 +- src/optimizers/adagrad.cu | 20 +- src/optimizers/adam.cpp | 101 +-- src/optimizers/adam.cu | 57 +- src/optimizers/data_type_optimizer.cpp | 354 ++++++++++ src/optimizers/hypergradient_adam.cpp | 92 +-- src/optimizers/optimizer.cpp | 281 +------- src/optimizers/rmsprop.cpp | 91 +-- src/optimizers/rmsprop.cu | 24 +- src/optimizers/sgd.cpp | 82 ++- src/optimizers/sgd.cu | 46 +- src/proto/factories/layer_factory.cpp | 139 ++-- src/proto/factories/layer_graph_factory.cpp | 30 +- src/proto/factories/model_factory.cpp | 3 +- src/proto/factories/weights_factory.cpp | 3 +- src/utils/cudnn.cpp | 161 +++-- src/weights/CMakeLists.txt | 1 + src/weights/data_type_weights.cpp | 479 +++++++++++++ src/weights/initializer.cpp | 49 +- src/weights/variance_scaling_initializers.cpp | 44 +- src/weights/weights.cpp | 333 --------- 262 files changed, 9513 insertions(+), 8444 deletions(-) create mode 100644 include/lbann/layers/data_type_layer.hpp create mode 100644 include/lbann/optimizers/data_type_optimizer.hpp create mode 100644 include/lbann/utils/numerical_traits.hpp create mode 100644 include/lbann/weights/data_type_weights.hpp create mode 100644 src/layers/data_type_layer.cpp create mode 100644 src/optimizers/data_type_optimizer.cpp create mode 100644 src/weights/data_type_weights.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ceed5370be..8cf9d6a6f23 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -552,10 +552,8 @@ if (APPLE) set_source_files_properties(${_LBANN_SRCS} PROPERTIES COMPILE_OPTIONS "-g") - # Cleanup bad files - list(APPEND BAD_FILES - "${CMAKE_SOURCE_DIR}/src/layers/loss/cross_entropy.cpp") - foreach (bad_file IN LISTS BAD_FILES) + # Cleanup source files + foreach (bad_file IN LISTS _LBANN_SRCS) get_source_file_property( _SRC_COMPILE_OPTS "${bad_file}" COMPILE_OPTIONS) string(REPLACE "-g" "" _SRC_COMPILE_OPTS "${COMPILE_OPTIONS}") diff --git a/include/lbann/callbacks/check_init.hpp b/include/lbann/callbacks/check_init.hpp index 662412b5f58..0f6ffa5c7a5 100644 --- a/include/lbann/callbacks/check_init.hpp +++ b/include/lbann/callbacks/check_init.hpp @@ -39,24 +39,20 @@ namespace callback { */ class check_init : public callback_base { public: - check_init() : callback_base() {} + check_init() = default; check_init(const check_init&) = default; - check_init& operator=( - const check_init&) = default; + check_init& operator=(const check_init&) = default; check_init* copy() const override { return new check_init(*this); } /** Check initializations. */ void on_train_begin(model *m) override; std::string name() const override { return "check init"; } - private: - /** Return true if x == y. 
*/ - bool check_equal(const AbsMat& x, const AbsMat& y) const; }; // Builder function LBANN_ADD_DEFAULT_CALLBACK_BUILDER( - check_init, build_check_init_callback_from_pbuf) + check_init, build_check_init_callback_from_pbuf); } // namespace callback } // namespace lbann diff --git a/include/lbann/callbacks/check_nan.hpp b/include/lbann/callbacks/check_nan.hpp index bf6ebdffb8e..0894b25a12e 100644 --- a/include/lbann/callbacks/check_nan.hpp +++ b/include/lbann/callbacks/check_nan.hpp @@ -43,7 +43,7 @@ class check_nan : public callback_base { using callback_base::on_forward_prop_end; using callback_base::on_backward_prop_end; - check_nan() : callback_base() {} + check_nan() = default; check_nan(const check_nan&) = default; check_nan& operator=( const check_nan&) = default; diff --git a/include/lbann/callbacks/check_small.hpp b/include/lbann/callbacks/check_small.hpp index c0c49e8e9ed..c5419f58571 100644 --- a/include/lbann/callbacks/check_small.hpp +++ b/include/lbann/callbacks/check_small.hpp @@ -47,10 +47,9 @@ class check_small : public callback_base { using callback_base::on_forward_prop_end; using callback_base::on_backward_prop_end; - check_small() : callback_base() {} + check_small() = default; check_small(const check_small&) = default; - check_small& operator=( - const check_small&) = default; + check_small& operator=(const check_small&) = default; check_small* copy() const override { return new check_small(*this); } @@ -61,11 +60,6 @@ class check_small : public callback_base { /** Check that weights are good. */ void on_batch_end(model *m) override; std::string name() const override { return "check_small"; } - private: - /** Smallest allowable value. */ - static const DataType m_threshold; - /** Return true if there are no problems with m. */ - bool is_good(const AbsDistMat& m); }; // Builder function diff --git a/include/lbann/callbacks/confusion_matrix.hpp b/include/lbann/callbacks/confusion_matrix.hpp index 3bed8808a6f..187c9088487 100644 --- a/include/lbann/callbacks/confusion_matrix.hpp +++ b/include/lbann/callbacks/confusion_matrix.hpp @@ -39,11 +39,16 @@ namespace callback { * assumed to output one-hot vectors for each mini-batch sample. */ class confusion_matrix : public callback_base { +public: + using AbsDistMatType = El::AbstractDistMatrix; public: - confusion_matrix(std::string prediction_layer, - std::string label_layer, - std::string prefix); + confusion_matrix(std::string&& prediction_layer, + std::string&& label_layer, + std::string&& prefix); + confusion_matrix(std::string const& prediction_layer, + std::string const& label_layer, + std::string const& prefix); confusion_matrix(const confusion_matrix&); confusion_matrix& operator=(const confusion_matrix&); confusion_matrix* copy() const override { @@ -86,18 +91,18 @@ class confusion_matrix : public callback_base { * This is a CPU matrix. If the prediction layer keeps data on GPU, * then this will be a matrix copy rather than a matrix view. */ - std::unique_ptr m_predictions_v; + std::unique_ptr m_predictions_v; /** "View" into label matrix. * This is a CPU matrix. If the label layer keeps data on GPU or in * a different distribution than the prediction layer, then this * will be a matrix copy rather than a matrix view. */ - std::unique_ptr m_labels_v; + std::unique_ptr m_labels_v; /** Get prediction matrix. */ - const AbsDistMat& get_predictions(const model& m) const; + const AbsDistMatType& get_predictions(const model& m) const; /** Get label matrix. 
*/ - const AbsDistMat& get_labels(const model& m) const; + const AbsDistMatType& get_labels(const model& m) const; /** Reset confusion matrix counts. */ void reset_counts(const model& m); diff --git a/include/lbann/callbacks/debug.hpp b/include/lbann/callbacks/debug.hpp index d67857eeff7..354696e7dd2 100644 --- a/include/lbann/callbacks/debug.hpp +++ b/include/lbann/callbacks/debug.hpp @@ -72,8 +72,6 @@ class debug : public callback_base { using callback_base::on_backward_prop_end; using callback_base::on_evaluate_forward_prop_begin; using callback_base::on_evaluate_forward_prop_end; - using callback_base::on_optimize_begin; - using callback_base::on_optimize_end; /** @brief Print that a layer's forward prop is beginning. */ void on_forward_prop_begin(model *m, Layer *l) override; diff --git a/include/lbann/callbacks/debug_io.hpp b/include/lbann/callbacks/debug_io.hpp index 4ce036c1929..834f91e40bb 100644 --- a/include/lbann/callbacks/debug_io.hpp +++ b/include/lbann/callbacks/debug_io.hpp @@ -75,7 +75,7 @@ class debug_io : public callback_base { void on_test_begin(model *m) override; /** Common format for printing I/O stats at the start of a mini-batch */ - void print_fp_start(model *m, generic_input_layer *input); + void print_fp_start(model *m, generic_input_layer *input); /** Common format for printing I/O stats at the start of a phase */ void print_phase_start(model *m, execution_mode mode); diff --git a/include/lbann/callbacks/imcomm.hpp b/include/lbann/callbacks/imcomm.hpp index 66b4e68dcc3..f7703ade0d9 100644 --- a/include/lbann/callbacks/imcomm.hpp +++ b/include/lbann/callbacks/imcomm.hpp @@ -35,11 +35,15 @@ #include "lbann/callbacks/callback.hpp" namespace lbann { + +template +class data_type_weights; + namespace callback { /** - * Support inter-model communication after each mini-batch to synchronize - * gradient updates. + * @brief Support inter-model communication after each mini-batch to + * synchronize gradient updates. */ class imcomm : public callback_base { public: @@ -51,7 +55,7 @@ class imcomm : public callback_base { }; /** - * Initialize with ct being used for all weights. + * @brief Initialize with ct being used for all weights. */ imcomm(comm_type ct = NORMAL, const std::shared_ptr& summarizer = nullptr); @@ -61,41 +65,51 @@ class imcomm : public callback_base { return new imcomm(*this); } /** - * Convenience initialization to do one update type for specific weights. - * Implies no inter-model updates for other weights. + * @brief Convenience initialization to do one update type for specific weights. + * + * @details Implies no inter-model updates for other weights. */ imcomm(comm_type ct, std::unordered_set weights_list, const std::shared_ptr& summarizer = nullptr); - /** Choose comm type ct for weights. */ + /** @brief Choose comm type ct for weights. */ void set_weights_comm(weights *w, comm_type ct); - /** Do initialization for this model. */ + /** @brief Do initialization for this model. */ void setup(model *m) override; - /** Make sure all models have the same weights. */ + + /** @brief Make sure all models have the same weights. */ void on_train_begin(model *m) override; - /** Do inter-model gradient updates. */ + + /** @brief Do inter-model gradient updates. */ void on_backward_prop_end(model *m) override; std::string name() const override { return "imcomm"; } private: + /** @brief Summarize relevant statistics. */ + template + void do_summary(model const& m, data_type_weights& w, EvalType im_time); - /** Default communication type. 
*/
-  comm_type m_default_ct;
+ private:
+  /** @brief Parameters for a given set of weights. */
+  struct imcomm_params {
+    /** @brief Type of communication done. */
+    comm_type ct = NONE;
+  };

-  /** Per-weights parameters. */
-  std::unordered_map m_weights_params;
+  /** @brief Default communication type. */
+  comm_type m_default_ct;

-  /** Summarize relevant statistics. */
-  void do_summary(model *m, weights *w, EvalType im_time);
+  /** @brief Per-weights parameters. */
+  std::unordered_map m_weights_params;

-  /** @brief lbann_summary */
+  /** @brief lbann_summary */
   std::shared_ptr m_summarizer = nullptr;
 };

-/** returns a string representation of the weight_initialization. */
-std::string get_comm_type_name(imcomm::comm_type m);
+/** @brief Returns a string representation of the comm_type */
+std::string get_comm_type_name(typename imcomm::comm_type m);

 // Builder function
 std::unique_ptr
diff --git a/include/lbann/callbacks/learning_rate.hpp b/include/lbann/callbacks/learning_rate.hpp
index c7a4d763959..8973fe34b4e 100644
--- a/include/lbann/callbacks/learning_rate.hpp
+++ b/include/lbann/callbacks/learning_rate.hpp
@@ -75,9 +75,7 @@ class learning_rate : public callback_base {
    * learning rate for optimizer opt. The current global learning rate is *not*
    * updated automatically based on this method.
    */
-  virtual float optimizer_schedule(model *m, optimizer &opt) {
-    return opt.get_learning_rate();
-  }
+  virtual float optimizer_schedule(model *m, optimizer &opt);

   const std::unordered_set& get_weights() const noexcept {
     return m_weights;
@@ -104,7 +102,7 @@
   std::vector m_weights_names;

   /** Weights to update. */
-  std::unordered_set m_weights;
+  std::unordered_set m_weights;
 };

 /**
@@ -184,8 +182,7 @@ build_adaptive_learning_rate_callback_from_pbuf(
 /**
  * Decrease learning rate by a fixed amount at fixed times.
  */
-class drop_fixed_learning_rate :
-  public learning_rate {
+class drop_fixed_learning_rate : public learning_rate {
 public:
   /**
    * Decrease the learning rate by amt when each epoch in drop_epochs is
@@ -228,8 +225,7 @@ build_drop_fixed_learning_rate_callback_from_pbuf(
  * learning rate. This also *forces* its schedule and will stomp over
  * other changes.
  */
-class linear_growth_learning_rate :
-  public learning_rate {
+class linear_growth_learning_rate : public learning_rate {
 public:
   /**
    * Linearly increase the learning rate to reach target after num_epochs.
diff --git a/include/lbann/callbacks/perturb_adam.hpp b/include/lbann/callbacks/perturb_adam.hpp
index 7538cd526ad..3101018c6a7 100644
--- a/include/lbann/callbacks/perturb_adam.hpp
+++ b/include/lbann/callbacks/perturb_adam.hpp
@@ -69,13 +69,13 @@ class perturb_adam : public callback_base {
    * perturbed.
    */
   perturb_adam(DataType learning_rate_factor,
-                              DataType beta1_factor,
-                              DataType beta2_factor,
-                              DataType eps_factor = 0,
-                              bool perturb_during_training = false,
-                              El::Int batch_interval = 1,
-                              std::set weights_names
-                              = std::set());
+               DataType beta1_factor,
+               DataType beta2_factor,
+               DataType eps_factor = 0,
+               bool perturb_during_training = false,
+               El::Int batch_interval = 1,
+               std::set weights_names
+               = std::set());

   perturb_adam* copy() const override { return new perturb_adam(*this); }
   std::string name() const override { return "perturb Adam"; }
@@ -120,7 +120,7 @@ class perturb_adam : public callback_base {

   /** Perturb Adam optimizers in model. */
   void perturb(model& m) const;
   /** Perturb Adam optimizer hyperparameters.
*/ - void perturb(lbann_comm& comm, adam& m) const; + void perturb(lbann_comm& comm, adam& m) const; }; diff --git a/include/lbann/callbacks/perturb_dropout.hpp b/include/lbann/callbacks/perturb_dropout.hpp index 1fec9084ed8..c55722ef618 100644 --- a/include/lbann/callbacks/perturb_dropout.hpp +++ b/include/lbann/callbacks/perturb_dropout.hpp @@ -69,8 +69,8 @@ class perturb_dropout : public callback_base { */ std::set m_layer_names; - template - dropout* get_dropout_layer(Layer* l); + template + dropout* get_dropout_layer(Layer* l); /** Perturb dropout keep prob in model. */ void perturb(model& m); diff --git a/include/lbann/callbacks/save_images.hpp b/include/lbann/callbacks/save_images.hpp index 3ce9efc71f9..cf37f33e33d 100644 --- a/include/lbann/callbacks/save_images.hpp +++ b/include/lbann/callbacks/save_images.hpp @@ -47,8 +47,8 @@ class save_images : public callback_base { * @param image_prefix Prefix for image file names. */ save_images(std::vector layer_names, - std::string image_format = "jpg", - std::string image_prefix = ""); + std::string image_format = "jpg", + std::string image_prefix = ""); save_images(const save_images&) = default; save_images& operator=( const save_images&) = default; diff --git a/include/lbann/io/data_buffers/partitioned_io_buffer.hpp b/include/lbann/io/data_buffers/partitioned_io_buffer.hpp index 13a4a23f8b2..a9f49a2da55 100644 --- a/include/lbann/io/data_buffers/partitioned_io_buffer.hpp +++ b/include/lbann/io/data_buffers/partitioned_io_buffer.hpp @@ -31,12 +31,20 @@ namespace lbann { +template class data_buffer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + public: /** Number of samples in the current mini-batch */ int m_num_samples_fetched; /** Distributed matrix used to stage local data to layer output */ - std::vector> m_input_buffers; + std::vector> m_input_buffers; std::atomic m_fetch_data_in_background; std::future m_data_fetch_future; /// 1-D Matrix of which indices were fetched in this mini-batch @@ -78,9 +86,19 @@ class data_buffer { /** * Parallel I/O routines for managing partitioned minibatches */ +template class partitioned_io_buffer : public generic_io_buffer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. 
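+   *
+   *  e.g., for a buffer instantiated for float, each staging matrix is
+   *  an El::AbstractDistMatrix<float>.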
*/ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: - typedef std::map data_buffer_map_t; + typedef std::map *> data_buffer_map_t; public: partitioned_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers, int num_child_layers); partitioned_io_buffer(const partitioned_io_buffer& other); @@ -94,8 +112,8 @@ class partitioned_io_buffer : public generic_io_buffer { void setup_data(El::Int num_neurons, El::Int num_targets, El::Int max_mini_batch_size) override; int fetch_to_local_matrix(generic_data_reader *data_reader, execution_mode mode) override; - void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMat& sample, AbsDistMat& response) override; - void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMat& sample) override; + void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample, AbsDistMatrixType& response) override; + void distribute_from_local_matrix(generic_data_reader *data_reader, execution_mode mode, AbsDistMatrixType& sample) override; bool update_data_set(generic_data_reader *data_reader, execution_mode mode) override; void set_fetch_data_in_background(bool flag, execution_mode mode) override; bool is_data_fetched_in_background(execution_mode mode) override; @@ -109,9 +127,9 @@ class partitioned_io_buffer : public generic_io_buffer { int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers) const override; static int compute_max_num_parallel_readers(long data_set_size, int mini_batch_size, int requested_num_parallel_readers, const lbann_comm* comm); - data_buffer *get_data_buffer(const execution_mode mode) const { - data_buffer *data_buffer = nullptr; - data_buffer_map_t::const_iterator it = m_data_buffers.find(mode); + data_buffer *get_data_buffer(const execution_mode mode) const { + data_buffer *data_buffer = nullptr; + typename data_buffer_map_t::const_iterator it = m_data_buffers.find(mode); if (it != m_data_buffers.end()) data_buffer = it->second; switch(mode) { diff --git a/include/lbann/layers/CMakeLists.txt b/include/lbann/layers/CMakeLists.txt index 0cc71271bcb..766871b88f8 100644 --- a/include/lbann/layers/CMakeLists.txt +++ b/include/lbann/layers/CMakeLists.txt @@ -1,6 +1,7 @@ # Add the headers for this directory set_full_path(THIS_DIR_HEADERS layer.hpp + data_type_layer.hpp ) # Add the subdirectories diff --git a/include/lbann/layers/activations/activations.hpp b/include/lbann/layers/activations/activations.hpp index 35895f3dcc4..bd0e3d4edeb 100644 --- a/include/lbann/layers/activations/activations.hpp +++ b/include/lbann/layers/activations/activations.hpp @@ -34,9 +34,9 @@ namespace lbann { // Convenience macros for ETI decls for unary layers #ifndef LBANN_ACTIVATIONS_LAYER_INSTANTIATE -#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ - extern template class LAYER_NAME; \ - extern template class LAYER_NAME +#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME #else #define UNARY_ETI_DECL_MACRO_DEV(...) 
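
// Illustrative sketch (the layer name is chosen for the example): in an
// ordinary translation unit, a use such as
//   UNARY_ETI_DECL_MACRO_DEV(relu_layer, El::Device::CPU);
// expands to extern template declarations for the data-parallel and
// model-parallel instantiations on that device, so including this header
// does not re-instantiate the class template. The one source file that
// defines LBANN_ACTIVATIONS_LAYER_INSTANTIATE instead sees the empty
// variadic form above and provides the explicit instantiations itself.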
#endif // LBANN_UNARY_LAYER_INSTANTIATE diff --git a/include/lbann/layers/activations/elu.hpp b/include/lbann/layers/activations/elu.hpp index 9d9ce7a9cb4..c978c752107 100644 --- a/include/lbann/layers/activations/elu.hpp +++ b/include/lbann/layers/activations/elu.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_ELU_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_ELU_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -46,42 +46,42 @@ namespace lbann { * and accurate deep network learning by exponential linear units * (ELUs)." arXiv preprint arXiv:1511.07289 (2015). */ -template -class elu_layer : public Layer { +template +class elu_layer : public data_type_layer { public: - elu_layer(lbann_comm *comm, DataType alpha = 1) - : Layer(comm), m_alpha(alpha) {} + elu_layer(lbann_comm *comm, TensorDataType alpha = 1) + : data_type_layer(comm), m_alpha(alpha) {} elu_layer* copy() const override { return new elu_layer(*this); } std::string get_type() const override { return "ELU"; } data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("alpha", m_alpha); return desc; } protected: void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + data_type_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; void bp_compute() override; private: /** Scale parameter for negative region. */ - DataType m_alpha; + TensorDataType m_alpha; }; #ifndef LBANN_ELU_LAYER_INSTANTIATE -extern template class elu_layer; -extern template class elu_layer; +extern template class elu_layer; +extern template class elu_layer; #ifdef LBANN_HAS_GPU -extern template class elu_layer; -extern template class elu_layer; +extern template class elu_layer; +extern template class elu_layer; #endif // LBANN_HAS_GPU #endif // LBANN_ELU_LAYER_INSTANTIATE diff --git a/include/lbann/layers/activations/identity.hpp b/include/lbann/layers/activations/identity.hpp index b7eeba766b7..025df35bef4 100644 --- a/include/lbann/layers/activations/identity.hpp +++ b/include/lbann/layers/activations/identity.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_IDENTITY_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_IDENTITY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -36,24 +36,24 @@ namespace lbann { * Forward and backward prop simply involve setting up tensor views, * and hence are very cheap. 
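 *  (In particular, fp_setup_outputs below simply takes an El::LockedView
 *  of the previous activations, so no tensor data is copied.)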
*/ -template -class identity_layer : public Layer { +template +class identity_layer : public data_type_layer { public: - identity_layer(lbann_comm *comm) : Layer(comm) {} + identity_layer(lbann_comm *comm) : data_type_layer(comm) {} identity_layer* copy() const override { return new identity_layer(*this); } std::string get_type() const override { return "identity"; } data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } protected: void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + data_type_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void fp_setup_outputs(El::Int mini_batch_size) override { - El::LockedView(get_activations(), get_prev_activations()); + El::LockedView(this->get_activations(), this->get_prev_activations()); } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - El::LockedView(get_error_signals(), get_prev_error_signals()); + El::LockedView(this->get_error_signals(), this->get_prev_error_signals()); } void fp_compute() override {} void bp_compute() override {} @@ -61,14 +61,14 @@ class identity_layer : public Layer { #ifndef LBANN_IDENTITY_LAYER_INSTANTIATE extern template class identity_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class identity_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class identity_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class identity_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_IDENTITY_LAYER_INSTANTIATE diff --git a/include/lbann/layers/activations/leaky_relu.hpp b/include/lbann/layers/activations/leaky_relu.hpp index ad55718a7fa..d4a80ad8b2b 100644 --- a/include/lbann/layers/activations/leaky_relu.hpp +++ b/include/lbann/layers/activations/leaky_relu.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_LEAKY_RELU_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_LEAKY_RELU_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -46,46 +46,46 @@ namespace lbann { * nonlinearities improve neural network acoustic models." In * Proc. ICML, vol. 30, no. 1, p. 3. 2013. 
*/ -template -class leaky_relu_layer : public Layer { +template +class leaky_relu_layer : public data_type_layer { public: - leaky_relu_layer(lbann_comm *comm, DataType negative_slope = 0.01) - : Layer(comm), m_negative_slope(negative_slope) {} + leaky_relu_layer(lbann_comm *comm, TensorDataType negative_slope = 0.01) + : data_type_layer(comm), m_negative_slope(negative_slope) {} leaky_relu_layer* copy() const override { return new leaky_relu_layer(*this); } std::string get_type() const override { return "leaky ReLU"; } data_layout get_data_layout() const override { return Layout; } El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("Negative slope", m_negative_slope); return desc; } protected: void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + data_type_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; void bp_compute() override; private: /** Function slope in negative region. */ - DataType m_negative_slope; + TensorDataType m_negative_slope; }; #ifndef LBANN_LEAKY_RELU_LAYER_INSTANTIATE extern template class leaky_relu_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class leaky_relu_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class leaky_relu_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class leaky_relu_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_LEAKY_RELU_LAYER_INSTANTIATE diff --git a/include/lbann/layers/activations/log_softmax.hpp b/include/lbann/layers/activations/log_softmax.hpp index fa4e1c48582..6100576afcb 100644 --- a/include/lbann/layers/activations/log_softmax.hpp +++ b/include/lbann/layers/activations/log_softmax.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_LOG_SOFTMAX_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_LOG_SOFTMAX_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/utils/cudnn.hpp" namespace lbann { @@ -36,19 +36,28 @@ namespace lbann { * * @f[ \log \text{softmax}(x)_i = x_i - \log \sum_j e^{x_j} @f] */ -template -class log_softmax_layer : public Layer { +template +class log_softmax_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: log_softmax_layer(lbann_comm *comm) - : Layer(comm) + : data_type_layer(comm) #ifdef LBANN_HAS_CUDNN , m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN {} log_softmax_layer(const log_softmax_layer& other) - : Layer(other), + : data_type_layer(other), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) #ifdef LBANN_HAS_CUDNN @@ -61,7 +70,7 @@ class log_softmax_layer : public Layer { } log_softmax_layer& operator=(const log_softmax_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? 
other.m_workspace->Copy() : nullptr); #ifdef LBANN_HAS_CUDNN @@ -79,15 +88,15 @@ class log_softmax_layer : public Layer { El::Device get_device_allocation() const override { return Device; } void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + data_type_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; - m_workspace.reset(AbsDistMat::Instantiate(dist)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist)); #ifdef HYDROGEN_HAVE_CUB if (m_workspace->GetLocalDevice() == El::Device::GPU) { m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool @@ -96,8 +105,8 @@ class log_softmax_layer : public Layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - Layer::fp_setup_outputs(mini_batch_size); - const auto& dist_data = get_prev_activations().DistData(); + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& dist_data = this->get_prev_activations().DistData(); m_workspace->Empty(false); m_workspace->AlignWith(dist_data); m_workspace->Resize(1, mini_batch_size); @@ -106,28 +115,33 @@ class log_softmax_layer : public Layer { void fp_compute() override; void bp_compute() override; + template + friend void fp_compute_impl(log_softmax_layer& l); + template + friend void bp_compute_impl(log_softmax_layer& l); + private: /** Workspace for column-wise reductions. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; #ifdef LBANN_HAS_CUDNN /** Tensor cuDNN descriptors. */ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN }; #ifndef LBANN_LOG_SOFTMAX_LAYER_INSTANTIATE extern template class log_softmax_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class log_softmax_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class log_softmax_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class log_softmax_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_LOG_SOFTMAX_LAYER_INSTANTIATE diff --git a/include/lbann/layers/activations/softmax.hpp b/include/lbann/layers/activations/softmax.hpp index e86969181a1..071b295df1f 100644 --- a/include/lbann/layers/activations/softmax.hpp +++ b/include/lbann/layers/activations/softmax.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_ACTIVATIONS_SOFTMAX_HPP_INCLUDED #define LBANN_LAYERS_ACTIVATIONS_SOFTMAX_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/utils/cudnn.hpp" // Threshold outputs to a minimum value. @@ -44,19 +44,28 @@ namespace lbann { /** * @f[ \text{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}} @f] */ -template -class softmax_layer : public Layer { +template +class softmax_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. 
*/ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: softmax_layer(lbann_comm *comm) - : Layer(comm) + : data_type_layer(comm) #ifdef LBANN_HAS_CUDNN , m_tensors_cudnn_desc(this) #endif // LBANN_HAS_CUDNN {} softmax_layer(const softmax_layer& other) - : Layer(other), + : data_type_layer(other), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) #ifdef LBANN_HAS_CUDNN @@ -69,7 +78,7 @@ class softmax_layer : public Layer { } softmax_layer& operator=(const softmax_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); #ifdef LBANN_HAS_CUDNN @@ -87,15 +96,15 @@ class softmax_layer : public Layer { El::Device get_device_allocation() const override { return Device; } void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + data_type_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; - m_workspace.reset(AbsDistMat::Instantiate(dist)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist)); #ifdef HYDROGEN_HAVE_CUB if (m_workspace->GetLocalDevice() == El::Device::GPU) { m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool @@ -104,8 +113,8 @@ class softmax_layer : public Layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - Layer::fp_setup_outputs(mini_batch_size); - const auto& dist_data = get_prev_activations().DistData(); + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& dist_data = this->get_prev_activations().DistData(); m_workspace->Empty(false); m_workspace->AlignWith(dist_data); m_workspace->Resize(1, mini_batch_size); @@ -114,28 +123,41 @@ class softmax_layer : public Layer { void fp_compute() override; void bp_compute() override; + template + friend void fp_compute_impl(softmax_layer& l); + template + friend void bp_compute_impl(softmax_layer& l); + private: /** Workspace for column-wise reductions. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; #ifdef LBANN_HAS_CUDNN /** Tensor cuDNN descriptors. 
*/ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN +// Minimum output value to avoid denormalized floats +#ifdef LBANN_ENABLE_SOFTMAX_THRESHOLD + const TensorDataType threshold_val = std::sqrt(std::numeric_limits::min()); +#else + const TensorDataType threshold_val = 0; +#endif // LBANN_ENABLE_SOFTMAX_THRESHOLD + + }; #ifndef LBANN_SOFTMAX_LAYER_INSTANTIATE extern template class softmax_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class softmax_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class softmax_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class softmax_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_SOFTMAX_LAYER_INSTANTIATE diff --git a/include/lbann/layers/data_type_layer.hpp b/include/lbann/layers/data_type_layer.hpp new file mode 100644 index 00000000000..f4cdcbd9cdd --- /dev/null +++ b/include/lbann/layers/data_type_layer.hpp @@ -0,0 +1,304 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_LAYERS_DATA_TYPE_LAYER_HPP_INCLUDED +#define LBANN_LAYERS_DATA_TYPE_LAYER_HPP_INCLUDED + +#include "lbann/layers/layer.hpp" +#include "lbann/weights/data_type_weights.hpp" + +namespace lbann { + +// Forward declarations +//template +//class data_type_weights; + +using supported_layer_data_type = El::TypeList; + +template struct IsElement; + +template +struct IsElement> : std::true_type {}; + +template +struct IsElement> : IsElement> {}; + +template +struct IsElement> : std::false_type {}; + +template using is_supported_layer_data_type = IsElement; + +template +class data_type_layer : public Layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The local tensor type expected in this object. */ + using AbsMatrixType = El::AbstractMatrix; + + /** @brief The concrete weights type used by this object. 
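+   *
+   *  e.g., a data_type_layer instantiated for float holds pointers to
+   *  data_type_weights instantiated for float, so a layer and its
+   *  weights always share one compute type.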
*/ + using WeightsType = data_type_weights; + + ///@} + +public: + static_assert(is_supported_layer_data_type::value, + "Must use a supported type."); + + data_type_layer(lbann_comm *comm) : Layer(comm) {} + data_type_layer(const data_type_layer& other); + data_type_layer& operator=(const data_type_layer& other); + virtual ~data_type_layer() = default; + + /** Forward propagation step. + * Apply a mathematical operation to input tensors to obtain output + * tensors. + */ + void forward_prop() override; + /** Backward propagation step. + * Given the objective function gradients w.r.t. the output + * tensors, compute the gradients w.r.t. the input tensors and + * w.r.t. the weights. This is essentially an application of the + * chain rule. + */ + void back_prop() override; + + void summarize_matrices(lbann_summary& summarizer, int step) override; + + /** Check that the setup is reasonable. */ + void check_setup() override; + + // =========================================================== + // Weights access functions + // =========================================================== + + /** @brief Set list of pointers to weights. */ + void set_weights(std::vector& w) override { + m_weights.resize(w.size()); + std::transform(begin(w), end(w), begin(m_weights), + [](weights* wptr) { + return (wptr + ? &(dynamic_cast(*wptr)) + : nullptr); + }); + } + + /** @brief Replace weights with another Layer's weights*/ + void replace_weights(Layer* other_layer) override; + + // =========================================================== + // Tensor access functions + // =========================================================== + + /** Get activation tensor. */ + AbsDistMatrixType& get_activations(int child_index = 0); + /** Get error signal tensor. */ + AbsDistMatrixType& get_error_signals(int parent_index = 0); + /** Get previous activation tensor. */ + const AbsDistMatrixType& get_prev_activations(int parent_index = 0) const; + /** Get activation tensor. */ + const AbsDistMatrixType& get_activations(int child_index = 0) const; + /** Get previous error signal tensor. */ + const AbsDistMatrixType& get_prev_error_signals(int child_index = 0) const; + /** Get error signal tensor. */ + const AbsDistMatrixType& get_error_signals(int parent_index = 0) const; + /** Get local portion of activation tensor. */ + AbsMatrixType& get_local_activations(int child_index = 0); + /** Get local portion of error signal tensor. */ + AbsMatrixType& get_local_error_signals(int parent_index = 0); + /** Get local portion of previous activation tensor. */ + const AbsMatrixType& get_local_prev_activations(int parent_index = 0) const; + /** Get local portion of activation tensor. */ + const AbsMatrixType& get_local_activations(int child_index = 0) const; + /** Get local portion of previous error signal tensor. */ + const AbsMatrixType& get_local_prev_error_signals(int child_index = 0) const; + /** Get local portion of error signal tensor. */ + const AbsMatrixType& get_local_error_signals(int parent_index = 0) const; + +protected: + + // =========================================================== + // Setup helper functions + // =========================================================== + + /** Setup distributed matrices. + * Called by the 'setup' function. Each column of these distributed + * matrices is interpreted as the flattened tensor for a mini-batch + * sample. The matrices themselves are constructed by calling the + * 'construct_matrix' function. 
If any matrices have already been
+   * set up, they are destroyed and reinstantiated.
+   */
+  void setup_matrices(const El::Grid& grid) override;
+  /** Construct distributed matrix.
+   *  Called by the 'setup_matrices' function. 'type' is one of the
+   *  following: "input", "output", "gradient_wrt_output",
+   *  "gradient_wrt_input".
+   */
+  virtual std::unique_ptr construct_matrix(const El::Grid& grid,
+                                                             std::string type,
+                                                             El::Int index);
+  /** Setup layer data.
+   *  Called by the 'setup' function. Memory is allocated for
+   *  distributed matrices.
+   */
+  void setup_data() override;
+
+  // ===========================================================
+  // Forward prop step helper functions
+  // ===========================================================
+
+  /** Setup input tensors.
+   *  Called by the 'forward_prop' function. Each input tensor is
+   *  set up as a view or copy of the corresponding parent layer's
+   *  output tensor.
+   */
+  void fp_setup_inputs(El::Int mini_batch_size) override;
+  /** Setup output tensors.
+   *  Called by the 'forward_prop' function. Each output tensor is
+   *  resized to match the mini-batch size.
+   */
+  void fp_setup_outputs(El::Int mini_batch_size) override;
+
+  // ===========================================================
+  // Back prop step helper functions
+  // ===========================================================
+
+  /** Setup gradient w.r.t. output tensors.
+   *  Called by the 'back_prop' function. Each gradient w.r.t. output
+   *  tensor is set up as a view or copy of the corresponding child
+   *  layer's gradient w.r.t. input tensor.
+   */
+  void bp_setup_gradient_wrt_outputs(El::Int mini_batch_size) override;
+  /** Setup gradient w.r.t. input tensors.
+   *  Called by the 'back_prop' function. Each gradient w.r.t. input
+   *  tensor is resized to match the mini-batch size.
+   */
+  void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override;
+  /** Compute objective function gradients.
+   *  Called by the 'back_prop' function. Given the input, output, and
+   *  gradient w.r.t. output tensors, the gradients w.r.t. the input
+   *  tensors are populated with the computed values and the gradients
+   *  w.r.t. the weights are sent to the appropriate optimizers.
+   */
+  void bp_compute() override;
+
+  // ===========================================================
+  // Protected Weights access functions
+  // ===========================================================
+
+  /** Get references to weights. */
+  std::vector& get_data_type_weights() { return m_weights; }
+  /** Get references to weights. (const) */
+  const std::vector& get_data_type_weights() const {
+    return m_weights;
+  }
+
+  /** @brief Get a specific weights object */
+  WeightsType& get_data_type_weights(size_t idx) {
+    return *(m_weights.at(idx));
+  }
+  WeightsType const& get_data_type_weights(size_t idx) const {
+    return *(m_weights.at(idx));
+  }
+
+  bool has_data_type_weights(size_t idx) const noexcept {
+    return (idx < m_weights.size() && m_weights[idx] != nullptr);
+  }
+
+  void set_num_data_type_weights(size_t num_weights) {
+    m_weights.resize(num_weights, nullptr);
+  }
+
+  void set_data_type_weights(size_t idx, WeightsType* w) {
+    m_weights.at(idx) = w;
+  }
+
+  /** Set list of pointers to weights.
*/ + void set_data_type_weights(std::vector w) { m_weights = w; } + /** Replace weights with another Layer's weights*/ + //void replace_weights(Layer* other_layer) override; + + void add_weights(WeightsType* w) { m_weights.push_back(w); } + size_t num_weights() const noexcept { return m_weights.size(); } + bool has_weights() const noexcept { return num_weights() > 0; } + +private: + // =========================================================== + // Private access functions + // =========================================================== + + /** @brief Get references to weights. */ + std::vector get_weights() override { + return std::vector(begin(m_weights), end(m_weights)); + } + + /** @brief Get references to weights. (const) */ + std::vector get_weights() const override { + return std::vector(begin(m_weights), end(m_weights)); + } + + /** Get activation tensor corresponding to child layer. */ + const AbsDistMatrixType& get_activations(const data_type_layer& child) const; + /** Get error signal tensor corresponding to parent layer. */ + const AbsDistMatrixType& get_error_signals(const data_type_layer& parent) const; + + // =========================================================== + // Private class members + // =========================================================== + + /** References to layer weights. */ + std::vector m_weights; + + /** Input tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_inputs; + /** Output tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_outputs; + /** Objective function gradients w.r.t. the output tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_gradient_wrt_outputs; + /** Objective function gradients w.r.t. the input tensors. + * Each matrix column corresponds to a flattened mini-batch sample. + */ + std::vector> m_gradient_wrt_inputs; + +}; + +#ifndef LBANN_DATA_TYPE_LAYER_INSTANTIATE +extern template class data_type_layer; +#endif // LBANN_DATA_TYPE_LAYER_INSTANTIATE + +} // namespace lbann + +#endif // LBANN_LAYERS_DATA_TYPE_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/image/bilinear_resize.hpp b/include/lbann/layers/image/bilinear_resize.hpp index 1de21e3dc16..1fa824211a3 100644 --- a/include/lbann/layers/image/bilinear_resize.hpp +++ b/include/lbann/layers/image/bilinear_resize.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED #define LBANN_LAYERS_IMAGE_BILINEAR_RESIZE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -36,14 +36,14 @@ namespace lbann { * Tensors are assumed to be image data in CHW format. Gradients are * not propagated during backprop. 
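 *  For example, resizing a 3 x 32 x 32 (CHW) input with height 64 and
 *  width 64 yields a 3 x 64 x 64 output; only the last two dimensions
 *  change.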
*/ -template -class bilinear_resize_layer : public Layer { +template +class bilinear_resize_layer : public data_type_layer { static_assert(Layout == data_layout::DATA_PARALLEL, "bilinear_resize_layer only supports DATA_PARALLEL"); public: bilinear_resize_layer(lbann_comm *comm, El::Int height, El::Int width) - : Layer(comm), m_height(height), m_width(width) { + : data_type_layer(comm), m_height(height), m_width(width) { } bilinear_resize_layer* copy() const override { @@ -58,16 +58,16 @@ class bilinear_resize_layer : public Layer { protected: void setup_dims() override { - Layer::setup_dims(); + data_type_layer::setup_dims(); // Get input dimensions - auto dims = get_input_dims(); + auto dims = this->get_input_dims(); const auto& num_dims = dims.size(); // Check that dimensions are valid std::stringstream err; if (num_dims < 2) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects input with at least two dimensions, " << "but input dimensions are "; for (size_t i = 0; i < num_dims; ++i) { @@ -75,12 +75,12 @@ class bilinear_resize_layer : public Layer { } LBANN_ERROR(err.str()); } else if (m_height <= 0) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to resize with " << "negative height (" << m_height << ")"; LBANN_ERROR(err.str()); } else if (m_width <= 0) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to resize with " << "negative width (" << m_width << ")"; LBANN_ERROR(err.str()); @@ -89,7 +89,7 @@ class bilinear_resize_layer : public Layer { // Resize output tensor dims[num_dims-2] = m_height; dims[num_dims-1] = m_width; - set_output_dims(dims); + this->set_output_dims(dims); } @@ -108,10 +108,10 @@ class bilinear_resize_layer : public Layer { #ifndef LBANN_BILINEAR_RESIZE_LAYER_INSTANTIATE extern template class bilinear_resize_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class bilinear_resize_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_BILINEAR_RESIZE_LAYER_INSTANTIATE diff --git a/include/lbann/layers/io/input/generic_input_layer.hpp b/include/lbann/layers/io/input/generic_input_layer.hpp index b594109ee0f..37bbe4789bf 100644 --- a/include/lbann/layers/io/input/generic_input_layer.hpp +++ b/include/lbann/layers/io/input/generic_input_layer.hpp @@ -46,7 +46,8 @@ namespace lbann { /** @todo Move functionality to input_layer. 
*/ -class generic_input_layer : public io_layer { +template +class generic_input_layer : public io_layer { public: using data_reader_map_t = std::map; using io_buffer_map_t = std::map>; @@ -57,7 +58,7 @@ class generic_input_layer : public io_layer { std::map data_readers, bool data_set_spans_models = true, data_reader_target_mode dr_mode = data_reader_target_mode::CLASSIFICATION) - : io_layer(comm, data_set_spans_models, dr_mode), + : io_layer(comm, data_set_spans_models, dr_mode), m_io_buffers(), m_training_dataset(), m_testing_dataset(), @@ -66,30 +67,30 @@ class generic_input_layer : public io_layer { m_data_set_processed(false) { //m_data_sets_span_models(data_sets_span_models) { // Input layers have no parents - m_expected_num_parent_layers = 0; + this->m_expected_num_parent_layers = 0; if(dr_mode == data_reader_target_mode::NA) { - m_expected_num_child_layers = 1; + this->m_expected_num_child_layers = 1; }else { // Input layers output a sample and target, which could be the // original value, categorical label, or regression value - m_expected_num_child_layers = 2; + this->m_expected_num_child_layers = 2; } if(m_data_readers[execution_mode::training] != nullptr) { - m_training_dataset.total_samples() = m_data_readers[execution_mode::training]->get_num_data(); + this->m_training_dataset.total_samples() = m_data_readers[execution_mode::training]->get_num_data(); } if(m_data_readers[execution_mode::validation] != nullptr) { - m_validation_dataset.total_samples() = m_data_readers[execution_mode::validation]->get_num_data(); + this->m_validation_dataset.total_samples() = m_data_readers[execution_mode::validation]->get_num_data(); } if(m_data_readers[execution_mode::testing] != nullptr) { - m_testing_dataset.total_samples() = m_data_readers[execution_mode::testing]->get_num_data(); + this->m_testing_dataset.total_samples() = m_data_readers[execution_mode::testing]->get_num_data(); } - m_active_buffer[execution_mode::training].store(-1); - m_active_buffer[execution_mode::validation].store(-1); - m_active_buffer[execution_mode::testing].store(-1); + this->m_active_buffer[execution_mode::training].store(-1); + this->m_active_buffer[execution_mode::validation].store(-1); + this->m_active_buffer[execution_mode::testing].store(-1); } ~generic_input_layer() override { @@ -116,7 +117,7 @@ class generic_input_layer : public io_layer { // Input layers copy their datareaders. 
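  // (Each io_buffer is replaced with its own copy() in the copy
  // constructor and operator= below, so duplicated input layers do not
  // share mutable I/O state.)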
generic_input_layer(const generic_input_layer& other) - : io_layer(other), + : io_layer(other), m_io_buffers(other.m_io_buffers), m_training_dataset(other.m_training_dataset), m_testing_dataset(other.m_testing_dataset), @@ -131,7 +132,7 @@ class generic_input_layer : public io_layer { } generic_input_layer& operator=(const generic_input_layer& other) { - io_layer::operator=(other); + io_layer::operator=(other); for (auto& io_buffer : m_io_buffers) { io_buffer = io_buffer->copy(); } @@ -152,34 +153,36 @@ class generic_input_layer : public io_layer { } template - inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers); + inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers) { + m_io_buffers.push_back(new T_io_buffer(comm, num_parallel_readers, data_readers, this->m_expected_num_child_layers)); + } std::string get_type() const override { return "generic_input"; } description get_description() const override { - auto desc = io_layer::get_description(); + auto desc = io_layer::get_description(); desc.add("Buffer", m_io_buffers[0]->get_type()); return desc; } void setup_dims() override { - io_layer::setup_dims(); - for (int i = 0; i < get_num_children(); ++i) { - set_output_dims(get_data_dims(i), i); + io_layer::setup_dims(); + for (int i = 0; i < this->get_num_children(); ++i) { + this->set_output_dims(get_data_dims(i), i); } } void setup_data() override { - io_layer::setup_data(); + io_layer::setup_data(); // Resize output to maximum mini-batch size const auto& max_mb_size = this->m_model->get_max_mini_batch_size(); - for (int i = 0; i < get_num_children(); ++i) { - auto& output = get_activations(i); + for (int i = 0; i < this->get_num_children(); ++i) { + auto& output = this->get_activations(i); output.Resize(output.Height(), max_mb_size); } - if(io_layer::m_data_set_spans_models) { + if(io_layer::m_data_set_spans_models) { calculate_num_iterations_per_epoch_training_spans_models(max_mb_size); } else { calculate_num_iterations_per_epoch_training_unique_per_models(max_mb_size); @@ -187,7 +190,7 @@ class generic_input_layer : public io_layer { for (auto& io_buffer : m_io_buffers) { int linearized_target_size; - switch(m_data_reader_mode) { + switch(this->m_data_reader_mode) { case data_reader_target_mode::REGRESSION: linearized_target_size = get_linearized_response_size(); break; @@ -201,7 +204,7 @@ class generic_input_layer : public io_layer { default: linearized_target_size = 0; } - io_buffer->setup_data(get_output_size(0), + io_buffer->setup_data(this->get_output_size(0), linearized_target_size, max_mb_size); } @@ -235,10 +238,10 @@ class generic_input_layer : public io_layer { } // Initialize matrices - io_layer::fp_setup_outputs(mini_batch_size); + io_layer::fp_setup_outputs(mini_batch_size); for (auto& io_buffer : m_io_buffers) { - for (int i = 0; i < get_num_children(); ++i) { + for (int i = 0; i < this->get_num_children(); ++i) { io_buffer->fp_setup_data(mini_batch_size, i); } } @@ -296,19 +299,19 @@ class generic_input_layer : public io_layer { } } - if(dynamic_cast(io_buffer) != nullptr) { + if(dynamic_cast*>(io_buffer) != nullptr) { // Use the predetermined size of the mini-batch to set the current // batch size for the neural network num_samples_in_batch = get_current_mini_batch_size(); update_num_samples_processed(num_samples_in_batch); - if(m_expected_num_child_layers == 1) { - io_buffer->distribute_from_local_matrix(get_data_reader(), mode, get_activations(0)); + 
if(this->m_expected_num_child_layers == 1) {
+        io_buffer->distribute_from_local_matrix(get_data_reader(), mode, this->get_activations(0));
       }else {
-        io_buffer->distribute_from_local_matrix(get_data_reader(), mode, get_activations(0), get_activations(1));
+        io_buffer->distribute_from_local_matrix(get_data_reader(), mode, this->get_activations(0), this->get_activations(1));
       }
     }else {
-      LBANN_ERROR("could not fp_compute for I/O layers : encoutered generic_io_buffer type");
+      LBANN_ERROR("could not fp_compute for I/O layers : encountered generic_io_buffer type");
     }

     m_data_set_processed = io_buffer->update_data_set(get_data_reader(mode), mode);
@@ -325,7 +328,7 @@
   void setup_next_io_buffer(generic_io_buffer* io_buffer) {
     int mini_batch_size = get_current_mini_batch_size();
-    for (int i = 0; i < get_num_children(); ++i) {
+    for (int i = 0; i < this->get_num_children(); ++i) {
       io_buffer->fp_setup_data(mini_batch_size, i);
     }
   }
@@ -535,8 +538,8 @@
   /**
    * Return the dataset associated with the current execution mode.
    */
-  dataset& select_dataset() override { return get_dataset(m_model->get_execution_context().get_execution_mode()); }
-  const dataset& select_dataset() const override { return get_dataset(m_model->get_execution_context().get_execution_mode()); }
+  dataset& select_dataset() override { return get_dataset(this->m_model->get_execution_context().get_execution_mode()); }
+  const dataset& select_dataset() const override { return get_dataset(this->m_model->get_execution_context().get_execution_mode()); }

   /**
    * Return the first dataset with a valid (non-null) datareader.
@@ -595,7 +598,7 @@
     if(child_index == 0) {
       return dr->get_data_dims();
     }else if(child_index == 1) {
-      switch(m_data_reader_mode) {
+      switch(this->m_data_reader_mode) {
       case data_reader_target_mode::REGRESSION:
         return std::vector(1, dr->get_num_responses());
       case data_reader_target_mode::RECONSTRUCTION:
@@ -649,7 +652,7 @@
   /**
    * Get the linearized size of the labels for the underlying data.
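   * (e.g., the number of label classes for a classification data
   * reader, or 1 when the reader is configured for regression)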
*/ long get_linearized_label_size() const override { - if (is_for_regression()) { + if (this->is_for_regression()) { return static_cast(1); } long linearized_label_size = -1; @@ -678,7 +681,7 @@ class generic_input_layer : public io_layer { } long get_linearized_response_size() const override { - if (!is_for_regression()) { + if (!this->is_for_regression()) { return static_cast(1); } long linearized_response_size = -1; @@ -753,7 +756,7 @@ class generic_input_layer : public io_layer { (it->second)->save_to_checkpoint_shared(p, execution_mode::validation); } - if (get_comm()->am_trainer_master()) { + if (this->get_comm()->am_trainer_master()) { write_cereal_archive(*this, p, execution_mode::training, "_io.xml"); } @@ -780,16 +783,16 @@ class generic_input_layer : public io_layer { } std::string buf; - if (get_comm()->am_trainer_master()) { + if (this->get_comm()->am_trainer_master()) { read_cereal_archive(*this, p, execution_mode::training, "_io.xml"); buf = create_cereal_archive_binary_string(*this); } // TODO: this assumes homogeneous processors // broadcast state from rank 0 - get_comm()->trainer_broadcast(0, buf); + this->get_comm()->trainer_broadcast(0, buf); - if (!get_comm()->am_trainer_master()) { + if (!this->get_comm()->am_trainer_master()) { unpack_cereal_archive_binary_string(*this, buf); } @@ -860,10 +863,6 @@ class generic_input_layer : public io_layer { std::mutex dr_mutex; }; -template inline void generic_input_layer::initialize_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers) { - m_io_buffers.push_back(new T(comm, num_parallel_readers, data_readers, m_expected_num_child_layers)); -} - } // namespace lbann #endif // LBANN_LAYERS_GENERIC_INPUT_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/io/input/input_layer.hpp b/include/lbann/layers/io/input/input_layer.hpp index 1f85202fd73..f424aea41c7 100644 --- a/include/lbann/layers/io/input/input_layer.hpp +++ b/include/lbann/layers/io/input/input_layer.hpp @@ -38,22 +38,24 @@ namespace lbann { /** @brief Interface with data reader. 
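 *
 * A sketch of the concrete instantiation implied by the extern
 * templates below (the alias name `cpu_input_layer` is hypothetical;
 * DataType is LBANN's configured scalar type):
 * @code
 * using cpu_input_layer =
 *   input_layer<DataType, partitioned_io_buffer<DataType>,
 *               data_layout::DATA_PARALLEL, El::Device::CPU>;
 * @endcode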
*/ -template -class input_layer : public generic_input_layer { +class input_layer : public generic_input_layer { + static_assert(T_layout == data_layout::DATA_PARALLEL, + "input layer only supports DATA_PARALLEL data layout"); public: /// @todo make the map and vector references input_layer(lbann_comm *comm, int num_parallel_readers, std::map data_readers, bool data_set_spans_models = true, data_reader_target_mode target_mode = data_reader_target_mode::CLASSIFICATION) - : generic_input_layer(comm, num_parallel_readers, data_readers, data_set_spans_models, target_mode) { - validate_data_layout(); + : generic_input_layer(comm, num_parallel_readers, data_readers, data_set_spans_models, target_mode) { // Initialize two buffers - initialize_io_buffer(comm, std::min(num_parallel_readers, Layer::m_comm->get_procs_per_trainer()), data_readers); - initialize_io_buffer(comm, std::min(num_parallel_readers, Layer::m_comm->get_procs_per_trainer()), data_readers); - for (auto io_buffer : m_io_buffers) { + initialize_io_buffer(comm, std::min(num_parallel_readers, data_type_layer::m_comm->get_procs_per_trainer()), data_readers); + initialize_io_buffer(comm, std::min(num_parallel_readers, data_type_layer::m_comm->get_procs_per_trainer()), data_readers); + for (auto io_buffer : this->m_io_buffers) { io_buffer->fetch_data_fn = new fetch_data_functor(target_mode); io_buffer->update_data_reader_fn = new update_data_reader_functor(); } @@ -64,10 +66,8 @@ class input_layer : public generic_input_layer { return new input_layer(*this); } - inline void validate_data_layout(); - inline void initialize_io_buffer(lbann_comm *comm, int num_parallel_readers, std::map data_readers) { - generic_input_layer::initialize_io_buffer(comm, num_parallel_readers, data_readers); + generic_input_layer::template initialize_io_buffer(comm, num_parallel_readers, data_readers); } std::string get_type() const override { return "input"; } @@ -76,40 +76,14 @@ class input_layer : public generic_input_layer { }; -template<> -inline void input_layer::validate_data_layout() { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "input_layer with partitioned_io_buffer does not supports MODEL_PARALLEL data layout"; - throw lbann_exception(err.str()); -} - -template<> -inline void input_layer::validate_data_layout() {} - -#ifdef LBANN_HAS_GPU -template<> -inline void input_layer::validate_data_layout() { - std::stringstream err; - err << __FILE__ << " " << __LINE__ << " :: " - << "input_layer with partitioned_io_buffer does not supports MODEL_PARALLEL data layout"; - throw lbann_exception(err.str()); -} - -template<> -inline void input_layer::validate_data_layout() {} -#endif // LBANN_HAS_GPU - #ifndef LBANN_INPUT_LAYER_INSTANTIATE extern template class input_layer< - partitioned_io_buffer, data_layout::DATA_PARALLEL, El::Device::CPU>; -extern template class input_layer< - partitioned_io_buffer, data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, partitioned_io_buffer, + data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class input_layer< - partitioned_io_buffer, data_layout::DATA_PARALLEL, El::Device::GPU>; -extern template class input_layer< - partitioned_io_buffer, data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, partitioned_io_buffer, + data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_INPUT_LAYER_INSTANTIATE diff --git a/include/lbann/layers/io/io_layer.hpp b/include/lbann/layers/io/io_layer.hpp index 4f0b22ec529..f828089cdb5 100644 
--- a/include/lbann/layers/io/io_layer.hpp +++ b/include/lbann/layers/io/io_layer.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_IO_LAYER_HPP_INCLUDED #define LBANN_LAYERS_IO_LAYER_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/data_readers/data_reader.hpp" #include "lbann/utils/dataset.hpp" #include "lbann/io/persist.hpp" @@ -43,7 +43,8 @@ namespace lbann { /** @todo Move functionality to input_layer. */ -class io_layer : public Layer { +template +class io_layer : public data_type_layer { protected: bool m_data_set_spans_models; data_reader_target_mode m_data_reader_mode; @@ -52,7 +53,7 @@ class io_layer : public Layer { io_layer(lbann_comm *comm, bool data_set_spans_models = true, data_reader_target_mode data_reader_mode = data_reader_target_mode::CLASSIFICATION) - : Layer(comm), + : data_type_layer(comm), m_data_set_spans_models(data_set_spans_models), m_data_reader_mode(data_reader_mode) { } diff --git a/include/lbann/layers/layer.hpp b/include/lbann/layers/layer.hpp index 8e4f3bd62f5..52be4d4f2fb 100644 --- a/include/lbann/layers/layer.hpp +++ b/include/lbann/layers/layer.hpp @@ -109,14 +109,14 @@ class Layer { * Apply a mathematical operation to input tensors to obtain output * tensors. */ - virtual void forward_prop(); + virtual void forward_prop() {}; /** Backward propagation step. * Given the objective function gradients w.r.t. the output * tensors, compute the gradients w.r.t. the input tensors and * w.r.t. the weights. This is essentially an application of the * chain rule. */ - virtual void back_prop(); + virtual void back_prop() {}; /** Update step. * Update the layer's internal members. Note that the optimization * step for the weights happens elsewhere. @@ -124,7 +124,7 @@ class Layer { virtual bool update(); virtual void summarize_stats(lbann_summary& summarizer, int step); - virtual void summarize_matrices(lbann_summary& summarizer, int step); + virtual void summarize_matrices(lbann_summary& summarizer, int step) = 0; /** Setup layer members. * This calls the 'setup_pointers', 'setup_dims', 'setup_matrices', @@ -203,6 +203,11 @@ class Layer { /** Get child layers. (const) */ inline const std::vector& get_child_layers() const { return m_child_layers; } + inline int find_layer_index(const Layer* l) const { + return (std::find(m_child_layers.begin(), + m_child_layers.end(), + l) - m_child_layers.begin()); } + /** Get number of parent layers. */ inline int get_num_parents() const { return get_parent_layers().size(); } /** Get number of child layers. */ @@ -246,14 +251,10 @@ class Layer { // Weights access functions // =========================================================== - /** Get references to weights. */ - inline std::vector& get_weights() { return m_weights; } - /** Get references to weights. (const) */ - inline const std::vector& get_weights() const { return m_weights; } /** Set list of pointers to weights. */ - inline void set_weights(std::vector w) { get_weights() = w; } + virtual void set_weights(std::vector& w) = 0; /** Replace weights with another Layer's weights*/ - void replace_weights(Layer* other_layer); + virtual void replace_weights(Layer* other_layer) = 0; // =========================================================== // Tensor dimension access functions @@ -271,34 +272,6 @@ class Layer { /** Set output tensor dimensions. 
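   * A sketch of typical use during setup (a hypothetical
   * shape-preserving layer, not from this patch):
   * @code
   * void setup_dims() override {
   *   Layer::setup_dims();
   *   // mirror the first input's dimensions onto the first output
   *   this->set_output_dims(this->get_input_dims());
   * }
   * @endcode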
*/ void set_output_dims(std::vector dims, int output_index = 0); - // =========================================================== - // Tensor access functions - // =========================================================== - - /** Get activation tensor. */ - AbsDistMat& get_activations(int child_index = 0); - /** Get error signal tensor. */ - AbsDistMat& get_error_signals(int parent_index = 0); - /** Get previous activation tensor. */ - const AbsDistMat& get_prev_activations(int parent_index = 0) const; - /** Get activation tensor. */ - const AbsDistMat& get_activations(int child_index = 0) const; - /** Get previous error signal tensor. */ - const AbsDistMat& get_prev_error_signals(int child_index = 0) const; - /** Get error signal tensor. */ - const AbsDistMat& get_error_signals(int parent_index = 0) const; - /** Get local portion of activation tensor. */ - AbsMat& get_local_activations(int child_index = 0); - /** Get local portion of error signal tensor. */ - AbsMat& get_local_error_signals(int parent_index = 0); - /** Get local portion of previous activation tensor. */ - const AbsMat& get_local_prev_activations(int parent_index = 0) const; - /** Get local portion of activation tensor. */ - const AbsMat& get_local_activations(int child_index = 0) const; - /** Get local portion of previous error signal tensor. */ - const AbsMat& get_local_prev_error_signals(int child_index = 0) const; - /** Get local portion of error signal tensor. */ - const AbsMat& get_local_error_signals(int parent_index = 0) const; /** Get reference to LBANN communicator. */ lbann_comm* get_comm() const { return m_comm; } @@ -349,20 +322,12 @@ class Layer { * 'construct_matrix' function. If any matrices have already been * setup, they are destroyed and reinstantiated. */ - virtual void setup_matrices(const El::Grid& grid); - /** Construct distributed matrix. - * Called by the 'setup_matrices' function. 'type' is one of the - * following: "input", "output", "gradient_wrt_output", - * "gradient_wrt_input". - */ - virtual std::unique_ptr construct_matrix(const El::Grid& grid, - std::string type, - El::Int index); + virtual void setup_matrices(const El::Grid& grid) = 0; /** Setup layer data. * Called by the 'setup' function. Memory is allocated for * distributed matrices. */ - virtual void setup_data(); + virtual void setup_data() {}; /** Setup GPU objects. * Called by the 'setup' function if the layer is on GPUs. */ @@ -377,12 +342,12 @@ class Layer { * setup as a view or copy of the corresponding parent layer's * output tensor. */ - virtual void fp_setup_inputs(El::Int mini_batch_size); + virtual void fp_setup_inputs(El::Int mini_batch_size) = 0; /** Setup output tensors. * Called by the 'forward_prop' function. Each output tensor is * resized to match the mini-batch size. */ - virtual void fp_setup_outputs(El::Int mini_batch_size); + virtual void fp_setup_outputs(El::Int mini_batch_size) = 0; /** Apply layer operation. * Called by the 'forward_prop' function. Given the input tensors, * the output tensors are populated with computed values. @@ -398,19 +363,19 @@ class Layer { * tensor is setup as a view or copy of the corresponding child * layer's gradient w.r.t. input tensor. */ - virtual void bp_setup_gradient_wrt_outputs(El::Int mini_batch_size); + virtual void bp_setup_gradient_wrt_outputs(El::Int mini_batch_size) = 0; /** Setup gradient w.r.t. input tensors. * Called by the 'back_prop' function. Each gradient w.r.t. input * tensor is resized to match the mini-batch size. 
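   * A sketch of the contract an override is expected to honor
   * (hypothetical single-input subclass; the Resize call is an
   * assumption of this example):
   * @code
   * void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override {
   *   // error signal has one column per mini-batch sample
   *   this->get_error_signals(0).Resize(this->get_input_size(0),
   *                                     mini_batch_size);
   * }
   * @endcode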
*/ - virtual void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size); + virtual void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) = 0; /** Compute objective funciton gradients. * Called by the 'back_prop' function. Given the input, output, and * gradient w.r.t. output tensors, the gradient w.r.t. input * tensors are populated with the computed values and the gradients * w.r.t. the weights are sent to the appropriate optimizers. */ - virtual void bp_compute(); + virtual void bp_compute() {}; // =========================================================== // Update step helper functions @@ -428,9 +393,6 @@ class Layer { /** Reference to LBANN communicator. */ lbann_comm *m_comm; - /** References to layer weights. */ - std::vector m_weights; - /** References to parent layers. */ std::vector m_parent_layers; /** References to child layers. */ @@ -473,11 +435,10 @@ class Layer { // =========================================================== // Private access functions // =========================================================== - - /** Get activation tensor corresponding to child layer. */ - const AbsDistMat& get_activations(const Layer& child) const; - /** Get error signal tensor corresponding to parent layer. */ - const AbsDistMat& get_error_signals(const Layer& parent) const; + /** Get references to weights. */ + virtual std::vector get_weights() = 0; + /** Get references to weights. (const) */ + virtual std::vector get_weights() const = 0; // =========================================================== // Private class members @@ -486,23 +447,6 @@ class Layer { /** Dimensions of output tensors. */ std::vector> m_output_dims_list; - /** Input tensors. - * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_inputs; - /** Output tensors. - * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_outputs; - /** Objective function gradients w.r.t. the output tensors. - * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_gradient_wrt_outputs; - /** Objective function gradients w.r.t. the input tensors. - * Each matrix column corresponds to a flattened mini-batch sample. - */ - std::vector> m_gradient_wrt_inputs; - /** Hint layer. * During setup, the output tensor dimensions are set to match the * first output tensor of the hint layer. Derived classes may do @@ -510,8 +454,19 @@ class Layer { */ const Layer* m_hint_layer = nullptr; +private: + friend std::vector extract_weights(Layer const& l); + friend std::vector extract_weights(Layer& l); }; +inline std::vector extract_weights(Layer& l) { + return l.get_weights(); +} + +inline std::vector extract_weights(Layer const& l) { + return l.get_weights(); +} + } // namespace lbann #endif // LBANN_LAYERS_LAYER_HPP_INCLUDED diff --git a/include/lbann/layers/learning/base_convolution.hpp b/include/lbann/layers/learning/base_convolution.hpp index c27c80aa731..88588e8949d 100644 --- a/include/lbann/layers/learning/base_convolution.hpp +++ b/include/lbann/layers/learning/base_convolution.hpp @@ -29,6 +29,7 @@ #include "lbann/layers/layer.hpp" #include "lbann/models/model.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/weights/initializer.hpp" #include "lbann/weights/variance_scaling_initializers.hpp" #include "lbann/utils/cudnn.hpp" @@ -44,8 +45,19 @@ namespace lbann { /** @brief Computation kernels for convolution and deconvolution layers. 
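 *
 * The weights and optimizer aliases declared below, with the template
 * argument lists written out (a reconstruction; the bracketed
 * arguments are assumed):
 * @code
 * using WeightsType   = data_type_weights<TensorDataType>;
 * using OptimizerType = data_type_optimizer<TensorDataType>;
 * @endcode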
*/ -template -class base_convolution_layer : public Layer { +template +class base_convolution_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + ///@} protected: @@ -70,7 +82,7 @@ class base_convolution_layer : public Layer { /** Scaling factor for bias term. * If the scaling factor is zero, bias is not applied. */ - DataType m_bias_scaling_factor; + TensorDataType m_bias_scaling_factor; #ifdef LBANN_HAS_CUDNN @@ -81,7 +93,7 @@ class base_convolution_layer : public Layer { /** Bias tensor cuDNN descriptor. */ cudnnTensorDescriptor_t m_bias_cudnn_desc = nullptr; /** Tensor cuDNN descriptors. */ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; /** Forward algorithm cache (mini-batch size -> algo). */ std::unordered_map m_fwd_cudnn_algos; /** Backward data algorithm cache (mini-batch size -> algo). */ @@ -102,7 +114,7 @@ class base_convolution_layer : public Layer { std::vector dilations, int groups, bool has_bias) - : Layer(comm), + : data_type_layer(comm), m_output_channels(output_channels), m_conv_dims(std::move(conv_dims)), m_pads(std::move(pads)), @@ -116,7 +128,7 @@ class base_convolution_layer : public Layer { {} base_convolution_layer(const base_convolution_layer& other) - : Layer(other), + : data_type_layer(other), m_output_channels(other.m_output_channels), m_conv_dims(other.m_conv_dims), m_pads(other.m_pads), @@ -136,7 +148,7 @@ class base_convolution_layer : public Layer { m_kernel_cudnn_desc); copy_convolution_cudnn_desc(other.m_convolution_cudnn_desc, m_convolution_cudnn_desc); - if (other.m_bias_scaling_factor != DataType(0)) { + if (other.m_bias_scaling_factor != TensorDataType(0)) { cudnn::copy_tensor_desc(other.m_bias_cudnn_desc, m_bias_cudnn_desc); } @@ -145,7 +157,7 @@ class base_convolution_layer : public Layer { } base_convolution_layer& operator=(const base_convolution_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_output_channels = other.m_output_channels; m_conv_dims = other.m_conv_dims; m_pads = other.m_pads; @@ -160,7 +172,7 @@ class base_convolution_layer : public Layer { m_kernel_cudnn_desc); copy_convolution_cudnn_desc(other.m_convolution_cudnn_desc, m_convolution_cudnn_desc); - if (other.m_bias_scaling_factor != DataType(0)) { + if (other.m_bias_scaling_factor != TensorDataType(0)) { cudnn::copy_tensor_desc(other.m_bias_cudnn_desc, m_bias_cudnn_desc); } @@ -189,7 +201,7 @@ class base_convolution_layer : public Layer { } description get_description() const override { - auto desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); std::ostringstream ss; // Convolution dimensions @@ -230,7 +242,7 @@ class base_convolution_layer : public Layer { // Bias ss.str(std::string{}); ss.clear(); - ss << (m_bias_scaling_factor == DataType(0) ? + ss << (m_bias_scaling_factor == TensorDataType(0) ? 
"disabled" : "enabled"); desc.add("Bias", ss.str()); @@ -240,23 +252,23 @@ class base_convolution_layer : public Layer { } void setup_dims() override { - Layer::setup_dims(); + data_type_layer::setup_dims(); std::ostringstream err; // Check number of channels and channel groups - const auto& input_dims = get_input_dims(); + const auto& input_dims = this->get_input_dims(); if (m_output_channels < 1) { - err << get_type() << " layer \"" << get_name() << "\" " + err << this->get_type() << " layer \"" << this->get_name() << "\" " << "has an invalid number of output channels " << "(" << m_output_channels << ")"; LBANN_ERROR(err.str()); } else if (m_groups < 1) { - err << get_type() << " layer \"" << get_name() << "\" " + err << this->get_type() << " layer \"" << this->get_name() << "\" " << "has an invalid number of groups (" << m_groups << ")"; LBANN_ERROR(err.str()); } else if (input_dims[0] % m_groups != 0 || m_output_channels % m_groups != 0) { - err << get_type() << " layer \"" << get_name() << "\" " + err << this->get_type() << " layer \"" << this->get_name() << "\" " << "has " << m_groups << " groups, which does not divide " << "the input channels (" << input_dims[0] << ") or " << "the output channels (" << m_output_channels << ")"; @@ -268,7 +280,7 @@ class base_convolution_layer : public Layer { if (m_conv_dims.size() != num_spatial_dims || std::any_of(m_conv_dims.begin(), m_conv_dims.end(), [](El::Int d) { return d < 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " + err << this->get_type() << " layer \"" << this->get_name() << "\" " << "has invalid spatial dimensions for convolution kernel ("; if (m_conv_dims.empty()) { err << "no dimensions"; } for (size_t i = 0; i < m_conv_dims.size(); ++i) { @@ -277,7 +289,7 @@ class base_convolution_layer : public Layer { err << ", expected " << num_spatial_dims << " spatial dimensions)"; LBANN_ERROR(err.str()); } else if (m_pads.size() != num_spatial_dims) { - err << get_type() << " layer \"" << get_name() << "\" " + err << this->get_type() << " layer \"" << this->get_name() << "\" " << "has invalid convolution pads (("; for (size_t i = 0; i < m_pads.size(); ++i) { err << (i > 0 ? "," : "") << m_pads[i]; @@ -287,7 +299,7 @@ class base_convolution_layer : public Layer { } else if (m_strides.size() != num_spatial_dims || std::any_of(m_strides.begin(), m_strides.end(), [](El::Int d) { return d < 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " + err << this->get_type() << " layer \"" << this->get_name() << "\" " << "has invalid convolution strides (("; for (size_t i = 0; i < m_strides.size(); ++i) { err << (i > 0 ? "," : "") << m_strides[i]; @@ -297,7 +309,7 @@ class base_convolution_layer : public Layer { } else if (m_dilations.size() != num_spatial_dims || std::any_of(m_dilations.begin(), m_dilations.end(), [](El::Int d) { return d < 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " + err << this->get_type() << " layer \"" << this->get_name() << "\" " << "has invalid convolution dilations (("; for (size_t i = 0; i < m_dilations.size(); ++i) { err << (i > 0 ? 
"," : "") << m_dilations[i]; @@ -310,12 +322,12 @@ class base_convolution_layer : public Layer { if (Device == El::Device::CPU && std::any_of(m_dilations.begin(), m_dilations.end(), [](El::Int d) { return d != 1; })) { - err << get_type() << " layer \"" << get_name() << "\" " + err << this->get_type() << " layer \"" << this->get_name() << "\" " << "has non-unit dilation, which is not yet supported on CPU"; LBANN_ERROR(err.str()); } if (Device == El::Device::CPU && m_groups != 1) { - err << get_type() << " layer \"" << get_name() << "\" " + err << this->get_type() << " layer \"" << this->get_name() << "\" " << "has " << m_groups << " groups, " << "but only one group is currently supported on CPU"; LBANN_ERROR(err.str()); @@ -327,85 +339,87 @@ class base_convolution_layer : public Layer { * The kernel weights are setup in the convolution and * deconvolution classes. */ void setup_data() override { - Layer::setup_data(); + data_type_layer::setup_data(); // Tensor dimensions - const auto& input_dims = get_input_dims(); - const auto& output_dims = get_output_dims(); - const auto& kernel_dims = get_kernel_dims(); + const auto& input_dims = this->get_input_dims(); + const auto& output_dims = this->get_output_dims(); + const auto& kernel_dims = this->get_kernel_dims(); const auto& kernel_size = std::accumulate(kernel_dims.begin(), kernel_dims.end(), 1, std::multiplies()); // Initialize default weights if none are provided - if (this->m_weights.size() > 2) { + if (this->num_weights() > 2) { std::stringstream err; - err << "attempted to setup layer \"" << get_name() << "\" " + err << "attempted to setup layer \"" << this->get_name() << "\" " << "with an invalid number of weights " << "(expected at most 2, " - << "found " << this->m_weights.size() << ")"; + << "found " << this->num_weights() << ")"; LBANN_ERROR(err.str()); } - if (m_bias_scaling_factor != DataType(0)) { - this->m_weights.resize(2, nullptr); + if (m_bias_scaling_factor != TensorDataType(0)) { + this->set_num_data_type_weights(2); } else { - this->m_weights.resize(1, nullptr); + this->set_num_data_type_weights(1); } - if (this->m_weights[0] == nullptr) { - auto w = make_unique(get_comm()); - auto init = make_unique(probability_distribution::gaussian); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_kernel"); + if (!this->has_data_type_weights(0)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(probability_distribution::gaussian); + auto opt = to_unique_ptr(dynamic_cast( + this->m_model->create_optimizer())); + w->set_name(this->get_name() + "_kernel"); w->set_initializer(std::move(init)); w->set_optimizer(std::move(opt)); - this->m_weights[0] = w.get(); + this->set_data_type_weights(0, w.get()); this->m_model->add_weights(std::move(w)); } - auto& kernel_weights = *this->m_weights[0]; + auto& kernel_weights = this->get_data_type_weights(0); // Initialize variance scaling initialization auto* cast_initializer - = dynamic_cast(kernel_weights.get_initializer()); + = dynamic_cast*>(kernel_weights.get_initializer()); if (cast_initializer != nullptr) { cast_initializer->set_fan_in(kernel_size / output_dims[0]); cast_initializer->set_fan_out(kernel_size / input_dims[0]); } // Initialize weight matrices - auto dist = get_prev_activations().DistData(); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; dist.rowDist = El::STAR; kernel_weights.set_dims(kernel_dims); kernel_weights.set_matrix_distribution(dist); // Set up bias if needed. 
- if (m_bias_scaling_factor != DataType(0)) { - if (this->m_weights[1] == nullptr) { - auto w = make_unique(get_comm()); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_bias"); + if (m_bias_scaling_factor != TensorDataType(0)) { + if (!this->has_data_type_weights(1)) { + auto w = make_unique(this->get_comm()); + auto opt = to_unique_ptr(dynamic_cast( + this->m_model->create_optimizer())); + w->set_name(this->get_name() + "_bias"); w->set_optimizer(std::move(opt)); - this->m_weights[1] = w.get(); + this->set_data_type_weights(1, w.get()); this->m_model->add_weights(std::move(w)); } - auto& bias_weights = *this->m_weights[1]; + auto& bias_weights = this->get_data_type_weights(1); bias_weights.set_dims(output_dims[0]); bias_weights.set_matrix_distribution(dist); } // Initialize freeze state - for (auto&& w : this->m_weights) { - if (m_frozen) { + for (auto&& w : this->get_data_type_weights()) { + if (this->m_frozen) { w->freeze(); } else { w->unfreeze(); } } - for (auto&& w : this->m_weights) { - if (w->is_frozen() != m_frozen) { + for (auto&& w : this->get_data_type_weights()) { + if (w->is_frozen() != this->m_frozen) { std::stringstream err; - err << (m_frozen ? "" : "un") << "frozen " - << "layer \"" << get_name() << "\" has " + err << (this->m_frozen ? "" : "un") << "frozen " + << "layer \"" << this->get_name() << "\" has " << (w->is_frozen() ? "" : "un") << "frozen " << "weights \"" << w->get_name() << "\""; LBANN_ERROR(err.str()); @@ -416,13 +430,13 @@ class base_convolution_layer : public Layer { /// Initialize GPU objects void setup_gpu() override { - Layer::setup_gpu(); + data_type_layer::setup_gpu(); #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& output_dims = get_output_dims(); - const auto& kernel_dims = get_kernel_dims(); + const auto& output_dims = this->get_output_dims(); + const auto& kernel_dims = this->get_kernel_dims(); // Set kernel descriptor CHECK_CUDNN(cudnnCreateFilterDescriptor(&m_kernel_cudnn_desc)); @@ -445,7 +459,7 @@ class base_convolution_layer : public Layer { m_groups)); // Set bias tensor descriptor - if (m_bias_scaling_factor != DataType(0)) { + if (m_bias_scaling_factor != TensorDataType(0)) { std::vector bias_dims(output_dims.size() + 1, 1); bias_dims[1] = output_dims[0]; cudnn::set_tensor_desc(m_bias_cudnn_desc, bias_dims); @@ -466,17 +480,17 @@ class base_convolution_layer : public Layer { #else // Useful constants - const DataType zero = DataType(0); - const DataType one = DataType(1); + const TensorDataType zero = TensorDataType(0); + const TensorDataType one = TensorDataType(1); // Matrices - const auto& kernel = m_weights[0]->get_values(); + const auto& kernel = this->get_data_type_weights(0).get_values(); const auto& input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); + this->get_local_prev_activations() : + this->get_local_prev_error_signals()); auto& output = (during_forward_prop ? 
- get_local_activations() : - get_local_error_signals()); + this->get_local_activations() : + this->get_local_error_signals()); // Do nothing if there is no local data if (input.Height() < 1 || input.Width() < 1 @@ -485,7 +499,7 @@ class base_convolution_layer : public Layer { } // Initialize GPU workspace - GPUMat workspace; + El::Matrix workspace; #ifdef HYDROGEN_HAVE_CUB workspace.SetMemoryMode(1); #endif // HYDROGEN_HAVE_CUB @@ -497,14 +511,14 @@ class base_convolution_layer : public Layer { std::vector input_dims, output_dims; cudnnTensorDescriptor_t input_desc, output_desc; if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); + input_dims = this->get_input_dims(); + output_dims = this->get_output_dims(); input_desc = m_tensors_cudnn_desc.get_prev_activations(); output_desc = m_tensors_cudnn_desc.get_activations(); } else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); + input_dims = this->get_output_dims(); + output_dims = this->get_input_dims(); input_desc = m_tensors_cudnn_desc.get_prev_error_signals(); output_desc = m_tensors_cudnn_desc.get_error_signals(); } @@ -543,17 +557,17 @@ class base_convolution_layer : public Layer { #else // Useful constants - const DataType zero = DataType(0); - const DataType one = DataType(1); + const TensorDataType zero = TensorDataType(0); + const TensorDataType one = TensorDataType(1); // GPU data - const auto& kernel = m_weights[0]->get_values(); + const auto& kernel = this->get_data_type_weights(0).get_values(); const auto& input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); + this->get_local_prev_activations() : + this->get_local_prev_error_signals()); auto& output = (during_forward_prop ? - get_local_activations() : - get_local_error_signals()); + this->get_local_activations() : + this->get_local_error_signals()); // Do nothing if there is no local data if (input.Height() < 1 || input.Width() < 1 @@ -563,7 +577,7 @@ class base_convolution_layer : public Layer { // Initialize GPU workspace // Note: Use CUB GPU memory pool if possible - GPUMat workspace; + El::Matrix workspace; #ifdef HYDROGEN_HAVE_CUB workspace.SetMemoryMode(1); #endif // HYDROGEN_HAVE_CUB @@ -575,14 +589,14 @@ class base_convolution_layer : public Layer { std::vector input_dims, output_dims; cudnnTensorDescriptor_t input_desc, output_desc; if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); + input_dims = this->get_input_dims(); + output_dims = this->get_output_dims(); input_desc = m_tensors_cudnn_desc.get_prev_activations(); output_desc = m_tensors_cudnn_desc.get_activations(); } else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); + input_dims = this->get_output_dims(); + output_dims = this->get_input_dims(); input_desc = m_tensors_cudnn_desc.get_prev_error_signals(); output_desc = m_tensors_cudnn_desc.get_error_signals(); } @@ -619,12 +633,12 @@ class base_convolution_layer : public Layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - auto& local_output = get_local_activations(); - if (m_bias_scaling_factor != DataType(0) + auto& local_output = this->get_local_activations(); + if (m_bias_scaling_factor != TensorDataType(0) && local_output.Height() > 0 && local_output.Width() > 0) { - const DataType one = 1; - const auto& bias = m_weights[1]->get_values(); + const TensorDataType one = 1; + const auto& bias = this->get_data_type_weights(1).get_values(); 
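      // cudnnAddTensor broadcasts the bias tensor, whose descriptor has
      // shape (1, C, 1, ..., 1), across the mini-batch and spatial
      // dimensions of the output; the call below computes
      //   local_output = m_bias_scaling_factor * bias + 1 * local_output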
CHECK_CUDNN(cudnnAddTensor(cudnn::get_handle(), &m_bias_scaling_factor, m_bias_cudnn_desc, @@ -642,8 +656,8 @@ class base_convolution_layer : public Layer { #else // Matrices - const auto& local_input = get_local_prev_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); + const auto& local_input = this->get_local_prev_activations(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); const bool has_local_data = (local_input.Height() > 0 && local_input.Width() > 0 @@ -651,10 +665,10 @@ class base_convolution_layer : public Layer { && local_gradient_wrt_output.Width() > 0); // Compute bias gradient - if (m_bias_scaling_factor != DataType(0) - && m_weights[1]->get_optimizer() != nullptr) { - optimizer* bias_optimizer = m_weights[1]->get_optimizer(); - DataType dst_scale = DataType(0), gradient_scale = DataType(0); + if (m_bias_scaling_factor != TensorDataType(0) + && this->get_data_type_weights(1).get_optimizer() != nullptr) { + OptimizerType* bias_optimizer = this->get_data_type_weights(1).get_optimizer(); + TensorDataType dst_scale = TensorDataType(0), gradient_scale = TensorDataType(0); auto& bias_gradient = bias_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); if (has_local_data) { @@ -672,14 +686,14 @@ class base_convolution_layer : public Layer { } // Compute kernel gradient - optimizer* kernel_optimizer = m_weights[0]->get_optimizer(); + OptimizerType* kernel_optimizer = this->get_data_type_weights(0).get_optimizer(); if (kernel_optimizer != nullptr) { - DataType dst_scale = DataType(0), gradient_scale = DataType(0); + TensorDataType dst_scale = TensorDataType(0), gradient_scale = TensorDataType(0); auto& kernel_gradient = kernel_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); if (has_local_data) { // Initialize GPU workspace - GPUMat workspace; + El::Matrix workspace; #ifdef HYDROGEN_HAVE_CUB workspace.SetMemoryMode(1); // CUB GPU memory pool #endif // HYDROGEN_HAVE_CUB @@ -751,27 +765,27 @@ class base_convolution_layer : public Layer { void apply_convolution_im2col(bool during_forward_prop) { // Local matrices - const auto& local_kernel = this->m_weights[0]->get_values().LockedMatrix(); + const auto& local_kernel = this->get_data_type_weights(0).get_values().LockedMatrix(); const auto& local_input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); + this->get_local_prev_activations() : + this->get_local_prev_error_signals()); auto& local_output = (during_forward_prop ? 
- get_local_activations() : - get_local_error_signals()); + this->get_local_activations() : + this->get_local_error_signals()); // Matrix parameters const int output_size = local_output.Height(); const El::Int local_width = local_input.Width(); std::vector input_dims, output_dims; if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); + input_dims = this->get_input_dims(); + output_dims = this->get_output_dims(); } else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); + input_dims = this->get_output_dims(); + output_dims = this->get_input_dims(); } - const auto& kernel_dims = get_kernel_dims(); + const auto& kernel_dims = this->get_kernel_dims(); const auto& kernel_size = std::accumulate(kernel_dims.begin(), kernel_dims.end(), 1, std::multiplies()); @@ -801,8 +815,8 @@ class base_convolution_layer : public Layer { // Apply convolution to current input column output_col.Attach(m, n, local_output.Buffer(0, col), m); El::Gemm(El::TRANSPOSE, El::NORMAL, - DataType(1), im2col_matrix, kernel_matrix, - DataType(0), output_col); + TensorDataType(1), im2col_matrix, kernel_matrix, + TensorDataType(0), output_col); } @@ -812,27 +826,27 @@ class base_convolution_layer : public Layer { void apply_transposed_convolution_im2col(bool during_forward_prop) { // Local matrices - const auto& local_kernel = this->m_weights[0]->get_values().LockedMatrix(); + const auto& local_kernel = this->get_data_type_weights(0).get_values().LockedMatrix(); const auto& local_input = (during_forward_prop ? - get_local_prev_activations() : - get_local_prev_error_signals()); + this->get_local_prev_activations() : + this->get_local_prev_error_signals()); DMat& local_output = (during_forward_prop ? - get_local_activations() : - get_local_error_signals()); + this->get_local_activations() : + this->get_local_error_signals()); // Matrix parameters const int input_size = local_input.Height(); const El::Int local_width = local_input.Width(); std::vector input_dims, output_dims; if (during_forward_prop) { - input_dims = get_input_dims(); - output_dims = get_output_dims(); + input_dims = this->get_input_dims(); + output_dims = this->get_output_dims(); } else { - input_dims = get_output_dims(); - output_dims = get_input_dims(); + input_dims = this->get_output_dims(); + output_dims = this->get_input_dims(); } - const auto& kernel_dims = get_kernel_dims(); + const auto& kernel_dims = this->get_kernel_dims(); const auto& kernel_size = std::accumulate(kernel_dims.begin(), kernel_dims.end(), 1, std::multiplies()); @@ -851,8 +865,8 @@ class base_convolution_layer : public Layer { // Apply transposed convolution to current input column input_col.LockedAttach(n, k, local_input.LockedBuffer(0, col), n); El::Gemm(El::NORMAL, El::TRANSPOSE, - DataType(1), kernel_matrix, input_col, - DataType(0), im2col_matrix); + TensorDataType(1), kernel_matrix, input_col, + TensorDataType(0), im2col_matrix); // Perform col2im to accumulate contributions from each kernel // position @@ -873,24 +887,24 @@ class base_convolution_layer : public Layer { void apply_bias_cpu() { // Return immediately if there is no bias - if (m_bias_scaling_factor == DataType(0)) return; + if (m_bias_scaling_factor == TensorDataType(0)) return; // Local matrices - const auto& local_bias = m_weights[1]->get_values().LockedMatrix(); - auto& local_output = get_local_activations(); + const auto& local_bias = this->get_data_type_weights(1).get_values().LockedMatrix(); + auto& local_output = this->get_local_activations(); // Matrix 
parameters const El::Int local_width = local_output.Width(); - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const El::Int num_output_channels = output_dims[0]; - const El::Int num_per_output_channel = get_output_size() / num_output_channels; + const El::Int num_per_output_channel = this->get_output_size() / num_output_channels; // Apply bias to each output channel LBANN_OMP_PARALLEL_FOR for (El::Int channel = 0; channel < num_output_channels; ++channel) { const El::Int row_start = channel * num_per_output_channel; const El::Int row_end = (channel+1) * num_per_output_channel; - const DataType bias_term = m_bias_scaling_factor * local_bias(channel, 0); + const TensorDataType bias_term = m_bias_scaling_factor * local_bias(channel, 0); for (El::Int col = 0; col < local_width; ++col) { for (El::Int row = row_start; row < row_end; ++row) { local_output(row, col) += bias_term; @@ -903,29 +917,29 @@ class base_convolution_layer : public Layer { void compute_gradients_im2col(bool using_transposed_convolution) { // Local matrices - const DMat& local_input = get_local_prev_activations(); - const DMat& local_gradient_wrt_output = get_local_prev_error_signals(); + const DMat& local_input = this->get_local_prev_activations(); + const DMat& local_gradient_wrt_output = this->get_local_prev_error_signals(); const bool has_local_data = (!local_input.IsEmpty() && !local_gradient_wrt_output.IsEmpty()); // Get convolution parameters const El::Int local_width = local_input.Width(); - const auto& input_dims = get_input_dims(); - const auto& output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(); + const auto& output_dims = this->get_output_dims(); const int num_input_channels = input_dims[0]; const int num_output_channels = output_dims[0]; - const int num_per_output_channel = get_output_size() / num_output_channels; - const auto& kernel_dims = get_kernel_dims(); + const int num_per_output_channel = this->get_output_size() / num_output_channels; + const auto& kernel_dims = this->get_kernel_dims(); const auto& kernel_size = std::accumulate(kernel_dims.begin(), kernel_dims.end(), 1, std::multiplies()); // Compute bias gradient // Note: Sum is computed with Kahan summation - if (m_bias_scaling_factor != DataType(0) - && this->m_weights[1]->get_optimizer() != nullptr) { - optimizer* bias_optimizer = this->m_weights[1]->get_optimizer(); - DataType dst_scale = DataType(0), gradient_scale = DataType(0); + if (m_bias_scaling_factor != TensorDataType(0) + && this->get_data_type_weights(1).get_optimizer() != nullptr) { + OptimizerType* bias_optimizer = this->get_data_type_weights(1).get_optimizer(); + TensorDataType dst_scale = TensorDataType(0), gradient_scale = TensorDataType(0); auto& bias_gradient = bias_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); if (has_local_data) { @@ -934,13 +948,13 @@ class base_convolution_layer : public Layer { for (int channel = 0; channel < num_output_channels; ++channel) { const El::Int row_start = channel * num_per_output_channel; const El::Int row_end = (channel+1) * num_per_output_channel; - DataType sum = 0; - DataType correction = 0; + TensorDataType sum = 0; + TensorDataType correction = 0; for (El::Int col = 0; col < local_width; ++col) { for (El::Int row = row_start; row < row_end; ++row) { - DataType term = local_gradient_wrt_output(row, col); + TensorDataType term = local_gradient_wrt_output(row, col); term += correction; - const DataType next_sum = sum + term; + const 
TensorDataType next_sum = sum + term; correction = term - (next_sum - sum); sum = next_sum; } @@ -954,7 +968,7 @@ class base_convolution_layer : public Layer { } // Stop early if kernel is not being optimized - optimizer* kernel_optimizer = this->m_weights[0]->get_optimizer(); + OptimizerType* kernel_optimizer = this->get_data_type_weights(0).get_optimizer(); if (kernel_optimizer == nullptr) { return; } // Initialize matrices @@ -965,9 +979,9 @@ class base_convolution_layer : public Layer { num_input_channels : num_output_channels); const int k = (using_transposed_convolution ? - get_input_size() / num_input_channels : - get_output_size() / num_output_channels); - DataType dst_scale = 0, gradient_scale = 0; + this->get_input_size() / num_input_channels : + this->get_output_size() / num_output_channels); + TensorDataType dst_scale = 0, gradient_scale = 0; auto& kernel_gradient = kernel_optimizer->get_gradient_buffer( dst_scale, gradient_scale, true); El::Scale(dst_scale, kernel_gradient); @@ -990,7 +1004,7 @@ class base_convolution_layer : public Layer { m_strides.data()); El::Gemm(El::NORMAL, El::NORMAL, gradient_scale, im2col_matrix, input_col, - DataType(1), kernel_gradient_matrix); + TensorDataType(1), kernel_gradient_matrix); } else { const DMat input_col @@ -1006,7 +1020,7 @@ class base_convolution_layer : public Layer { m_strides.data()); El::Gemm(El::NORMAL, El::NORMAL, gradient_scale, im2col_matrix, gradient_wrt_output_col, - DataType(1), kernel_gradient_matrix); + TensorDataType(1), kernel_gradient_matrix); } } @@ -1112,14 +1126,14 @@ class base_convolution_layer : public Layer { cudnnConvolutionFwdAlgo_t get_forward_algo_cudnn( const int local_mini_batch_size, const cudnnTensorDescriptor_t& input_desc, - const DataType* input, + const TensorDataType* input, const cudnnFilterDescriptor_t& kernel_desc, - const DataType* kernel, + const TensorDataType* kernel, const cudnnConvolutionDescriptor_t& conv_desc, const cudnnTensorDescriptor_t& output_desc, - DataType* output, + TensorDataType* output, size_t ws_size, - DataType* ws) { + TensorDataType* ws) { if (m_fwd_cudnn_algos.count(local_mini_batch_size) == 0) { #ifdef LBANN_DETERMINISTIC bool deterministic = true; @@ -1142,14 +1156,14 @@ class base_convolution_layer : public Layer { cudnnConvolutionBwdDataAlgo_t get_backward_data_algo_cudnn( const int local_mini_batch_size, const cudnnFilterDescriptor_t& kernel_desc, - const DataType* kernel, + const TensorDataType* kernel, const cudnnTensorDescriptor_t& prev_error_signal_desc, - const DataType* prev_error_signal, + const TensorDataType* prev_error_signal, const cudnnConvolutionDescriptor_t& conv_desc, const cudnnTensorDescriptor_t& error_signal_desc, - DataType* error_signal, + TensorDataType* error_signal, size_t ws_size, - DataType* ws) { + TensorDataType* ws) { if (m_bwd_data_cudnn_algos.count(local_mini_batch_size) == 0) { #ifdef LBANN_DETERMINISTIC bool deterministic = true; @@ -1175,13 +1189,13 @@ class base_convolution_layer : public Layer { cudnnConvolutionBwdFilterAlgo_t get_backward_filter_algo_cudnn( const int local_mini_batch_size, const cudnnTensorDescriptor_t& input_desc, - const DataType* input, + const TensorDataType* input, const cudnnTensorDescriptor_t& prev_error_signal_desc, - const DataType* prev_error_signal, + const TensorDataType* prev_error_signal, const cudnnConvolutionDescriptor_t& conv_desc, const cudnnFilterDescriptor_t& kernel_gradient_desc, size_t ws_size, - DataType* ws) { + TensorDataType* ws) { if 
(m_bwd_filter_cudnn_algos.count(local_mini_batch_size) == 0) { #ifdef LBANN_DETERMINISTIC bool deterministic = true; @@ -1189,12 +1203,12 @@ class base_convolution_layer : public Layer { bool deterministic = false; #endif // Temporary filter gradient buffer. - GPUMat kernel_gradient; + El::Matrix kernel_gradient; #ifdef HYDROGEN_HAVE_CUB kernel_gradient.SetMemoryMode(1); #endif - kernel_gradient.Resize(this->m_weights[0]->get_matrix_height(), - this->m_weights[0]->get_matrix_width()); + kernel_gradient.Resize(this->get_data_type_weights(0).get_matrix_height(), + this->get_data_type_weights(0).get_matrix_width()); m_bwd_filter_cudnn_algos[local_mini_batch_size] = cudnn::get_bwd_filter_algorithm( true, deterministic, @@ -1212,9 +1226,9 @@ class base_convolution_layer : public Layer { }; #ifndef LBANN_BASE_CONVOLUTION_LAYER_INSTANTIATE -extern template class base_convolution_layer; +extern template class base_convolution_layer; #ifdef LBANN_HAS_GPU -extern template class base_convolution_layer; +extern template class base_convolution_layer; #endif // LBANN_HAS_GPU #endif // LBANN_BASE_CONVOLUTION_LAYER_INSTANTIATE diff --git a/include/lbann/layers/learning/channelwise_scale_bias.hpp b/include/lbann/layers/learning/channelwise_scale_bias.hpp index 02eb1c74694..ee4bd37e210 100644 --- a/include/lbann/layers/learning/channelwise_scale_bias.hpp +++ b/include/lbann/layers/learning/channelwise_scale_bias.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED #define LBANN_LAYER_LEARNING_CHANNELWISE_SCALE_BIAS_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/models/model.hpp" #include "lbann/utils/exception.hpp" @@ -52,24 +52,39 @@ namespace lbann { * column correspond to scale terms and the second column to bias * terms. */ -template -class channelwise_scale_bias_layer : public Layer { +class channelwise_scale_bias_layer : public data_type_layer { static_assert(Layout == data_layout::DATA_PARALLEL, "channelwise_mean_layer only supports " "data-parallel data layout"); +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + ///@} + public: channelwise_scale_bias_layer(lbann_comm *comm) - : Layer(comm) { + : data_type_layer(comm) { } channelwise_scale_bias_layer(const channelwise_scale_bias_layer& other) - : Layer(other), + : data_type_layer(other), m_weights_gradient(other.m_weights_gradient ? other.m_weights_gradient->Copy() : nullptr) {} channelwise_scale_bias_layer& operator=(const channelwise_scale_bias_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_weights_gradient.reset(other.m_weights_gradient ? 
other.m_weights_gradient->Copy() : nullptr); @@ -84,42 +99,43 @@ class channelwise_scale_bias_layer : public Layer { El::Device get_device_allocation() const override { return Device; } void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); + data_type_layer::setup_matrices(grid); m_weights_gradient.reset(new StarMat(grid)); } void setup_data() override { - Layer::setup_data(); - const El::Int num_channels = get_output_dims()[0]; + data_type_layer::setup_data(); + const El::Int num_channels = this->get_output_dims()[0]; // Construct default weights if needed // Note: Scale is initialized to 1 and bias to 0 - if (this->m_weights.empty()) { - auto w = make_unique(get_comm()); - std::vector vals(2*num_channels, DataType{0}); - std::fill(vals.begin(), vals.begin()+num_channels, DataType{1}); - auto init = make_unique(vals); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_weights"); + if (!this->has_weights()) { + auto w = make_unique(this->get_comm()); + std::vector vals(2*num_channels, TensorDataType{0}); + std::fill(vals.begin(), vals.begin()+num_channels, TensorDataType{1}); + auto init = make_unique>(vals); + auto opt = to_unique_ptr(dynamic_cast( + this->m_model->create_optimizer())); + w->set_name(this->get_name() + "_weights"); w->set_initializer(std::move(init)); w->set_optimizer(std::move(opt)); - this->m_weights.push_back(w.get()); + this->add_weights(w.get()); this->m_model->add_weights(std::move(w)); } - if (this->m_weights.size() != 1) { + if (this->num_weights() != 1) { LBANN_ERROR("attempted to setup ", this->get_type()," layer \"",this->get_name(),"\" ", "with an invalid number of weights ", - "(expected 1, found ",this->m_weights.size(),")"); + "(expected 1, found ",this->num_weights(),")"); } // Setup weights - auto dist = get_prev_activations().DistData(); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; dist.rowDist = El::STAR; - m_weights[0]->set_dims({static_cast(num_channels)}, - {static_cast(2)}); - m_weights[0]->set_matrix_distribution(dist); + this->get_data_type_weights(0).set_dims({static_cast(num_channels)}, + {2}); + this->get_data_type_weights(0).set_matrix_distribution(dist); // Setup gradient w.r.t. weights m_weights_gradient->AlignWith(dist); @@ -134,16 +150,16 @@ class channelwise_scale_bias_layer : public Layer { private: /** Objective function gradient w.r.t. weights. */ - std::unique_ptr m_weights_gradient; + std::unique_ptr m_weights_gradient; }; #ifndef LBANN_CHANNELWISE_SCALE_BIAS_LAYER_INSTANTIATE extern template class channelwise_scale_bias_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class channelwise_scale_bias_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_CHANNELWISE_SCALE_BIAS_LAYER_INSTANTIATE diff --git a/include/lbann/layers/learning/convolution.hpp b/include/lbann/layers/learning/convolution.hpp index 155ddfde3b9..422393ec888 100644 --- a/include/lbann/layers/learning/convolution.hpp +++ b/include/lbann/layers/learning/convolution.hpp @@ -43,9 +43,10 @@ class imcomm; * tensors. This is primarily optimized for image data in NCHW * format. 
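 *
 * The usual per-dimension output-size arithmetic (a sketch of the
 * standard convolution convention, stated for reference rather than
 * quoted from this file):
 * @code
 * // out = (in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1
 * @endcode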
*/ -template -class convolution_layer : public base_convolution_layer { +class convolution_layer : public base_convolution_layer { static_assert(Layout == data_layout::DATA_PARALLEL, "convolution layer only supports DATA_PARALLEL"); private: @@ -82,7 +83,7 @@ class convolution_layer : public base_convolution_layer { std::vector dilations, int groups, bool has_bias = true) - : base_convolution_layer( + : base_convolution_layer( comm, num_data_dims, num_output_channels, @@ -105,7 +106,7 @@ class convolution_layer : public base_convolution_layer { protected: void setup_dims() override { - base_convolution_layer::setup_dims(); + base_convolution_layer::setup_dims(); // Get tensor dimensions const auto& input_dims = this->get_input_dims(); @@ -140,21 +141,21 @@ class convolution_layer : public base_convolution_layer { void fp_compute() override { if(this->using_gpus()) { - base_convolution_layer::apply_convolution_cudnn(true); - base_convolution_layer::apply_bias_cudnn(); + base_convolution_layer::apply_convolution_cudnn(true); + base_convolution_layer::apply_bias_cudnn(); } else { - base_convolution_layer::apply_convolution_im2col(true); - base_convolution_layer::apply_bias_cpu(); + base_convolution_layer::apply_convolution_im2col(true); + base_convolution_layer::apply_bias_cpu(); } } void bp_compute() override { if(this->using_gpus()) { - base_convolution_layer::compute_gradients_cudnn(false); - base_convolution_layer::apply_transposed_convolution_cudnn(false); + base_convolution_layer::compute_gradients_cudnn(false); + base_convolution_layer::apply_transposed_convolution_cudnn(false); } else { - base_convolution_layer::compute_gradients_im2col(false); - base_convolution_layer::apply_transposed_convolution_im2col(false); + base_convolution_layer::compute_gradients_im2col(false); + base_convolution_layer::apply_transposed_convolution_im2col(false); } } @@ -162,10 +163,10 @@ class convolution_layer : public base_convolution_layer { #ifndef LBANN_CONVOLUTION_LAYER_INSTANTIATE extern template class convolution_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class convolution_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_CONVOLUTION_LAYER_INSTANTIATE diff --git a/include/lbann/layers/learning/deconvolution.hpp b/include/lbann/layers/learning/deconvolution.hpp index 2cb5c93391b..8c60ca8d921 100644 --- a/include/lbann/layers/learning/deconvolution.hpp +++ b/include/lbann/layers/learning/deconvolution.hpp @@ -38,8 +38,8 @@ class imcomm; } /** @brief Transpose of the convolution layer. 
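 *
 * The matching transposed-convolution output size per spatial
 * dimension (again a sketch of the standard convention):
 * @code
 * // out = (in - 1) * stride - 2*pad + dilation*(kernel - 1) + 1
 * @endcode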
*/ -template -class deconvolution_layer : public base_convolution_layer { +template +class deconvolution_layer : public base_convolution_layer { static_assert(Layout == data_layout::DATA_PARALLEL, "deconvolution layer only supports DATA_PARALLEL"); private: @@ -76,7 +76,7 @@ class deconvolution_layer : public base_convolution_layer { std::vector dilations, int groups, bool has_bias = true) - : base_convolution_layer( + : base_convolution_layer( comm, num_data_dims, num_output_channels, @@ -97,7 +97,7 @@ class deconvolution_layer : public base_convolution_layer { El::Device get_device_allocation() const override { return Device; } void setup_dims() override { - base_convolution_layer::setup_dims(); + base_convolution_layer::setup_dims(); std::stringstream err; // Get tensor dimensions @@ -155,21 +155,21 @@ class deconvolution_layer : public base_convolution_layer { void fp_compute() override { if(this->using_gpus()) { - base_convolution_layer::apply_transposed_convolution_cudnn(true); - base_convolution_layer::apply_bias_cudnn(); + base_convolution_layer::apply_transposed_convolution_cudnn(true); + base_convolution_layer::apply_bias_cudnn(); } else { - base_convolution_layer::apply_transposed_convolution_im2col(true); - base_convolution_layer::apply_bias_cpu(); + base_convolution_layer::apply_transposed_convolution_im2col(true); + base_convolution_layer::apply_bias_cpu(); } } void bp_compute() override { if(this->using_gpus()) { - base_convolution_layer::compute_gradients_cudnn(true); - base_convolution_layer::apply_convolution_cudnn(false); + base_convolution_layer::compute_gradients_cudnn(true); + base_convolution_layer::apply_convolution_cudnn(false); } else { - base_convolution_layer::compute_gradients_im2col(true); - base_convolution_layer::apply_convolution_im2col(false); + base_convolution_layer::compute_gradients_im2col(true); + base_convolution_layer::apply_convolution_im2col(false); } } @@ -177,10 +177,10 @@ class deconvolution_layer : public base_convolution_layer { #ifndef LBANN_DECONVOLUTION_LAYER_INSTANTIATE extern template class deconvolution_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class deconvolution_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_DECONVOLUTION_LAYER_INSTANTIATE diff --git a/include/lbann/layers/learning/embedding.hpp b/include/lbann/layers/learning/embedding.hpp index b73a4b979ce..0bd0409a52f 100644 --- a/include/lbann/layers/learning/embedding.hpp +++ b/include/lbann/layers/learning/embedding.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED #define LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/models/model.hpp" #include "lbann/utils/memory.hpp" @@ -45,10 +45,25 @@ namespace lbann { * weights matrix. Note that this is the transpose of the weights in * the PyTorch embedding layer. */ -template -class embedding_layer : public Layer { +template +class embedding_layer : public data_type_layer { static_assert(Layout == data_layout::DATA_PARALLEL, "embedding layer only supports data parallel layout"); +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. 
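   * For this layer the weights hold the embedding dictionary as an
   * embedding_dim x num_embeddings matrix, so a lookup is, in
   * hypothetical notation,
   * @code
   * // y = E.col(static_cast<El::Int>(x))
   * @endcode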
diff --git a/include/lbann/layers/learning/embedding.hpp b/include/lbann/layers/learning/embedding.hpp
index b73a4b979ce..0bd0409a52f 100644
--- a/include/lbann/layers/learning/embedding.hpp
+++ b/include/lbann/layers/learning/embedding.hpp
@@ -27,7 +27,7 @@
 #ifndef LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED
 #define LBANN_LAYERS_LEARNING_EMBEDDING_HPP_INCLUDED

-#include "lbann/layers/layer.hpp"
+#include "lbann/layers/data_type_layer.hpp"
 #include "lbann/models/model.hpp"
 #include "lbann/utils/memory.hpp"

@@ -45,10 +45,25 @@ namespace lbann {
  *  weights matrix. Note that this is the transpose of the weights in
  *  the PyTorch embedding layer.
  */
-template <data_layout Layout, El::Device Device>
-class embedding_layer : public Layer {
+template <typename TensorDataType, data_layout Layout, El::Device Device>
+class embedding_layer : public data_type_layer<TensorDataType> {
   static_assert(Layout == data_layout::DATA_PARALLEL,
                 "embedding layer only supports data parallel layout");
+public:
+  /** @name Public Types */
+  ///@{
+
+  /** @brief The tensor type expected in this object. */
+  using AbsDistMatrixType = El::AbstractDistMatrix<TensorDataType>;
+
+  /** @brief The concrete weights type used by this object. */
+  using WeightsType = data_type_weights<TensorDataType>;
+
+  /** @brief The concrete optimizer type used by this object. */
+  using OptimizerType = data_type_optimizer<TensorDataType>;
+
+  ///@}
+
 public:

   /**
@@ -101,7 +116,7 @@ class embedding_layer : public Layer {
   El::Int m_padding_idx;

   /** Gradient w.r.t. embedding weights. */
-  std::unique_ptr<AbsDistMat> m_gradient_wrt_embeddings;
+  std::unique_ptr<AbsDistMatrixType> m_gradient_wrt_embeddings;

 };

@@ -109,21 +124,21 @@ class embedding_layer : public Layer {
 // Implementation
 // =========================================================

-template <data_layout Layout, El::Device Device>
-embedding_layer<Layout, Device>::embedding_layer(
+template <typename TensorDataType, data_layout Layout, El::Device Device>
+embedding_layer<TensorDataType, Layout, Device>::embedding_layer(
   lbann_comm* comm,
   size_t num_embeddings,
   size_t embedding_dim,
   El::Int padding_idx)
-  : Layer(comm),
+  : data_type_layer<TensorDataType>(comm),
     m_num_embeddings{num_embeddings},
     m_embedding_dim{embedding_dim},
     m_padding_idx{padding_idx} {}

-template <data_layout Layout, El::Device Device>
-embedding_layer<Layout, Device>::embedding_layer(
-  const embedding_layer<Layout, Device>& other)
-  : Layer(other),
+template <typename TensorDataType, data_layout Layout, El::Device Device>
+embedding_layer<TensorDataType, Layout, Device>::embedding_layer(
+  const embedding_layer<TensorDataType, Layout, Device>& other)
+  : data_type_layer<TensorDataType>(other),
     m_num_embeddings{other.m_num_embeddings},
     m_embedding_dim{other.m_embedding_dim},
     m_padding_idx{other.m_padding_idx},
@@ -131,10 +146,10 @@ embedding_layer::embedding_layer(
       ? other.m_gradient_wrt_embeddings->Copy() : nullptr) {}

-template <data_layout Layout, El::Device Device>
-embedding_layer<Layout, Device>& embedding_layer<Layout, Device>::operator=(
-  const embedding_layer<Layout, Device>& other) {
-  Layer::operator=(other);
+template <typename TensorDataType, data_layout Layout, El::Device Device>
+embedding_layer<TensorDataType, Layout, Device>& embedding_layer<TensorDataType, Layout, Device>::operator=(
+  const embedding_layer<TensorDataType, Layout, Device>& other) {
+  data_type_layer<TensorDataType>::operator=(other);
   m_num_embeddings = other.m_num_embeddings;
   m_embedding_dim = other.m_embedding_dim;
   m_padding_idx = other.m_padding_idx;
@@ -144,18 +159,18 @@ embedding_layer& embedding_layer::operator=(
   return *this;
 }

-template <data_layout Layout, El::Device Device>
-description embedding_layer<Layout, Device>::get_description() const {
-  auto desc = Layer::get_description();
+template <typename TensorDataType, data_layout Layout, El::Device Device>
+description embedding_layer<TensorDataType, Layout, Device>::get_description() const {
+  auto desc = data_type_layer<TensorDataType>::get_description();
   desc.add("Num embeddings", m_num_embeddings);
   desc.add("Embedding dim", m_embedding_dim);
   desc.add("Padding index", m_padding_idx);
   return desc;
 }

-template <data_layout Layout, El::Device Device>
-void embedding_layer<Layout, Device>::setup_dims() {
-  Layer::setup_dims();
+template <typename TensorDataType, data_layout Layout, El::Device Device>
+void embedding_layer<TensorDataType, Layout, Device>::setup_dims() {
+  data_type_layer<TensorDataType>::setup_dims();

   // Make sure input dimensions are valid
   if (this->get_input_size() != 1) {
@@ -174,33 +189,34 @@ void embedding_layer::setup_dims() {

 }

-template <data_layout Layout, El::Device Device>
-void embedding_layer<Layout, Device>::setup_data() {
-  Layer::setup_data();
+template <typename TensorDataType, data_layout Layout, El::Device Device>
+void embedding_layer<TensorDataType, Layout, Device>::setup_data() {
+  data_type_layer<TensorDataType>::setup_data();

   // Construct default weights if needed
   // Note: Randomly drawn from normal distribution with mean 0 and
   // standard deviation 1.
-  if (this->m_weights.empty()) {
-    auto w = make_unique<weights>(get_comm());
-    auto init = make_unique<normal_initializer>(0,1);
-    auto opt = std::unique_ptr<optimizer>(m_model->create_optimizer());
+  if (!this->has_weights()) {
+    auto w = make_unique<WeightsType>(this->get_comm());
+    auto init = make_unique<normal_initializer<TensorDataType>>(0,1);
+    auto opt = to_unique_ptr(dynamic_cast<OptimizerType*>(
+                               this->m_model->create_optimizer()));
     w->set_name(this->get_name() + "_weights");
     w->set_initializer(std::move(init));
     w->set_optimizer(std::move(opt));
-    this->m_weights.push_back(w.get());
+    this->add_weights(w.get());
     this->m_model->add_weights(std::move(w));
   }
-  if (this->m_weights.size() != 1) {
+  if (this->num_weights() != 1) {
     LBANN_ERROR("attempted to setup ",
                 this->get_type()," layer \"",this->get_name(),"\" ",
                 "with an invalid number of weights ",
-                "(expected 1, found ",this->m_weights.size(),")");
+                "(expected 1, found ",this->num_weights(),")");
   }

   // Initialize dictionary
-  auto& embeddings = *m_weights[0];
-  auto matrix_dist = get_prev_activations().DistData();
+  auto& embeddings = this->get_data_type_weights(0);
+  auto matrix_dist = this->get_prev_activations().DistData();
   matrix_dist.colDist = El::STAR;
   matrix_dist.rowDist = El::STAR;
   embeddings.set_dims({static_cast<int>(m_embedding_dim)},
@@ -226,10 +242,10 @@ void embedding_layer::setup_data() {

 #ifndef LBANN_EMBEDDING_LAYER_INSTANTIATE
 extern template class embedding_layer<
-  data_layout::DATA_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::CPU>;
 #ifdef LBANN_HAS_GPU
 extern template class embedding_layer<
-  data_layout::DATA_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::GPU>;
 #endif // LBANN_HAS_GPU
 #endif // LBANN_EMBEDDING_LAYER_INSTANTIATE
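For orientation, the embedding lookup described by the header above reduces to: interpret the scalar input as a column index into the dictionary, copy that column forward, and route gradients only into that column; the padding index, if set, yields zeros and receives no update. A toy single-sample version (illustrative only; LBANN stores the dictionary as a distributed weights matrix):

    #include <cstdio>
    #include <vector>

    struct embedding_demo {
      int dim, num_embeddings, padding_idx;
      std::vector<double> weights;  // dim x num_embeddings, column-major

      std::vector<double> forward(int index) const {
        std::vector<double> out(dim, 0.0);
        if (index == padding_idx) return out;   // padding index -> zero vector
        for (int r = 0; r < dim; ++r)
          out[r] = weights[index * dim + r];    // copy column `index`
        return out;
      }

      // Gradient w.r.t. the dictionary is nonzero only in the looked-up
      // column; the padding column is never updated.
      void backward(int index, const std::vector<double>& grad_wrt_output,
                    std::vector<double>& grad_wrt_weights) const {
        if (index == padding_idx) return;
        for (int r = 0; r < dim; ++r)
          grad_wrt_weights[index * dim + r] += grad_wrt_output[r];
      }
    };

    int main() {
      embedding_demo e{2, 3, /*padding_idx=*/0,
                       {0, 0,  1, 2,  3, 4}};   // columns: pad, e1, e2
      auto v = e.forward(2);
      std::printf("%g %g\n", v[0], v[1]);        // prints: 3 4
    }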
diff --git a/include/lbann/layers/learning/entrywise_scale_bias.hpp b/include/lbann/layers/learning/entrywise_scale_bias.hpp
index e7ff19a1bfb..8eb44806661 100644
--- a/include/lbann/layers/learning/entrywise_scale_bias.hpp
+++ b/include/lbann/layers/learning/entrywise_scale_bias.hpp
@@ -27,7 +27,7 @@
 #ifndef LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED
 #define LBANN_LAYER_LEARNING_ENTRYWISE_SCALE_BIAS_HPP_INCLUDED

-#include "lbann/layers/layer.hpp"
+#include "lbann/layers/data_type_layer.hpp"
 #include "lbann/models/model.hpp"
 #include "lbann/utils/exception.hpp"

@@ -49,20 +49,36 @@ namespace lbann {
  *  column correspond to scale terms and the second column to bias
  *  terms.
  */
-template <data_layout Layout, El::Device Device>
-class entrywise_scale_bias_layer : public Layer {
+template <typename TensorDataType, data_layout Layout, El::Device Device>
+class entrywise_scale_bias_layer : public data_type_layer<TensorDataType> {
+public:
+  /** @name Public Types */
+  ///@{
+
+  /** @brief The tensor type expected in this object. */
+  using AbsDistMatrixType = El::AbstractDistMatrix<TensorDataType>;
+
+  /** @brief The concrete weights type used by this object. */
+  using WeightsType = data_type_weights<TensorDataType>;
+
+  /** @brief The concrete optimizer type used by this object. */
+  using OptimizerType = data_type_optimizer<TensorDataType>;
+
+  ///@}
+
 public:

   entrywise_scale_bias_layer(lbann_comm *comm)
-    : Layer(comm) {}
+    : data_type_layer<TensorDataType>(comm) {}

   entrywise_scale_bias_layer(const entrywise_scale_bias_layer& other)
-    : Layer(other),
+    : data_type_layer<TensorDataType>(other),
       m_weights_gradient(other.m_weights_gradient
                          ? other.m_weights_gradient->Copy() : nullptr) {}

   entrywise_scale_bias_layer& operator=(const entrywise_scale_bias_layer& other) {
-    Layer::operator=(other);
+    data_type_layer<TensorDataType>::operator=(other);
     m_weights_gradient.reset(other.m_weights_gradient ?
                              other.m_weights_gradient->Copy() : nullptr);
@@ -77,47 +93,48 @@ class entrywise_scale_bias_layer : public Layer {
   El::Device get_device_allocation() const override { return Device; }

   void setup_matrices(const El::Grid& grid) override {
-    Layer::setup_matrices(grid);
-    auto dist = get_prev_activations().DistData();
+    data_type_layer<TensorDataType>::setup_matrices(grid);
+    auto dist = this->get_prev_activations().DistData();
     dist.rowDist = El::STAR;
-    m_weights_gradient.reset(AbsDistMat::Instantiate(dist));
+    m_weights_gradient.reset(AbsDistMatrixType::Instantiate(dist));
   }

   void setup_data() override {
-    Layer::setup_data();
+    data_type_layer<TensorDataType>::setup_data();

     // Initialize output dimensions
-    set_output_dims(get_input_dims());
-    const auto output_dims = get_output_dims();
-    const El::Int output_size = get_output_size();
+    this->set_output_dims(this->get_input_dims());
+    const auto output_dims = this->get_output_dims();
+    const El::Int output_size = this->get_output_size();

     // Construct default weights if needed
     // Note: Scale is initialized to 1 and bias to 0
-    if (this->m_weights.empty()) {
-      auto w = make_unique<weights>(get_comm());
-      std::vector<DataType> vals(2*output_size, DataType{0});
-      std::fill(vals.begin(), vals.begin()+output_size, DataType{1});
-      auto init = make_unique<value_initializer>(vals);
-      std::unique_ptr<optimizer> opt(m_model->create_optimizer());
-      w->set_name(get_name() + "_weights");
+    if (!this->has_weights()) {
+      auto w = make_unique<WeightsType>(this->get_comm());
+      std::vector<TensorDataType> vals(2*output_size, TensorDataType{0});
+      std::fill(vals.begin(), vals.begin()+output_size, TensorDataType{1});
+      auto init = make_unique<value_initializer<TensorDataType>>(vals);
+      auto opt = to_unique_ptr(dynamic_cast<OptimizerType*>(
+                                 this->m_model->create_optimizer()));
+      w->set_name(this->get_name() + "_weights");
       w->set_initializer(std::move(init));
       w->set_optimizer(std::move(opt));
-      this->m_weights.push_back(w.get());
+      this->add_weights(w.get());
       this->m_model->add_weights(std::move(w));
     }
-    if (this->m_weights.size() != 1) {
+    if (this->num_weights() != 1) {
       LBANN_ERROR("attempted to setup ",
                   this->get_type()," layer \"",this->get_name(),"\" ",
                   "with an invalid number of weights ",
-                  "(expected 1, found ",this->m_weights.size(),")");
+                  "(expected 1, found ",this->num_weights(),")");
     }

     // Setup weights
-    auto dist = get_prev_activations().DistData();
+    auto dist = this->get_prev_activations().DistData();
     dist.rowDist = El::STAR;
-    m_weights[0]->set_dims(output_dims,
-                           {static_cast<int>(2)});
-    m_weights[0]->set_matrix_distribution(dist);
+    this->get_data_type_weights(0).set_dims(output_dims,
+                                            {static_cast<int>(2)});
+    this->get_data_type_weights(0).set_matrix_distribution(dist);

     // Setup gradient w.r.t. weights
     m_weights_gradient->AlignWith(dist);
@@ -126,7 +143,7 @@ class entrywise_scale_bias_layer : public Layer {
   }

   void fp_setup_outputs(El::Int mini_batch_size) override {
-    Layer::fp_setup_outputs(mini_batch_size);
+    data_type_layer<TensorDataType>::fp_setup_outputs(mini_batch_size);

 #if 0 /// @todo See https://github.com/LLNL/lbann/issues/1123

     /// @todo Realign weights tensor if misaligned
     bool aligned = true;
     try {
-      const auto& x = get_prev_activations();
+      const auto& x = this->get_prev_activations();
       const auto& w = m_weights[0]->get_values();
       aligned = (x.ColAlign() == w.ColAlign()
                  && x.RowAlign() == w.RowAlign());
@@ -156,10 +173,10 @@ class entrywise_scale_bias_layer : public Layer {
   }

   void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override {
-    Layer::bp_setup_gradient_wrt_inputs(mini_batch_size);
+    data_type_layer<TensorDataType>::bp_setup_gradient_wrt_inputs(mini_batch_size);
     m_weights_gradient->Empty(false);
-    m_weights_gradient->AlignWith(get_prev_activations());
-    m_weights_gradient->Resize(get_input_size(), 2);
+    m_weights_gradient->AlignWith(this->get_prev_activations());
+    m_weights_gradient->Resize(this->get_input_size(), 2);
   }

 protected:
@@ -169,20 +186,20 @@ class entrywise_scale_bias_layer : public Layer {

 private:
   /** Objective function gradient w.r.t. weights. */
-  std::unique_ptr<AbsDistMat> m_weights_gradient;
+  std::unique_ptr<AbsDistMatrixType> m_weights_gradient;

 };

 #ifndef LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE
 extern template class entrywise_scale_bias_layer<
-  data_layout::DATA_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::CPU>;
 extern template class entrywise_scale_bias_layer<
-  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>;
 #ifdef LBANN_HAS_GPU
 extern template class entrywise_scale_bias_layer<
-  data_layout::DATA_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::GPU>;
 extern template class entrywise_scale_bias_layer<
-  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>;
 #endif // LBANN_HAS_GPU
 #endif // LBANN_ENTRYWISE_SCALE_BIAS_LAYER_INSTANTIATE
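As the header's doc comment says, the single weights tensor packs two columns: column 0 the per-entry scales, column 1 the per-entry biases. The computation and its gradients, stripped to scalars (illustrative only, not LBANN code):

    #include <cstdio>
    #include <vector>

    int main() {
      const int n = 4;
      std::vector<double> scale{2, 2, 1, 1};  // weights column 0
      std::vector<double> bias{0, 1, 0, 1};   // weights column 1
      std::vector<double> x{1, 2, 3, 4}, y(n);

      for (int i = 0; i < n; ++i)
        y[i] = scale[i] * x[i] + bias[i];     // y_i = a_i * x_i + b_i

      // Backward: dL/da_i = dL/dy_i * x_i, dL/db_i = dL/dy_i, and
      // dL/dx_i = dL/dy_i * a_i -- the first two land in the two columns
      // of m_weights_gradient.
      for (int i = 0; i < n; ++i) std::printf("%g ", y[i]);  // 2 5 3 5
      std::printf("\n");
    }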
diff --git a/include/lbann/layers/learning/fully_connected.hpp b/include/lbann/layers/learning/fully_connected.hpp
index a3573397a21..9d9e7b70b48 100644
--- a/include/lbann/layers/learning/fully_connected.hpp
+++ b/include/lbann/layers/learning/fully_connected.hpp
@@ -49,30 +49,45 @@ namespace lbann {
  *  initialized with He normal initialization and the bias weights are
  *  initialized to zero.
  */
-template <data_layout T_layout, El::Device Dev>
-class fully_connected_layer : public learning_layer {
+template <typename TensorDataType, data_layout T_layout, El::Device Dev>
+class fully_connected_layer : public learning_layer<TensorDataType> {
+public:
+  /** @name Public Types */
+  ///@{
+
+  /** @brief The tensor type expected in this object. */
+  using AbsDistMatrixType = El::AbstractDistMatrix<TensorDataType>;
+
+  /** @brief The concrete weights type used by this object. */
+  using WeightsType = data_type_weights<TensorDataType>;
+
+  /** @brief The concrete optimizer type used by this object. */
+  using OptimizerType = data_type_optimizer<TensorDataType>;
+
+  ///@}
+
 public:

   /** @todo Accept a vector for output_size */
   fully_connected_layer(lbann_comm *comm,
                         int output_size,
                         bool transpose = false,
-                        weights* weight = nullptr,
+                        WeightsType* weight = nullptr,
                         bool has_bias = true)
-    : learning_layer(comm),
+    : learning_layer<TensorDataType>(comm),
       m_bias_gradient(nullptr),
       m_transpose(transpose) {

     // Initialize output tensor dimensions
-    set_output_dims({output_size});
+    this->set_output_dims({output_size});

     // Initialize bias
-    m_bias_scaling_factor = has_bias ? DataType(1) : DataType(0);
+    m_bias_scaling_factor = has_bias ? TensorDataType(1) : TensorDataType(0);

   }

   fully_connected_layer(const fully_connected_layer& other) :
-    learning_layer(other),
+    learning_layer<TensorDataType>(other),
     m_bias_scaling_factor(other.m_bias_scaling_factor),
     m_transpose(other.m_transpose) {
@@ -85,7 +100,7 @@ class fully_connected_layer : public learning_layer {
   }

   fully_connected_layer& operator=(const fully_connected_layer& other) {
-    learning_layer::operator=(other);
+    learning_layer<TensorDataType>::operator=(other);
     m_bias_scaling_factor = other.m_bias_scaling_factor;
     m_transpose = other.m_transpose;
@@ -112,8 +127,8 @@ class fully_connected_layer : public learning_layer {
   El::Device get_device_allocation() const override { return Dev; }

   description get_description() const override {
-    auto desc = learning_layer::get_description();
-    const auto& bias_str = (m_bias_scaling_factor == DataType(0) ?
+    auto desc = learning_layer<TensorDataType>::get_description();
+    const auto& bias_str = (m_bias_scaling_factor == TensorDataType(0) ?
                             "disabled" : "enabled");
     desc.add("Bias", bias_str);
     return desc;
@@ -124,69 +139,68 @@ class fully_connected_layer : public learning_layer {

   void setup_matrices(const El::Grid& grid) override;

   void setup_data() override {
-    learning_layer::setup_data();
+    learning_layer<TensorDataType>::setup_data();

     // Initialize default weights if none are provided
-    if (this->m_weights.size() > 2) {
-      std::stringstream err;
-      err << __FILE__ << " " << __LINE__ << " :: "
-          << "attempted to setup " << m_name << " with an invalid number of weights";
-      throw lbann_exception(err.str());
+    if (this->num_weights() > 2) {
+      LBANN_ERROR("attempted to setup ", this->get_name(), " with an invalid number of weights");
     }
-    if (m_bias_scaling_factor != DataType(0)) {
-      this->m_weights.resize(2, nullptr);
+    if (m_bias_scaling_factor != TensorDataType(0)) {
+      this->set_num_data_type_weights(2);
     } else {
-      this->m_weights.resize(1, nullptr);
+      this->set_num_data_type_weights(1);
     }
-    if (this->m_weights[0] == nullptr) {
-      auto w = make_unique<weights>(get_comm());
-      auto init = make_unique<he_initializer>(probability_distribution::gaussian);
-      std::unique_ptr<optimizer> opt(m_model->create_optimizer());
-      w->set_name(get_name() + "_linearity_weights");
+    if (!this->has_data_type_weights(0)) {
+      auto w = make_unique<WeightsType>(this->get_comm());
+      auto init = make_unique<he_initializer<TensorDataType>>(probability_distribution::gaussian);
+      auto opt = to_unique_ptr(dynamic_cast<OptimizerType*>(
+                                 this->get_model()->create_optimizer()));
+      w->set_name(this->get_name() + "_linearity_weights");
       w->set_initializer(std::move(init));
       w->set_optimizer(std::move(opt));
-      this->m_weights[0] = w.get();
+      this->set_data_type_weights(0, w.get());
       this->m_model->add_weights(std::move(w));
     }
-    auto& linearity_weights = *this->m_weights[0];
+    auto& linearity_weights = this->get_data_type_weights(0);

     // Initialize variance scaling initialization
     auto* cast_initializer
-      = dynamic_cast<variance_scaling_initializer*>(linearity_weights.get_initializer());
+      = dynamic_cast<variance_scaling_initializer<TensorDataType>*>(linearity_weights.get_initializer());
     if (cast_initializer != nullptr) {
-      cast_initializer->set_fan_in(get_input_size());
-      cast_initializer->set_fan_out(get_output_size());
+      cast_initializer->set_fan_in(this->get_input_size());
+      cast_initializer->set_fan_out(this->get_output_size());
     }

     // Setup linearity weights
-    auto linearity_dist = get_prev_activations().DistData();
+    auto linearity_dist = this->get_prev_activations().DistData();
     if (linearity_dist.colDist != El::MC
         || linearity_dist.rowDist != El::MR) {
       linearity_dist.colDist = El::STAR;
       linearity_dist.rowDist = El::STAR;
     }
     if (m_transpose) {
-      linearity_weights.set_dims(get_input_dims(), get_output_dims());
+      linearity_weights.set_dims(this->get_input_dims(), this->get_output_dims());
     } else {
-      linearity_weights.set_dims(get_output_dims(), get_input_dims());
+      linearity_weights.set_dims(this->get_output_dims(), this->get_input_dims());
     }
     linearity_weights.set_matrix_distribution(linearity_dist);

     // Set up bias if needed.
-    if (m_bias_scaling_factor != DataType(0)) {
-      if (this->m_weights[1] == nullptr) {
-        auto w = make_unique<weights>(get_comm());
-        std::unique_ptr<optimizer> opt(m_model->create_optimizer());
-        w->set_name(get_name() + "_bias_weights");
+    if (m_bias_scaling_factor != TensorDataType(0)) {
+      if (!this->has_data_type_weights(1)) {
+        auto w = make_unique<WeightsType>(this->get_comm());
+        auto opt = to_unique_ptr(dynamic_cast<OptimizerType*>(
+                                   this->get_model()->create_optimizer()));
+        w->set_name(this->get_name() + "_bias_weights");
         w->set_optimizer(std::move(opt));
-        this->m_weights[1] = w.get();
+        this->set_data_type_weights(1, w.get());
         this->m_model->add_weights(std::move(w));
       }
-      auto& bias_weights = *this->m_weights[1];
+      auto& bias_weights = this->get_data_type_weights(1);

       // Setup bias weights
-      auto bias_dist = get_activations().DistData();
+      auto bias_dist = this->get_activations().DistData();
       bias_dist.rowDist = El::STAR;
-      bias_weights.set_dims(get_output_dims());
+      bias_weights.set_dims(this->get_output_dims());
       bias_weights.set_matrix_distribution(bias_dist);
       if (this->m_bias_gradient != nullptr) {
         El::Zeros(*this->m_bias_gradient,
@@ -196,21 +210,19 @@ class fully_connected_layer : public learning_layer {

     // Initialize freeze state
-    for (auto&& w : this->m_weights) {
-      if (m_frozen) {
+    for (auto&& w : this->get_data_type_weights()) {
+      if (this->is_frozen()) {
         w->freeze();
       } else {
         w->unfreeze();
       }
     }
-    for (auto&& w : this->m_weights) {
-      if (w->is_frozen() != m_frozen) {
-        std::stringstream err;
-        err << (m_frozen ? "" : "un") << "frozen "
-            << "layer \"" << get_name() << "\" has "
-            << (w->is_frozen() ? "" : "un") << "frozen "
-            << "weights \"" << w->get_name() << "\"";
-        LBANN_ERROR(err.str());
+    for (auto&& w : this->get_data_type_weights()) {
+      if (w->is_frozen() != this->is_frozen()) {
+        LBANN_ERROR((this->is_frozen() ? "" : "un"), "frozen ",
+                    "layer \"", this->get_name(), "\" has ",
+                    (w->is_frozen() ? "" : "un"), "frozen ",
+                    "weights \"", w->get_name(), "\"");
       }
     }

@@ -224,13 +236,13 @@ class fully_connected_layer : public learning_layer {
   /** Scaling factor for bias term.
   *  If the scaling factor is zero, bias is not applied.
   */
-  DataType m_bias_scaling_factor;
+  TensorDataType m_bias_scaling_factor;

   /** Bias weights gradient.
   *  This is this layer's contribution to the objective function
   *  gradient w.r.t. the bias weights.
   */
-  AbsDistMat* m_bias_gradient;
+  AbsDistMatrixType* m_bias_gradient;

   /** Whether the transpose of the linearity matrix is applied. */
   bool m_transpose;
@@ -240,18 +252,22 @@ class fully_connected_layer : public learning_layer {
     if (m_bias_gradient != nullptr) delete m_bias_gradient;
   }

+  template <typename T, data_layout L, El::Device D>
+  friend void fp_compute_impl(fully_connected_layer<T, L, D>& l);
+  template <typename T, data_layout L, El::Device D>
+  friend void bp_compute_impl(fully_connected_layer<T, L, D>& l);
 };

 #ifndef LBANN_FULLY_CONNECTED_LAYER_INSTANTIATE
 extern template class fully_connected_layer<
-  data_layout::DATA_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::CPU>;
 extern template class fully_connected_layer<
-  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>;
 #ifdef LBANN_HAS_GPU
 extern template class fully_connected_layer<
-  data_layout::DATA_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::GPU>;
 extern template class fully_connected_layer<
-  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>;
 #endif // LBANN_HAS_GPU
 #endif // LBANN_FULLY_CONNECTED_LAYER_INSTANTIATE

diff --git a/include/lbann/layers/learning/learning.hpp b/include/lbann/layers/learning/learning.hpp
index 2f2ab120bc5..f3b0d5e451c 100644
--- a/include/lbann/layers/learning/learning.hpp
+++ b/include/lbann/layers/learning/learning.hpp
@@ -27,16 +27,18 @@
 #ifndef LBANN_LAYER_LEARNING_HPP_INCLUDED
 #define LBANN_LAYER_LEARNING_HPP_INCLUDED

-#include "lbann/layers/layer.hpp"
+#include "lbann/layers/data_type_layer.hpp"

 namespace lbann {

 /** @todo Remove. Layers should inherit directly from the base layer
  *  class. */
-class learning_layer : public Layer {
+
+template <typename TensorDataType>
+class learning_layer : public data_type_layer<TensorDataType> {
 public:
-  learning_layer(lbann_comm *comm) : Layer(comm) {}
+  learning_layer(lbann_comm *comm) : data_type_layer<TensorDataType>(comm) {}
 };

 } // namespace lbann
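The new friend declarations above exist so the compute kernels can live in free functions (specialized per device in the source files) while still reaching the layer's private members. A compilable analogue of the pattern (names are illustrative, not LBANN's):

    #include <cstdio>

    template <typename T> class layer_demo;             // forward declaration
    template <typename T> void fp_compute_impl(layer_demo<T>& l);

    template <typename T>
    class layer_demo {
      T m_state{};                                      // private kernel state
      // Grant every specialization of the kernel access:
      template <typename U> friend void fp_compute_impl(layer_demo<U>&);
    public:
      void fp_compute() { fp_compute_impl(*this); }
    };

    template <typename T>
    void fp_compute_impl(layer_demo<T>& l) {
      l.m_state += T(1);                                // allowed: friend
      std::printf("state is now %g\n", double(l.m_state));
    }

    int main() { layer_demo<float> l; l.fp_compute(); }

diff --git a/include/lbann/layers/loss/categorical_accuracy.hpp b/include/lbann/layers/loss/categorical_accuracy.hpp
index 12acae52184..1e76f97edca 100644
--- a/include/lbann/layers/loss/categorical_accuracy.hpp
+++ b/include/lbann/layers/loss/categorical_accuracy.hpp
@@ -27,7 +27,7 @@
 #ifndef LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED
 #define LBANN_LAYERS_LOSS_CATEGORICAL_ACCURACY_HPP_INCLUDED

-#include "lbann/layers/layer.hpp"
+#include "lbann/layers/data_type_layer.hpp"

 namespace lbann {

@@ -42,11 +42,11 @@ namespace lbann {
  *  This is primarily intended for use as a metric since it is not
  *  differentiable.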
 */
-template <data_layout T_layout, El::Device Dev>
-class categorical_accuracy_layer : public Layer {
+template <typename TensorDataType, data_layout T_layout, El::Device Dev>
+class categorical_accuracy_layer : public data_type_layer<TensorDataType> {
 public:

-  categorical_accuracy_layer(lbann_comm *comm) : Layer(comm) {
+  categorical_accuracy_layer(lbann_comm *comm) : data_type_layer<TensorDataType>(comm) {
     this->m_expected_num_parent_layers = 2;
   }

@@ -58,17 +58,17 @@ class categorical_accuracy_layer : public Layer {
   El::Device get_device_allocation() const override { return Dev; }

   void setup_dims() override {
-    Layer::setup_dims();
-    set_output_dims({1});
+    data_type_layer<TensorDataType>::setup_dims();
+    this->set_output_dims({1});

     // Check that input dimensions match
-    if (get_input_dims(0) != get_input_dims(1)) {
-      const auto& parents = get_parent_layers();
+    if (this->get_input_dims(0) != this->get_input_dims(1)) {
+      const auto& parents = this->get_parent_layers();
       std::stringstream err;
-      err << get_type() << " layer \"" << get_name() << "\" "
+      err << get_type() << " layer \"" << this->get_name() << "\" "
           << "has input tensors with different dimensions (";
-      for (int i = 0; i < get_num_parents(); ++i) {
-        const auto& dims = get_input_dims(i);
+      for (int i = 0; i < this->get_num_parents(); ++i) {
+        const auto& dims = this->get_input_dims(i);
         err << (i > 0 ? ", " : "")
             << "layer \"" << parents[i]->get_name() << "\" outputs ";
         for (size_t j = 0; j < dims.size(); ++j) {
@@ -87,14 +87,14 @@ class categorical_accuracy_layer : public Layer {

 #ifndef LBANN_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE
 extern template class categorical_accuracy_layer<
-  data_layout::DATA_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::CPU>;
 extern template class categorical_accuracy_layer<
-  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>;
 #ifdef LBANN_HAS_GPU
 extern template class categorical_accuracy_layer<
-  data_layout::DATA_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::GPU>;
 extern template class categorical_accuracy_layer<
-  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>;
 #endif // LBANN_HAS_GPU
 #endif // LBANN_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE

diff --git a/include/lbann/layers/loss/cross_entropy.hpp b/include/lbann/layers/loss/cross_entropy.hpp
index 96a7b27f66e..561db3efbc6 100644
--- a/include/lbann/layers/loss/cross_entropy.hpp
+++ b/include/lbann/layers/loss/cross_entropy.hpp
@@ -27,7 +27,7 @@
 #ifndef LBANN_LAYERS_LOSS_CROSS_ENTROPY_HPP_INCLUDED
 #define LBANN_LAYERS_LOSS_CROSS_ENTROPY_HPP_INCLUDED

-#include "lbann/layers/layer.hpp"
+#include "lbann/layers/data_type_layer.hpp"

 namespace lbann {

@@ -37,23 +37,32 @@ namespace lbann {
  *  distribution @f$\hat{y}@f$,
  *  @f[ CE(y,\hat{y}) = - \sum\limits_{i} \hat{y}_i \log y_i @f]
  */
-template <data_layout T_layout, El::Device Dev>
-class cross_entropy_layer : public Layer {
+template <typename TensorDataType, data_layout T_layout, El::Device Dev>
+class cross_entropy_layer : public data_type_layer<TensorDataType> {
 public:
+  /** @name Public Types */
+  ///@{

-  cross_entropy_layer(lbann_comm *comm) : Layer(comm) {
+  /** @brief The tensor type expected in this object. */
+  using AbsDistMatrixType = El::AbstractDistMatrix<TensorDataType>;
+
+  ///@}
+
+public:
+
+  cross_entropy_layer(lbann_comm *comm) : data_type_layer<TensorDataType>(comm) {
     this->m_expected_num_parent_layers = 2;
   }

   cross_entropy_layer(const cross_entropy_layer& other)
-    : Layer(other) {
+    : data_type_layer<TensorDataType>(other) {
     m_workspace.reset(other.m_workspace ?
                       other.m_workspace->Copy() : nullptr);
   }

   cross_entropy_layer& operator=(const cross_entropy_layer& other) {
-    Layer::operator=(other);
+    data_type_layer<TensorDataType>::operator=(other);
     m_workspace.reset(other.m_workspace ?
                       other.m_workspace->Copy() : nullptr);
@@ -66,17 +75,17 @@ class cross_entropy_layer : public Layer {
   El::Device get_device_allocation() const override { return Dev; }

   void setup_dims() override {
-    Layer::setup_dims();
-    set_output_dims({1});
+    data_type_layer<TensorDataType>::setup_dims();
+    this->set_output_dims({1});

     // Check that input dimensions match
-    if (get_input_dims(0) != get_input_dims(1)) {
-      const auto& parents = get_parent_layers();
+    if (this->get_input_dims(0) != this->get_input_dims(1)) {
+      const auto& parents = this->get_parent_layers();
       std::stringstream err;
-      err << get_type() << " layer \"" << get_name() << "\" "
+      err << get_type() << " layer \"" << this->get_name() << "\" "
          << "has input tensors with different dimensions (";
-      for (int i = 0; i < get_num_parents(); ++i) {
-        const auto& dims = get_input_dims(i);
+      for (int i = 0; i < this->get_num_parents(); ++i) {
+        const auto& dims = this->get_input_dims(i);
         err << (i > 0 ? ", " : "")
             << "layer \"" << parents[i]->get_name() << "\" outputs ";
         for (size_t j = 0; j < dims.size(); ++j) {
@@ -90,11 +99,11 @@ class cross_entropy_layer : public Layer {
   }

   void setup_data() override {
-    Layer::setup_data();
+    data_type_layer<TensorDataType>::setup_data();

     // Initialize workspace
-    const auto& prediction = get_prev_activations(0);
-    switch (get_data_layout()) {
+    const auto& prediction = this->get_prev_activations(0);
+    switch (this->get_data_layout()) {
     case data_layout::DATA_PARALLEL:
       m_workspace.reset(new StarVCMat(prediction.Grid(),
                                       prediction.Root()));
@@ -116,64 +125,52 @@ class cross_entropy_layer : public Layer {

   void fp_compute() override {

     // Initialize workspace
-    const auto& prediction = get_prev_activations(0);
+    const auto& prediction = this->get_prev_activations(0);
     m_workspace->AlignWith(prediction.DistData());
     m_workspace->Resize(1, prediction.Width());

     // Compute local contributions and accumulate
     /// @todo Consider reduce rather than allreduce
-    local_fp_compute(get_local_prev_activations(0),
-                     get_local_prev_activations(1),
-                     m_workspace->Matrix());
-    m_comm->allreduce(*m_workspace, m_workspace->RedundantComm());
-    El::Copy(*m_workspace, get_activations());
+    local_fp_compute();
+    this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm());
+    El::Copy(*m_workspace, this->get_activations());

   }

   void bp_compute() override {

     // Initialize workspace
-    const auto& prediction = get_prev_activations(0);
+    const auto& prediction = this->get_prev_activations(0);
     m_workspace->AlignWith(prediction.DistData());
-    El::Copy(get_prev_error_signals(), *m_workspace);
+    El::Copy(this->get_prev_error_signals(), *m_workspace);

     // Compute local gradients
-    local_bp_compute(get_local_prev_activations(0),
-                     get_local_prev_activations(1),
-                     m_workspace->LockedMatrix(),
-                     get_local_error_signals(0),
-                     get_local_error_signals(1));
+    local_bp_compute();

   }

 private:

   /** Compute local contributions to cross entropy loss. */
-  static void local_fp_compute(const AbsMat& local_prediction,
-                               const AbsMat& local_ground_truth,
-                               AbsMat& local_contribution);
+  void local_fp_compute();
   /** Compute local gradients. */
-  static void local_bp_compute(const AbsMat& local_prediction,
-                               const AbsMat& local_ground_truth,
-                               const AbsMat& local_gradient_wrt_output,
-                               AbsMat& local_gradient_wrt_prediction,
-                               AbsMat& local_gradient_wrt_ground_truth);
+  void local_bp_compute();

   /** Workspace matrix. */
-  std::unique_ptr<AbsDistMat> m_workspace;
+  std::unique_ptr<AbsDistMatrixType> m_workspace;

 };

 #ifndef LBANN_CROSS_ENTROPY_LAYER_INSTANTIATE
 extern template class cross_entropy_layer<
-  data_layout::DATA_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::CPU>;
 extern template class cross_entropy_layer<
-  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>;
 #ifdef LBANN_HAS_GPU
 extern template class cross_entropy_layer<
-  data_layout::DATA_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::GPU>;
 extern template class cross_entropy_layer<
-  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>;
 #endif // LBANN_HAS_GPU
 #endif // LBANN_CROSS_ENTROPY_LAYER_INSTANTIATE

diff --git a/include/lbann/layers/loss/entrywise.hpp b/include/lbann/layers/loss/entrywise.hpp
index 33cb7c9262d..9a81f20e18c 100644
--- a/include/lbann/layers/loss/entrywise.hpp
+++ b/include/lbann/layers/loss/entrywise.hpp
@@ -33,8 +33,8 @@ namespace lbann {

 #ifndef LBANN_ENTRYWISE_LAYER_INSTANTIATE
 #define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE)                   \
-  extern template class LAYER_NAME<data_layout::DATA_PARALLEL, DEVICE>; \
-  extern template class LAYER_NAME<data_layout::MODEL_PARALLEL, DEVICE>
+  extern template class LAYER_NAME<DataType, data_layout::DATA_PARALLEL, DEVICE>; \
+  extern template class LAYER_NAME<DataType, data_layout::MODEL_PARALLEL, DEVICE>
 #else
 #define BINARY_ETI_DECL_MACRO_DEV(...)
 #endif // LBANN_BINARY_LAYER_INSTANTIATE
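A pattern repeated across the loss layers in this patch: the local_fp_compute/local_bp_compute helpers stop being static functions handed every matrix explicitly and become members that fetch their own typed tensors from the layer. A minimal before/after analogue (illustrative only, not LBANN code):

    #include <vector>

    // Before: a static helper receives every operand as an argument.
    struct loss_before {
      static double local_fp_compute(const std::vector<double>& prediction,
                                     const std::vector<double>& ground_truth) {
        double sum = 0;
        for (size_t i = 0; i < prediction.size(); ++i) {
          double d = prediction[i] - ground_truth[i];
          sum += d * d;
        }
        return sum;
      }
    };

    // After: the helper is a member of a type-aware class, so it can pull
    // its own inputs and every call site shrinks to local_fp_compute().
    template <typename TensorDataType>
    class loss_after {
      std::vector<TensorDataType> m_prediction, m_ground_truth;
    public:
      loss_after(std::vector<TensorDataType> p, std::vector<TensorDataType> g)
        : m_prediction(std::move(p)), m_ground_truth(std::move(g)) {}
      TensorDataType local_fp_compute() const {
        TensorDataType sum = 0;
        for (size_t i = 0; i < m_prediction.size(); ++i) {
          TensorDataType d = m_prediction[i] - m_ground_truth[i];
          sum += d * d;
        }
        return sum;
      }
    };

    int main() {
      loss_after<float> l({1.f, 2.f}, {0.f, 2.f});
      return l.local_fp_compute() == 1.f ? 0 : 1;
    }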
diff --git a/include/lbann/layers/loss/l1_norm.hpp b/include/lbann/layers/loss/l1_norm.hpp
index 687a8bcb3a1..9727d6a2cd1 100644
--- a/include/lbann/layers/loss/l1_norm.hpp
+++ b/include/lbann/layers/loss/l1_norm.hpp
@@ -27,7 +27,7 @@
 #ifndef LBANN_LAYERS_LOSS_L1_NORM_HPP_INCLUDED
 #define LBANN_LAYERS_LOSS_L1_NORM_HPP_INCLUDED

-#include "lbann/layers/layer.hpp"
+#include "lbann/layers/data_type_layer.hpp"

 namespace lbann {

 /** @brief L1 vector norm.
  *
  *  @f[ \lVert x\rVert_1 = \sum\limits_{i} | x_i | @f]
  */
-template <data_layout T_layout, El::Device Dev>
-class l1_norm_layer : public Layer {
+template <typename TensorDataType, data_layout T_layout, El::Device Dev>
+class l1_norm_layer : public data_type_layer<TensorDataType> {
 public:
+  /** @name Public Types */
+  ///@{

-  l1_norm_layer(lbann_comm *comm) : Layer(comm) {}
+  /** @brief The tensor type expected in this object. */
+  using AbsDistMatrixType = El::AbstractDistMatrix<TensorDataType>;
+
+  ///@}
+
+public:
+
+  l1_norm_layer(lbann_comm *comm) : data_type_layer<TensorDataType>(comm) {}

   l1_norm_layer(const l1_norm_layer& other)
-    : Layer(other),
+    : data_type_layer<TensorDataType>(other),
       m_workspace(other.m_workspace ?
                   other.m_workspace->Copy() : nullptr) {}

   l1_norm_layer& operator=(const l1_norm_layer& other) {
-    Layer::operator=(other);
+    data_type_layer<TensorDataType>::operator=(other);
     m_workspace.reset(other.m_workspace ?
                       other.m_workspace->Copy() : nullptr);
     return *this;
@@ -58,17 +67,17 @@ class l1_norm_layer : public Layer {
   El::Device get_device_allocation() const override { return Dev; }

   void setup_dims() override {
-    Layer::setup_dims();
-    set_output_dims({1});
+    data_type_layer<TensorDataType>::setup_dims();
+    this->set_output_dims({1});
   }

   void setup_data() override {
-    Layer::setup_data();
+    data_type_layer<TensorDataType>::setup_data();

     // Initialize workspace
-    auto dist = get_prev_activations().DistData();
+    auto dist = this->get_prev_activations().DistData();
     dist.colDist = El::STAR;
-    m_workspace.reset(AbsDistMat::Instantiate(dist));
+    m_workspace.reset(AbsDistMatrixType::Instantiate(dist));
 #ifdef HYDROGEN_HAVE_CUB
     if (m_workspace->GetLocalDevice() == El::Device::GPU) {
       m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool
@@ -81,15 +90,14 @@ class l1_norm_layer : public Layer {

     // Initialize workspace
     m_workspace->Empty();
-    m_workspace->AlignWith(get_prev_activations());
-    m_workspace->Resize(1, get_prev_activations().Width());
+    m_workspace->AlignWith(this->get_prev_activations());
+    m_workspace->Resize(1, this->get_prev_activations().Width());

     // Compute local contributions and accumulate
     /// @todo Consider reduce rather than allreduce
-    local_fp_compute(get_local_prev_activations(),
-                     m_workspace->Matrix());
-    m_comm->allreduce(*m_workspace, m_workspace->RedundantComm());
-    El::Copy(*m_workspace, get_activations());
+    local_fp_compute();
+    this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm());
+    El::Copy(*m_workspace, this->get_activations());

     // Clean up
     m_workspace->Empty();
@@ -100,13 +108,11 @@ class l1_norm_layer : public Layer {

     // Initialize workspace
     m_workspace->Empty();
-    m_workspace->AlignWith(get_prev_activations());
-    El::Copy(get_prev_error_signals(), *m_workspace);
+    m_workspace->AlignWith(this->get_prev_activations());
+    El::Copy(this->get_prev_error_signals(), *m_workspace);

     // Compute local gradients
-    local_bp_compute(get_local_prev_activations(),
-                     m_workspace->LockedMatrix(),
-                     get_local_error_signals());
+    local_bp_compute();

     // Clean up
     m_workspace->Empty();
@@ -116,28 +122,25 @@ class l1_norm_layer : public Layer {

 private:

   /** Compute local contributions to L2 norm. */
-  static void local_fp_compute(const AbsMat& local_input,
-                               AbsMat& local_contribution);
+  void local_fp_compute();
   /** Compute local gradients. */
-  static void local_bp_compute(const AbsMat& local_input,
-                               const AbsMat& local_gradient_wrt_output,
-                               AbsMat& local_gradient_wrt_input);
+  void local_bp_compute();

   /** Workspace matrix. */
-  std::unique_ptr<AbsDistMat> m_workspace;
+  std::unique_ptr<AbsDistMatrixType> m_workspace;

 };

 #ifndef LBANN_L1_NORM_LAYER_INSTANTIATE
 extern template class l1_norm_layer<
-  data_layout::DATA_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::CPU>;
 extern template class l1_norm_layer<
-  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>;
 #ifdef LBANN_HAS_GPU
 extern template class l1_norm_layer<
-  data_layout::DATA_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::GPU>;
 extern template class l1_norm_layer<
-  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>;
 #endif // LBANN_HAS_GPU
 #endif // LBANN_L1_NORM_LAYER_INSTANTIATE

diff --git a/include/lbann/layers/loss/l2_norm2.hpp b/include/lbann/layers/loss/l2_norm2.hpp
index c49822f3711..399adfea95d 100644
--- a/include/lbann/layers/loss/l2_norm2.hpp
+++ b/include/lbann/layers/loss/l2_norm2.hpp
@@ -27,7 +27,7 @@
 #ifndef LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED
 #define LBANN_LAYERS_LOSS_L2_NORM2_HPP_INCLUDED

-#include "lbann/layers/layer.hpp"
+#include "lbann/layers/data_type_layer.hpp"

 namespace lbann {

 /** @brief Square of L2 vector norm.
  *
  *  @f[ \lVert x\rVert_2^2 = \sum\limits_{i} x_i^2 @f]
  */
-template <data_layout T_layout, El::Device Dev>
-class l2_norm2_layer : public Layer {
+template <typename TensorDataType, data_layout T_layout, El::Device Dev>
+class l2_norm2_layer : public data_type_layer<TensorDataType> {
 public:
+  /** @name Public Types */
+  ///@{

-  l2_norm2_layer(lbann_comm *comm) : Layer(comm) {}
+  /** @brief The tensor type expected in this object. */
+  using AbsDistMatrixType = El::AbstractDistMatrix<TensorDataType>;
+
+  ///@}
+
+public:
+
+  l2_norm2_layer(lbann_comm *comm) : data_type_layer<TensorDataType>(comm) {}

   l2_norm2_layer(const l2_norm2_layer& other)
-    : Layer(other),
+    : data_type_layer<TensorDataType>(other),
       m_workspace(other.m_workspace ?
                   other.m_workspace->Copy() : nullptr) {}

   l2_norm2_layer& operator=(const l2_norm2_layer& other) {
-    Layer::operator=(other);
+    data_type_layer<TensorDataType>::operator=(other);
     m_workspace.reset(other.m_workspace ?
                       other.m_workspace->Copy() : nullptr);
     return *this;
@@ -58,17 +67,17 @@ class l2_norm2_layer : public Layer {
   El::Device get_device_allocation() const override { return Dev; }

   void setup_dims() override {
-    Layer::setup_dims();
-    set_output_dims({1});
+    data_type_layer<TensorDataType>::setup_dims();
+    this->set_output_dims({1});
   }

   void setup_data() override {
-    Layer::setup_data();
+    data_type_layer<TensorDataType>::setup_data();

     // Initialize workspace
-    auto dist = get_prev_activations().DistData();
+    auto dist = this->get_prev_activations().DistData();
     dist.colDist = El::STAR;
-    m_workspace.reset(AbsDistMat::Instantiate(dist));
+    m_workspace.reset(AbsDistMatrixType::Instantiate(dist));
 #ifdef HYDROGEN_HAVE_CUB
     if (m_workspace->GetLocalDevice() == El::Device::GPU) {
       m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool
@@ -81,15 +90,14 @@ class l2_norm2_layer : public Layer {

     // Initialize workspace
     m_workspace->Empty();
-    m_workspace->AlignWith(get_prev_activations());
-    m_workspace->Resize(1, get_prev_activations().Width());
+    m_workspace->AlignWith(this->get_prev_activations());
+    m_workspace->Resize(1, this->get_prev_activations().Width());

     // Compute local contributions and accumulate
     /// @todo Consider reduce rather than allreduce
-    local_fp_compute(get_local_prev_activations(),
-                     m_workspace->Matrix());
-    m_comm->allreduce(*m_workspace, m_workspace->RedundantComm());
-    El::Copy(*m_workspace, get_activations());
+    local_fp_compute();
+    this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm());
+    El::Copy(*m_workspace, this->get_activations());

     // Clean up
     m_workspace->Empty();
@@ -100,13 +108,11 @@ class l2_norm2_layer : public Layer {

     // Initialize workspace
     m_workspace->Empty();
-    m_workspace->AlignWith(get_prev_activations());
-    El::Copy(get_prev_error_signals(), *m_workspace);
+    m_workspace->AlignWith(this->get_prev_activations());
+    El::Copy(this->get_prev_error_signals(), *m_workspace);

     // Compute local gradients
-    local_bp_compute(get_local_prev_activations(),
-                     m_workspace->LockedMatrix(),
-                     get_local_error_signals());
+    local_bp_compute();

     // Clean up
     m_workspace->Empty();
@@ -116,28 +122,25 @@ class l2_norm2_layer : public Layer {

 private:

   /** Compute local contributions to L2 norm. */
-  static void local_fp_compute(const AbsMat& local_input,
-                               AbsMat& local_contribution);
+  void local_fp_compute();
   /** Compute local gradients. */
-  static void local_bp_compute(const AbsMat& local_input,
-                               const AbsMat& local_gradient_wrt_output,
-                               AbsMat& local_gradient_wrt_input);
+  void local_bp_compute();

   /** Workspace matrix. */
-  std::unique_ptr<AbsDistMat> m_workspace;
+  std::unique_ptr<AbsDistMatrixType> m_workspace;

 };

 #ifndef LBANN_L2_NORM2_LAYER_INSTANTIATE
 extern template class l2_norm2_layer<
-  data_layout::DATA_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::CPU>;
 extern template class l2_norm2_layer<
-  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>;
 #ifdef LBANN_HAS_GPU
 extern template class l2_norm2_layer<
-  data_layout::DATA_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::GPU>;
 extern template class l2_norm2_layer<
-  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>;
 #endif // LBANN_HAS_GPU
 #endif // LBANN_L2_NORM2_LAYER_INSTANTIATE
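Both norm layers follow the same distributed reduction recipe: each rank fills a 1 x width workspace with its local per-column contributions, an allreduce over the redundant communicator completes the sums, and the result is copied into the activations. Stripped of Elemental and MPI, the per-column computation looks like this (illustrative only):

    #include <cstdio>
    #include <vector>

    // Column-major local matrix with `width` mini-batch samples:
    // workspace[col] accumulates this rank's share of ||x||_2^2.
    void local_fp_compute_l2(const std::vector<double>& local_mat,
                             int local_height, int width,
                             std::vector<double>& workspace) {
      workspace.assign(width, 0.0);
      for (int col = 0; col < width; ++col)
        for (int row = 0; row < local_height; ++row) {
          double x = local_mat[col * local_height + row];
          workspace[col] += x * x;
        }
      // In the layer, the allreduce over m_workspace->RedundantComm()
      // would now sum these partial results across ranks.
    }

    int main() {
      std::vector<double> mat{3, 4, 1, 0};   // two columns: (3,4), (1,0)
      std::vector<double> ws;
      local_fp_compute_l2(mat, 2, 2, ws);
      std::printf("%g %g\n", ws[0], ws[1]);  // 25 1
    }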
", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -92,11 +101,11 @@ class mean_absolute_error_layer : public Layer { } void setup_data() override { - Layer::setup_data(); + data_type_layer::setup_data(); // Initialize workspace - const auto& input_dist = get_prev_activations(0).DistData(); - m_workspace.reset(AbsDistMat::Instantiate(*input_dist.grid, + const auto& input_dist = this->get_prev_activations(0).DistData(); + m_workspace.reset(AbsDistMatrixType::Instantiate(*input_dist.grid, input_dist.root, El::STAR, input_dist.rowDist, @@ -116,17 +125,14 @@ class mean_absolute_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - m_workspace->Resize(1, get_prev_activations().Width()); + m_workspace->AlignWith(this->get_prev_activations()); + m_workspace->Resize(1, this->get_prev_activations().Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); // Clean up m_workspace->Empty(); @@ -137,16 +143,11 @@ class mean_absolute_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - El::Copy(get_prev_error_signals(), *m_workspace); + m_workspace->AlignWith(this->get_prev_activations()); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->LockedMatrix(), - get_local_error_signals(0), - get_local_error_signals(1)); + local_bp_compute(); // Clean up m_workspace->Empty(); @@ -156,33 +157,25 @@ class mean_absolute_error_layer : public Layer { private: /** Compute local contributions to mean absolute error loss. */ - static void local_fp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_prediction, - AbsMat& local_gradient_wrt_ground_truth); + void local_bp_compute(); /** Workspace matrix. 
*/ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; #ifndef LBANN_MEAN_ABSOLUTE_ERROR_LAYER_INSTANTIATE extern template class mean_absolute_error_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class mean_absolute_error_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class mean_absolute_error_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class mean_absolute_error_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_MEAN_ABSOLUTE_ERROR_LAYER_INSTANTIATE diff --git a/include/lbann/layers/loss/mean_squared_error.hpp b/include/lbann/layers/loss/mean_squared_error.hpp index 0ee52441d83..f40eb4a58f7 100644 --- a/include/lbann/layers/loss/mean_squared_error.hpp +++ b/include/lbann/layers/loss/mean_squared_error.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_MEAN_SQUARED_ERROR_HPP_INCLUDED #define LBANN_LAYERS_LOSS_MEAN_SQUARED_ERROR_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -39,23 +39,32 @@ namespace lbann { * = \frac{1}{n} \sum\limits_{i=1}^{n} (y_i - \hat{y}_i)^2 * @f] */ -template -class mean_squared_error_layer : public Layer { +template +class mean_squared_error_layer : public data_type_layer { public: + /** @name Public Types */ + ///@{ - mean_squared_error_layer(lbann_comm *comm) : Layer(comm) { + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + +public: + + mean_squared_error_layer(lbann_comm *comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 2; } mean_squared_error_layer(const mean_squared_error_layer& other) - : Layer(other) { + : data_type_layer(other) { m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); } mean_squared_error_layer& operator=(const mean_squared_error_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_workspace.reset(other.m_workspace ? other.m_workspace->Copy() : nullptr); @@ -68,17 +77,17 @@ class mean_squared_error_layer : public Layer { El::Device get_device_allocation() const override { return Dev; } void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + data_type_layer::setup_dims(); + this->set_output_dims({1}); // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? 
", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -92,11 +101,11 @@ class mean_squared_error_layer : public Layer { } void setup_data() override { - Layer::setup_data(); + data_type_layer::setup_data(); // Initialize workspace - const auto& input_dist = get_prev_activations(0).DistData(); - m_workspace.reset(AbsDistMat::Instantiate(*input_dist.grid, + const auto& input_dist = this->get_prev_activations(0).DistData(); + m_workspace.reset(AbsDistMatrixType::Instantiate(*input_dist.grid, input_dist.root, El::STAR, input_dist.rowDist, @@ -116,17 +125,14 @@ class mean_squared_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - m_workspace->Resize(1, get_prev_activations().Width()); + m_workspace->AlignWith(this->get_prev_activations()); + m_workspace->Resize(1, this->get_prev_activations().Width()); // Compute local contributions and accumulate /// @todo Consider reduce rather than allreduce - local_fp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->Matrix()); - m_comm->allreduce(*m_workspace, m_workspace->RedundantComm()); - El::Copy(*m_workspace, get_activations()); + local_fp_compute(); + this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm()); + El::Copy(*m_workspace, this->get_activations()); // Clean up m_workspace->Empty(); @@ -137,16 +143,11 @@ class mean_squared_error_layer : public Layer { // Initialize workspace m_workspace->Empty(); - m_workspace->AlignWith(get_prev_activations()); - El::Copy(get_prev_error_signals(), *m_workspace); + m_workspace->AlignWith(this->get_prev_activations()); + El::Copy(this->get_prev_error_signals(), *m_workspace); // Compute local gradients - local_bp_compute(get_input_size(), - get_local_prev_activations(0), - get_local_prev_activations(1), - m_workspace->LockedMatrix(), - get_local_error_signals(0), - get_local_error_signals(1)); + local_bp_compute(); // Clean up m_workspace->Empty(); @@ -156,33 +157,25 @@ class mean_squared_error_layer : public Layer { private: /** Compute local contributions to mean squared error loss. */ - static void local_fp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - AbsMat& local_contribution); + void local_fp_compute(); /** Compute local gradients. */ - static void local_bp_compute(El::Int height, - const AbsMat& local_prediction, - const AbsMat& local_ground_truth, - const AbsMat& local_gradient_wrt_output, - AbsMat& local_gradient_wrt_prediction, - AbsMat& local_gradient_wrt_ground_truth); + void local_bp_compute(); /** Workspace matrix. 
*/ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; #ifndef LBANN_MEAN_SQUARED_ERROR_LAYER_INSTANTIATE extern template class mean_squared_error_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class mean_squared_error_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class mean_squared_error_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class mean_squared_error_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_MEAN_SQUARED_ERROR_LAYER_INSTANTIATE diff --git a/include/lbann/layers/loss/top_k_categorical_accuracy.hpp b/include/lbann/layers/loss/top_k_categorical_accuracy.hpp index 855ab34607c..10f285c757d 100644 --- a/include/lbann/layers/loss/top_k_categorical_accuracy.hpp +++ b/include/lbann/layers/loss/top_k_categorical_accuracy.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_LOSS_TOP_K_CATEGORICAL_ACCURACY_HPP_INCLUDED #define LBANN_LAYERS_LOSS_TOP_K_CATEGORICAL_ACCURACY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -42,12 +42,12 @@ namespace lbann { * * @todo Gracefully handle case where label is not a one-hot vector. */ -template -class top_k_categorical_accuracy_layer : public Layer { +template +class top_k_categorical_accuracy_layer : public data_type_layer { public: top_k_categorical_accuracy_layer(lbann_comm *comm, El::Int k) - : Layer(comm), m_k(k) { + : data_type_layer(comm), m_k(k) { this->m_expected_num_parent_layers = 2; } @@ -59,7 +59,7 @@ class top_k_categorical_accuracy_layer : public Layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("k", m_k); return desc; } @@ -67,17 +67,17 @@ class top_k_categorical_accuracy_layer : public Layer { protected: void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + data_type_layer::setup_dims(); + this->set_output_dims({1}); // Check that input dimensions match - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? 
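The MSE doc comment above gives MSE(y, yhat) = (1/n) * sum_i (y_i - yhat_i)^2, so the gradients the bp helper must produce are dMSE/dy_i = 2(y_i - yhat_i)/n, with the negation for the ground truth. A small numeric sanity check of that derivative (illustrative only):

    #include <cstdio>
    #include <vector>

    double mse(const std::vector<double>& y, const std::vector<double>& t) {
      double s = 0;
      for (size_t i = 0; i < y.size(); ++i) s += (y[i] - t[i]) * (y[i] - t[i]);
      return s / y.size();
    }

    int main() {
      std::vector<double> y{1.0, 2.0, 3.0}, t{0.5, 2.5, 3.0};
      const size_t i = 0;
      const double n = y.size();

      double analytic = 2.0 * (y[i] - t[i]) / n;   // 2(y_i - t_i)/n = 1/3

      double eps = 1e-6;                           // central finite difference
      std::vector<double> yp = y, ym = y;
      yp[i] += eps; ym[i] -= eps;
      double numeric = (mse(yp, t) - mse(ym, t)) / (2 * eps);

      std::printf("analytic %.6f vs numeric %.6f\n", analytic, numeric);
    }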
", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -101,14 +101,14 @@ class top_k_categorical_accuracy_layer : public Layer { #ifndef LBANN_TOP_K_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE extern template class top_k_categorical_accuracy_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class top_k_categorical_accuracy_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class top_k_categorical_accuracy_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class top_k_categorical_accuracy_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_TOP_K_CATEGORICAL_ACCURACY_LAYER_INSTANTIATE diff --git a/include/lbann/layers/math/binary.hpp b/include/lbann/layers/math/binary.hpp index e95df55fd97..aad7b8e9944 100644 --- a/include/lbann/layers/math/binary.hpp +++ b/include/lbann/layers/math/binary.hpp @@ -27,62 +27,62 @@ #ifndef LBANN_LAYERS_MATH_BINARY_HPP_INCLUDED #define LBANN_LAYERS_MATH_BINARY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { -#define LBANN_DECLARE_ENTRYWISE_BINARY_LAYER(LAYER_NAME, LAYER_STRING) \ - template \ - class LAYER_NAME : public Layer { \ - public: \ - LAYER_NAME(lbann_comm *comm) : Layer(comm) { \ - this->m_expected_num_parent_layers = 2; \ - } \ - LAYER_NAME* copy() const override { \ - return new LAYER_NAME(*this); \ - } \ - std::string get_type() const override { return LAYER_STRING; } \ - data_layout get_data_layout() const override { return Layout; } \ - El::Device get_device_allocation() const override { return Device; } \ - protected: \ - void setup_dims() override { \ - Layer::setup_dims(); \ - set_output_dims(get_input_dims()); \ - /* Check that input dimensions match */ \ - if (get_input_dims(0) != get_input_dims(1)) { \ - const auto& parents = get_parent_layers(); \ - std::stringstream err; \ - err << get_type() << " layer \"" << get_name() << "\" " \ - << "has input tensors with different dimensions ("; \ - for (int i = 0; i < get_num_parents(); ++i) { \ - const auto& dims = get_input_dims(i); \ - err << (i > 0 ? ", " : "") \ - << "layer \"" << parents[i]->get_name() << "\" outputs "; \ - for (size_t j = 0; j < dims.size(); ++j) { \ - err << (j > 0 ? 
" x " : "") << dims[j]; \ - } \ - } \ - err << ")"; \ - LBANN_ERROR(err.str()); \ - } \ - } \ - void fp_compute() override; \ - void bp_compute() override; \ +#define LBANN_DECLARE_ENTRYWISE_BINARY_LAYER(LAYER_NAME, LAYER_STRING) \ + template \ + class LAYER_NAME : public data_type_layer { \ + public: \ + LAYER_NAME(lbann_comm *comm) : data_type_layer(comm) { \ + this->m_expected_num_parent_layers = 2; \ + } \ + LAYER_NAME* copy() const override { \ + return new LAYER_NAME(*this); \ + } \ + std::string get_type() const override { return LAYER_STRING; } \ + data_layout get_data_layout() const override { return Layout; } \ + El::Device get_device_allocation() const override { return Device; } \ + protected: \ + void setup_dims() override { \ + data_type_layer::setup_dims(); \ + this->set_output_dims(this->get_input_dims()); \ + /* Check that input dimensions match */ \ + if (this->get_input_dims(0) != this->get_input_dims(1)) { \ + const auto& parents = this->get_parent_layers(); \ + std::stringstream err; \ + err << this->get_type() << " layer \"" << this->get_name() << "\" " \ + << "has input tensors with different dimensions ("; \ + for (int i = 0; i < this->get_num_parents(); ++i) { \ + const auto& dims = this->get_input_dims(i); \ + err << (i > 0 ? ", " : "") \ + << "layer \"" << parents[i]->get_name() << "\" outputs "; \ + for (size_t j = 0; j < dims.size(); ++j) { \ + err << (j > 0 ? " x " : "") << dims[j]; \ + } \ + } \ + err << ")"; \ + LBANN_ERROR(err.str()); \ + } \ + } \ + void fp_compute() override; \ + void bp_compute() override; \ } // Convenience macros for ETI decls for binary layers #ifndef LBANN_BINARY_LAYER_INSTANTIATE -#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ - extern template class LAYER_NAME; \ - extern template class LAYER_NAME +#define BINARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME #else #define BINARY_ETI_DECL_MACRO_DEV(...) 
#endif // LBANN_BINARY_LAYER_INSTANTIATE -#define BINARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ - template class LAYER_NAME; \ - template class LAYER_NAME +#define BINARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ + template class LAYER_NAME; \ + template class LAYER_NAME #ifdef LBANN_HAS_GPU #define BINARY_ETI_DECL_MACRO(LAYER_NAME) \ diff --git a/include/lbann/layers/math/clamp.hpp b/include/lbann/layers/math/clamp.hpp index 6a36143d73f..09ddf798d54 100644 --- a/include/lbann/layers/math/clamp.hpp +++ b/include/lbann/layers/math/clamp.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MATH_CLAMP_HPP_INCLUDED #define LBANN_LAYERS_MATH_CLAMP_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -42,11 +42,11 @@ namespace lbann { * \end{cases} * @f] */ -template -class clamp_layer : public Layer { +template +class clamp_layer : public data_type_layer { public: - clamp_layer(lbann_comm *comm, DataType min, DataType max) - : Layer(comm), m_min(min), m_max(max) { + clamp_layer(lbann_comm *comm, TensorDataType min, TensorDataType max) + : data_type_layer(comm), m_min(min), m_max(max) { if (m_min > m_max) { std::stringstream err; err << "[" << m_min << "," << m_max << "] is an invalid range"; @@ -59,7 +59,7 @@ class clamp_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); std::stringstream ss; ss << "[" << m_min << "," << m_max << "]"; desc.add("Range", ss.str()); @@ -68,30 +68,30 @@ class clamp_layer : public Layer { protected: void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + data_type_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; void bp_compute() override; private: /** Minimum output. */ - DataType m_min; + TensorDataType m_min; /** Maximum output. 
*/ - DataType m_max; + TensorDataType m_max; }; #ifndef LBANN_CLAMP_LAYER_INSTANTIATE extern template class clamp_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class clamp_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class clamp_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class clamp_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_CLAMP_LAYER_INSTANTIATE diff --git a/include/lbann/layers/math/matmul.hpp b/include/lbann/layers/math/matmul.hpp index 77b3e711c1b..81c27d3a04d 100644 --- a/include/lbann/layers/math/matmul.hpp +++ b/include/lbann/layers/math/matmul.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYER_MATH_MATMUL_HPP_INCLUDED #define LBANN_LAYER_MATH_MATMUL_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -40,9 +40,10 @@ namespace lbann { * @todo Support >2 dimensions, transposes, matvecs, and dot products * */ -template -class matmul_layer : public Layer { +class matmul_layer : public data_type_layer { static_assert(Layout == data_layout::DATA_PARALLEL, "matmul_layer only supports " "data-parallel data layout"); @@ -69,35 +70,35 @@ class matmul_layer : public Layer { // Implementation // ========================================================= -template -matmul_layer::matmul_layer(lbann_comm *comm) - : Layer(comm) { +template +matmul_layer::matmul_layer(lbann_comm *comm) + : data_type_layer(comm) { this->m_expected_num_parent_layers = 2; } -template -matmul_layer* matmul_layer::copy() const { +template +matmul_layer* matmul_layer::copy() const { return new matmul_layer(*this); } -template -std::string matmul_layer::get_type() const { +template +std::string matmul_layer::get_type() const { return "matrix multiply"; } -template -data_layout matmul_layer::get_data_layout() const { +template +data_layout matmul_layer::get_data_layout() const { return Layout; } -template -El::Device matmul_layer::get_device_allocation() const { +template +El::Device matmul_layer::get_device_allocation() const { return Device; } -template -void matmul_layer::setup_dims() { - Layer::setup_dims(); +template +void matmul_layer::setup_dims() { + data_type_layer::setup_dims(); // Input dimensions const auto& input0_dims = this->get_input_dims(0); @@ -158,10 +159,10 @@ void matmul_layer::setup_dims() { #ifndef LBANN_MATMUL_LAYER_INSTANTIATE extern template class matmul_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class matmul_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_MATMUL_LAYER_INSTANTIATE diff --git a/include/lbann/layers/math/unary.hpp b/include/lbann/layers/math/unary.hpp index b4fd0c82d4c..581a64bbb03 100644 --- a/include/lbann/layers/math/unary.hpp +++ b/include/lbann/layers/math/unary.hpp @@ -27,44 +27,43 @@ #ifndef LBANN_LAYERS_MATH_UNARY_HPP_INCLUDED #define LBANN_LAYERS_MATH_UNARY_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { - -#define LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(LAYER_NAME, 
LAYER_STRING) \ - template \ - class LAYER_NAME : public Layer { \ - public: \ - LAYER_NAME(lbann_comm *comm) : Layer(comm) {} \ - LAYER_NAME* copy() const override { \ - return new LAYER_NAME(*this); \ - } \ - std::string get_type() const override { return LAYER_STRING; } \ - data_layout get_data_layout() const override { return Layout; } \ - El::Device get_device_allocation() const override { return Device; } \ - protected: \ - void setup_dims() override { \ - Layer::setup_dims(); \ - set_output_dims(get_input_dims()); \ - } \ - void fp_compute() override; \ - void bp_compute() override; \ +#define LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(LAYER_NAME, LAYER_STRING) \ + template \ + class LAYER_NAME : public data_type_layer { \ + public: \ + LAYER_NAME(lbann_comm *comm) : data_type_layer(comm) {} \ + LAYER_NAME* copy() const override { \ + return new LAYER_NAME(*this); \ + } \ + std::string get_type() const override { return LAYER_STRING; } \ + data_layout get_data_layout() const override { return Layout; } \ + El::Device get_device_allocation() const override { return Device; } \ + protected: \ + void setup_dims() override { \ + data_type_layer::setup_dims(); \ + this->set_output_dims(this->get_input_dims()); \ + } \ + void fp_compute() override; \ + void bp_compute() override; \ } // Convenience macros for ETI decls for unary layers #ifndef LBANN_UNARY_LAYER_INSTANTIATE -#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ - extern template class LAYER_NAME; \ - extern template class LAYER_NAME +#define UNARY_ETI_DECL_MACRO_DEV(LAYER_NAME, DEVICE) \ + extern template class LAYER_NAME; \ + extern template class LAYER_NAME #else #define UNARY_ETI_DECL_MACRO_DEV(...) #endif // LBANN_UNARY_LAYER_INSTANTIATE -#define UNARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ - template class LAYER_NAME; \ - template class LAYER_NAME +#define UNARY_ETI_INST_MACRO_DEV(LAYER_NAME, DEVICE) \ + template class LAYER_NAME; \ + template class LAYER_NAME #ifdef LBANN_HAS_GPU #define UNARY_ETI_DECL_MACRO(LAYER_NAME) \ @@ -77,7 +76,7 @@ namespace lbann { // Convenience macro to define an entry-wise unary layer class #define DEFINE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string) \ - LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string); \ + LBANN_DECLARE_ENTRYWISE_UNARY_LAYER(layer_name, layer_string); \ UNARY_ETI_DECL_MACRO(layer_name) // Logical operations diff --git a/include/lbann/layers/misc/argmax.hpp b/include/lbann/layers/misc/argmax.hpp index 524e9feeae8..06166fc67de 100644 --- a/include/lbann/layers/misc/argmax.hpp +++ b/include/lbann/layers/misc/argmax.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED #define LBANN_LAYERS_MISC_ARGMAX_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -36,15 +36,15 @@ namespace lbann { * Expects a 1-D input tensor. If multiple entries have the same * maximum value, outputs the index of the first one. 
*/ -template -class argmax_layer : public Layer { +template +class argmax_layer : public data_type_layer { static_assert(Layout == data_layout::DATA_PARALLEL, "argmax layer only supports data parallel layout"); static_assert(Device == El::Device::CPU, "argmax layer only supports CPU"); public: - argmax_layer(lbann_comm* comm) : Layer(comm) { } + argmax_layer(lbann_comm* comm) : data_type_layer(comm) { } argmax_layer* copy() const override { return new argmax_layer(*this); } std::string get_type() const override { return "argmax"; } data_layout get_data_layout() const override { return Layout; } @@ -53,15 +53,15 @@ class argmax_layer : public Layer { protected: void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + data_type_layer::setup_dims(); + this->set_output_dims({1}); // Make sure input tensor is 1-D - const auto input_dims = get_input_dims(); + const auto input_dims = this->get_input_dims(); if (input_dims.size() != 1) { - LBANN_ERROR(get_type()," layer \"",get_name(),"\" ", + LBANN_ERROR(get_type()," layer \"",this->get_name(),"\" ", "expects a 1-D input tensor, ", - "but parent layer \"",m_parent_layers[0]->get_name(),"\" ", + "but parent layer \"",this->m_parent_layers[0]->get_name(),"\" ", "outputs a ",input_dims.size(),"-D tensor"); } @@ -73,7 +73,7 @@ class argmax_layer : public Layer { #ifndef LBANN_ARGMAX_LAYER_INSTANTIATE extern template class argmax_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #endif // LBANN_ARGMAX_LAYER_INSTANTIATE } // namespace lbann diff --git a/include/lbann/layers/misc/argmin.hpp b/include/lbann/layers/misc/argmin.hpp index c05ddecd08b..2409af36fc1 100644 --- a/include/lbann/layers/misc/argmin.hpp +++ b/include/lbann/layers/misc/argmin.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED #define LBANN_LAYERS_MISC_ARGMIN_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -36,15 +36,15 @@ namespace lbann { * Expects a 1-D input tensor. If multiple entries have the same * minimum value, outputs the index of the first one. 
*/ -template -class argmin_layer : public Layer { +template +class argmin_layer : public data_type_layer { static_assert(Layout == data_layout::DATA_PARALLEL, "argmin layer only supports data parallel layout"); static_assert(Device == El::Device::CPU, "argmin layer only supports CPU"); public: - argmin_layer(lbann_comm* comm) : Layer(comm) { } + argmin_layer(lbann_comm* comm) : data_type_layer(comm) { } argmin_layer* copy() const override { return new argmin_layer(*this); } std::string get_type() const override { return "argmin"; } data_layout get_data_layout() const override { return Layout; } @@ -53,15 +53,15 @@ class argmin_layer : public Layer { protected: void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + data_type_layer::setup_dims(); + this->set_output_dims({1}); // Make sure input tensor is 1-D - const auto input_dims = get_input_dims(); + const auto input_dims = this->get_input_dims(); if (input_dims.size() != 1) { - LBANN_ERROR(get_type()," layer \"",get_name(),"\" ", + LBANN_ERROR(get_type()," layer \"",this->get_name(),"\" ", "expects a 1-D input tensor, ", - "but parent layer \"",m_parent_layers[0]->get_name(),"\" ", + "but parent layer \"",this->m_parent_layers[0]->get_name(),"\" ", "outputs a ",input_dims.size(),"-D tensor"); } @@ -73,7 +73,7 @@ class argmin_layer : public Layer { #ifndef LBANN_ARGMIN_LAYER_INSTANTIATE extern template class argmin_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #endif // LBANN_ARGMIN_LAYER_INSTANTIATE } // namespace lbann diff --git a/include/lbann/layers/misc/channelwise_mean.hpp b/include/lbann/layers/misc/channelwise_mean.hpp index 8aea533dfc1..a977390af35 100644 --- a/include/lbann/layers/misc/channelwise_mean.hpp +++ b/include/lbann/layers/misc/channelwise_mean.hpp @@ -27,21 +27,22 @@ #ifndef LBANN_LAYERS_MISC_CHANNELWISE_MEAN_HPP_INCLUDED #define LBANN_LAYERS_MISC_CHANNELWISE_MEAN_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { /** @todo Replace with more general reduction layer. 
*/ -template -class channelwise_mean_layer : public Layer { +class channelwise_mean_layer : public data_type_layer { static_assert(Layout == data_layout::DATA_PARALLEL, "channelwise_mean_layer only supports " "data-parallel data layout"); public: channelwise_mean_layer(lbann_comm *comm) - : Layer(comm) { + : data_type_layer(comm) { if (comm->am_trainer_master()) { LBANN_WARNING("channelwise_mean_layer is experimental " "and may be deprecated at any time"); @@ -56,9 +57,9 @@ class channelwise_mean_layer : public Layer { protected: void setup_dims() override { - Layer::setup_dims(); - const auto& input_dims = get_input_dims(); - set_output_dims({input_dims[0]}); + data_type_layer::setup_dims(); + const auto& input_dims = this->get_input_dims(); + this->set_output_dims({input_dims[0]}); } void fp_compute() override; @@ -68,10 +69,10 @@ class channelwise_mean_layer : public Layer { #ifndef LBANN_CHANNELWISE_MEAN_LAYER_INSTANTIATE extern template class channelwise_mean_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class channelwise_mean_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_CHANNELWISE_MEAN_LAYER_INSTANTIATE diff --git a/include/lbann/layers/misc/covariance.hpp b/include/lbann/layers/misc/covariance.hpp index 57c3d304b38..48d0ec523f4 100644 --- a/include/lbann/layers/misc/covariance.hpp +++ b/include/lbann/layers/misc/covariance.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED #define LBANN_LAYERS_MISC_COVARIANCE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -43,22 +43,31 @@ namespace lbann { * Scaling by @f$ 1/n @f$ instead of @f$ 1/(n-1) @f$ is a biased * estimator. */ -template -class covariance_layer : public Layer { +template +class covariance_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: covariance_layer(lbann_comm *comm, bool biased) - : Layer(comm), m_biased(biased) { + : data_type_layer(comm), m_biased(biased) { this->m_expected_num_parent_layers = 2; } covariance_layer(const covariance_layer& other) - : Layer(other), + : data_type_layer(other), m_biased(other.m_biased), m_means(other.m_means ? other.m_means->Copy() : nullptr), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) {} covariance_layer& operator=(const covariance_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_biased = other.m_biased; m_means.reset(other.m_means ? other.m_means->Copy() : nullptr); m_workspace.reset(other.m_workspace ? 
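
A pattern worth spelling out once, since it accounts for most of the mechanical churn in these hunks: after a layer is re-parented from the non-template `Layer` onto the class template `data_type_layer<TensorDataType>`, unqualified calls to inherited members such as `get_name()` or `set_output_dims(...)` no longer compile, because C++ two-phase lookup does not search a base class that depends on a template parameter. Hence the pervasive `this->` prefixes. A minimal standalone sketch of the rule, with illustrative names that are not LBANN's API:

```cpp
template <typename T>
struct dependent_base {
  void inherited_member() {}
};

template <typename T>
struct derived : dependent_base<T> {
  void f() {
    // inherited_member();                  // error: unqualified lookup does not
    //                                      // search the T-dependent base
    this->inherited_member();               // OK: lookup deferred to instantiation
    dependent_base<T>::inherited_member();  // OK, but bypasses virtual dispatch
  }
};

int main() { derived<float>{}.f(); }
```

The diffs use `this->member()` for ordinary calls and the explicit `data_type_layer<TensorDataType>::setup_dims()` form only when forwarding to the base implementation, which is exactly the distinction the last two lines of `f()` illustrate.
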
@@ -72,7 +81,7 @@ class covariance_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("Biased", m_biased); return desc; } @@ -80,23 +89,23 @@ class covariance_layer : public Layer { protected: void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist_data = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist_data = this->get_prev_activations().DistData(); dist_data.colDist = El::STAR; - m_means.reset(AbsDistMat::Instantiate(dist_data)); - m_workspace.reset(AbsDistMat::Instantiate(dist_data)); + m_means.reset(AbsDistMatrixType::Instantiate(dist_data)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist_data)); } void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); - if (get_input_dims(0) != get_input_dims(1)) { - const auto& parents = get_parent_layers(); + data_type_layer::setup_dims(); + this->set_output_dims({1}); + if (this->get_input_dims(0) != this->get_input_dims(1)) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with different dimensions ("; - for (int i = 0; i < get_num_parents(); ++i) { - const auto& dims = get_input_dims(i); + for (int i = 0; i < this->get_num_parents(); ++i) { + const auto& dims = this->get_input_dims(i); err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name() << "\" outputs "; for (size_t j = 0; j < dims.size(); ++j) { @@ -117,22 +126,22 @@ class covariance_layer : public Layer { bool m_biased; /** Means for each mini-batch sample. */ - std::unique_ptr m_means; + std::unique_ptr m_means; /** Workspace. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; #ifndef LBANN_COVARIANCE_LAYER_INSTANTIATE extern template class covariance_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class covariance_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class covariance_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class covariance_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_COVARIANCE_LAYER_INSTANTIATE diff --git a/include/lbann/layers/misc/mini_batch_index.hpp b/include/lbann/layers/misc/mini_batch_index.hpp index 60176e87851..660722778b7 100644 --- a/include/lbann/layers/misc/mini_batch_index.hpp +++ b/include/lbann/layers/misc/mini_batch_index.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_MINI_BATCH_INDEX_HPP_INCLUDED #define LBANN_LAYERS_MISC_MINI_BATCH_INDEX_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -37,12 +37,13 @@ namespace lbann { * mini-batch sample. Each sample in a model's mini-batch has a * unique index in [0, mini_batch_size). 
*/ -template -class mini_batch_index_layer : public Layer { +class mini_batch_index_layer : public data_type_layer { public: - mini_batch_index_layer(lbann_comm* comm) : Layer(comm) { + mini_batch_index_layer(lbann_comm* comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 0; } @@ -54,19 +55,20 @@ class mini_batch_index_layer : public Layer { protected: void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + data_type_layer::setup_dims(); + this->set_output_dims({1}); } void fp_compute() override { + using CPUMatType = El::Matrix; // Get output matrix - auto& output = get_activations(); + auto& output = this->get_activations(); auto& local_output = output.Matrix(); const auto& local_width = local_output.Width(); // Create temporary matrix if output matrix is not on CPU - CPUMat local_output_v; + CPUMatType local_output_v; if (local_output.GetDevice() == El::Device::CPU) { El::View(local_output_v, local_output); } else { @@ -76,7 +78,7 @@ class mini_batch_index_layer : public Layer { // Populate matrix on CPU LBANN_OMP_PARALLEL_FOR for (El::Int col = 0; col < local_width; ++col) { - local_output_v(0, col) = DataType(output.GlobalCol(col)); + local_output_v(0, col) = TensorDataType(output.GlobalCol(col)); } // Copy result from CPU if needed @@ -90,14 +92,14 @@ class mini_batch_index_layer : public Layer { #ifndef LBANN_MINI_BATCH_INDEX_LAYER_INSTANTIATE extern template class mini_batch_index_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class mini_batch_index_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class mini_batch_index_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class mini_batch_index_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_MINI_BATCH_INDEX_LAYER_INSTANTIATE diff --git a/include/lbann/layers/misc/mini_batch_size.hpp b/include/lbann/layers/misc/mini_batch_size.hpp index 3b20486de65..fd7dfce8041 100644 --- a/include/lbann/layers/misc/mini_batch_size.hpp +++ b/include/lbann/layers/misc/mini_batch_size.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_MINI_BATCH_SIZE_HPP_INCLUDED #define LBANN_LAYERS_MISC_MINI_BATCH_SIZE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -36,12 +36,13 @@ namespace lbann { * Output tensor is a 1D tensor with a single entry containing the * model's current mini-batch size. 
*/ -template -class mini_batch_size_layer : public Layer { +class mini_batch_size_layer : public data_type_layer { public: - mini_batch_size_layer(lbann_comm* comm) : Layer(comm) { + mini_batch_size_layer(lbann_comm* comm) : data_type_layer(comm) { this->m_expected_num_parent_layers = 0; } @@ -53,17 +54,17 @@ class mini_batch_size_layer : public Layer { protected: void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + data_type_layer::setup_dims(); + this->set_output_dims({1}); } void fp_setup_outputs(El::Int mini_batch_size) override { - Layer::fp_setup_outputs(mini_batch_size); + data_type_layer::fp_setup_outputs(mini_batch_size); m_mini_batch_size = mini_batch_size; } void fp_compute() override { - El::Fill(get_activations(), DataType(m_mini_batch_size)); + El::Fill(this->get_activations(), TensorDataType(m_mini_batch_size)); } private: @@ -75,14 +76,14 @@ class mini_batch_size_layer : public Layer { #ifndef LBANN_MINI_BATCH_SIZE_LAYER_INSTANTIATE extern template class mini_batch_size_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class mini_batch_size_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class mini_batch_size_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class mini_batch_size_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_MINI_BATCH_SIZE_LAYER_INSTANTIATE diff --git a/include/lbann/layers/misc/one_hot.hpp b/include/lbann/layers/misc/one_hot.hpp index c362e091ab3..1a3ff080beb 100644 --- a/include/lbann/layers/misc/one_hot.hpp +++ b/include/lbann/layers/misc/one_hot.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED #define LBANN_LAYERS_MISC_ONE_HOT_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -39,14 +39,14 @@ namespace lbann { * otherwise. If the input is outside @f$[0,\text{size})@f$, then the * output is all zeros. */ -template -class one_hot_layer : public Layer { +template +class one_hot_layer : public data_type_layer { static_assert(Layout == data_layout::DATA_PARALLEL, "one-hot layer only supports data-parallel layout"); public: - one_hot_layer(lbann_comm* comm, size_t size) : Layer(comm) { - set_output_dims({static_cast(size)}); + one_hot_layer(lbann_comm* comm, size_t size) : data_type_layer(comm) { + this->set_output_dims({static_cast(size)}); } one_hot_layer* copy() const override { return new one_hot_layer(*this); } std::string get_type() const override { return "one-hot"; } @@ -56,16 +56,16 @@ class one_hot_layer : public Layer { protected: void setup_dims() override { - Layer::setup_dims(); + data_type_layer::setup_dims(); // Make sure input tensor is scalar - if (get_input_size() != 1) { - const auto input_dims = get_input_dims(); + if (this->get_input_size() != 1) { + const auto input_dims = this->get_input_dims(); std::ostringstream dim_ss; for (size_t i = 0; i < input_dims.size(); ++i) { dim_ss << (i > 0 ? 
"x" : "") << input_dims[i]; } - LBANN_ERROR(get_type()," layer \"",get_name(),"\" ", + LBANN_ERROR(get_type()," layer \"",this->get_name(),"\" ", "received an input tensor with invalid dimensions ", "(expected 1, got ",dim_ss.str(),")"); } @@ -78,10 +78,10 @@ class one_hot_layer : public Layer { #ifndef LBANN_ONE_HOT_LAYER_INSTANTIATE extern template class one_hot_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class one_hot_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_ONE_HOT_LAYER_INSTANTIATE diff --git a/include/lbann/layers/misc/variance.hpp b/include/lbann/layers/misc/variance.hpp index c9839b446be..b71d032ddf6 100644 --- a/include/lbann/layers/misc/variance.hpp +++ b/include/lbann/layers/misc/variance.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED #define LBANN_LAYERS_MISC_VARIANCE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -42,20 +42,29 @@ namespace lbann { * Scaling by @f$ 1/n @f$ instead of @f$ 1/(n-1) @f$ is a biased * estimator. */ -template -class variance_layer : public Layer { +template +class variance_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: variance_layer(lbann_comm *comm, bool biased) - : Layer(comm), m_biased(biased) {} + : data_type_layer(comm), m_biased(biased) {} variance_layer(const variance_layer& other) - : Layer(other), + : data_type_layer(other), m_biased(other.m_biased), m_means(other.m_means ? other.m_means->Copy() : nullptr), m_workspace(other.m_workspace ? other.m_workspace->Copy() : nullptr) {} variance_layer& operator=(const variance_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_biased = other.m_biased; m_means.reset(other.m_means ? other.m_means->Copy() : nullptr); m_workspace.reset(other.m_workspace ? 
@@ -69,7 +78,7 @@ class variance_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("Biased", m_biased); return desc; } @@ -77,21 +86,21 @@ class variance_layer : public Layer { protected: void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist_data = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist_data = this->get_prev_activations().DistData(); dist_data.colDist = El::STAR; - m_means.reset(AbsDistMat::Instantiate(dist_data)); - m_workspace.reset(AbsDistMat::Instantiate(dist_data)); + m_means.reset(AbsDistMatrixType::Instantiate(dist_data)); + m_workspace.reset(AbsDistMatrixType::Instantiate(dist_data)); } void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); - if (get_input_size() <= 1) { + data_type_layer::setup_dims(); + this->set_output_dims({1}); + if (this->get_input_size() <= 1) { std::stringstream err; - const auto& parents = get_parent_layers(); - const auto& dims = get_input_dims(); - err << get_type() << " layer \"" << get_name() << "\" " + const auto& parents = this->get_parent_layers(); + const auto& dims = this->get_input_dims(); + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects an input tensor with at least two entries, " << "but parent layer \"" << parents[0]->get_name() << "\" " << "outputs a tensor with dimensions "; @@ -111,22 +120,22 @@ class variance_layer : public Layer { bool m_biased; /** Means for each mini-batch sample. */ - std::unique_ptr m_means; + std::unique_ptr m_means; /** Workspace. */ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; #ifndef LBANN_VARIANCE_LAYER_INSTANTIATE extern template class variance_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class variance_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class variance_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class variance_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_VARIANCE_LAYER_INSTANTIATE diff --git a/include/lbann/layers/regularizers/batch_normalization.hpp b/include/lbann/layers/regularizers/batch_normalization.hpp index a981d770f1e..ee733e4661e 100644 --- a/include/lbann/layers/regularizers/batch_normalization.hpp +++ b/include/lbann/layers/regularizers/batch_normalization.hpp @@ -54,16 +54,31 @@ enum class batch_normalization_stats_aggregation { * Shift." In International Conference on Machine Learning, * pp. 448-456. 2015. */ -template -class batch_normalization_layer : public regularizer_layer { +template +class batch_normalization_layer : public regularizer_layer { static_assert(T_layout == data_layout::DATA_PARALLEL, "batch normalization only supports DATA_PARALLEL"); +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. 
*/ + using WeightsType = data_type_weights; + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + ///@} + private: /** Decay rate for the running statistics. */ - DataType m_decay; + TensorDataType m_decay; /** Small number to avoid division by zero. */ - DataType m_epsilon; + TensorDataType m_epsilon; /** @brief Size of group to aggregate statistics over. * * If this is 1, the group consists of one process and aggregation @@ -80,24 +95,24 @@ class batch_normalization_layer : public regularizer_layer { * * These are fused for performance when doing non-local batchnorm. */ - std::unique_ptr m_mean_and_var; + std::unique_ptr m_mean_and_var; /** View of current mini-batch means. */ - std::unique_ptr m_mean_v; + std::unique_ptr m_mean_v; /** View of current mini-batch standard deviations. */ - std::unique_ptr m_var_v; + std::unique_ptr m_var_v; /** @brief Gradients w.r.t. means and standard deviations. * * These are fused for performance when doing non-local batchnorm. */ - std::unique_ptr m_mean_and_var_gradient; + std::unique_ptr m_mean_and_var_gradient; /** View of gradient w.r.t. means. */ - std::unique_ptr m_mean_gradient_v; + std::unique_ptr m_mean_gradient_v; /** View of gradient w.r.t. standard deviations. */ - std::unique_ptr m_var_gradient_v; + std::unique_ptr m_var_gradient_v; /** Gradient w.r.t. scaling terms. */ - std::unique_ptr m_scale_gradient; + std::unique_ptr m_scale_gradient; /** Gradient w.r.t. bias terms. */ - std::unique_ptr m_bias_gradient; + std::unique_ptr m_bias_gradient; public: /** @brief Set up batch normalization. @@ -110,10 +125,10 @@ class batch_normalization_layer : public regularizer_layer { * statistics over. Defaults to 1 (i.e. local aggregation). */ batch_normalization_layer(lbann_comm *comm, - DataType decay=0.9, - DataType epsilon=1e-5, + TensorDataType decay=0.9, + TensorDataType epsilon=1e-5, int statistics_group_size=1) - : regularizer_layer(comm), + : regularizer_layer(comm), m_decay(decay), m_epsilon(epsilon), m_statistics_group_size(statistics_group_size) { @@ -124,7 +139,7 @@ class batch_normalization_layer : public regularizer_layer { } batch_normalization_layer(const batch_normalization_layer& other) - : regularizer_layer(other), + : regularizer_layer(other), m_decay(other.m_decay), m_epsilon(other.m_epsilon), m_statistics_group_size(other.m_statistics_group_size), @@ -145,7 +160,7 @@ class batch_normalization_layer : public regularizer_layer { other.m_bias_gradient->Copy() : nullptr) {} batch_normalization_layer& operator=(const batch_normalization_layer& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_decay = other.m_decay; m_epsilon = other.m_epsilon; m_statistics_group_size = other.m_statistics_group_size; @@ -178,7 +193,7 @@ class batch_normalization_layer : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("Decay", m_decay); desc.add("Epsilon", m_epsilon); desc.add("Statistics group size", m_statistics_group_size); @@ -188,7 +203,7 @@ class batch_normalization_layer : public regularizer_layer { protected: void setup_matrices(const El::Grid& grid) override { - regularizer_layer::setup_matrices(grid); + regularizer_layer::setup_matrices(grid); m_mean_and_var.reset(new StarMat(grid)); m_mean_v.reset(new StarMat(grid)); m_var_v.reset(new 
StarMat(grid)); @@ -200,24 +215,24 @@ class batch_normalization_layer : public regularizer_layer { } void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + regularizer_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void setup_data() override { - regularizer_layer::setup_data(); - const auto& output_dims = get_output_dims(); + regularizer_layer::setup_data(); + const auto& output_dims = this->get_output_dims(); const auto& num_channels = output_dims[0]; // Display warning if mini-batch size is small - const auto& output = get_activations(); + const auto& output = this->get_activations(); const auto& mini_batch_size = output.Width(); const auto& local_mini_batch_size = mini_batch_size / output.DistSize(); if (m_statistics_group_size == 0 && mini_batch_size <= 4) { if (output.DistRank() == 0) { std::stringstream err; err << "LBANN warning: " - << get_type() << " layer \"" << get_name() << "\" " + << get_type() << " layer \"" << this->get_name() << "\" " << "is using global statistics and " << "the mini-batch size (" << mini_batch_size << ") " << "may be too small to get good statistics"; @@ -230,7 +245,7 @@ class batch_normalization_layer : public regularizer_layer { if (output.DistRank() == 0) { std::stringstream err; err << "LBANN warning: " - << get_type() << " layer \"" << get_name() << "\" " + << get_type() << " layer \"" << this->get_name() << "\" " << "is aggregating statistics over " << m_statistics_group_size << "processors and the aggregated mini-batch size (" @@ -241,55 +256,57 @@ class batch_normalization_layer : public regularizer_layer { } // Initialize default weights if none are provided - if (this->m_weights.size() > 4) { + if (this->num_weights() > 4) { std::stringstream err; - err << "attempted to setup layer \"" << m_name << "\" " + err << "attempted to setup layer \"" << this->m_name << "\" " << "with an invalid number of weights"; LBANN_ERROR(err.str()); } - this->m_weights.resize(4, nullptr); - if (this->m_weights[0] == nullptr) { - auto w = make_unique(get_comm()); - auto init = make_unique(DataType(1)); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_scale"); + this->set_num_data_type_weights(4); + if (!this->has_data_type_weights(0)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(TensorDataType(1)); + auto opt = to_unique_ptr(dynamic_cast( + this->m_model->create_optimizer())); + w->set_name(this->get_name() + "_scale"); w->set_initializer(std::move(init)); w->set_optimizer(std::move(opt)); - this->m_weights[0] = w.get(); + this->set_data_type_weights(0, w.get()); this->m_model->add_weights(std::move(w)); } - if (this->m_weights[1] == nullptr) { - auto w = make_unique(get_comm()); - auto init = make_unique(DataType(0)); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_bias"); + if (!this->has_data_type_weights(1)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(TensorDataType(0)); + auto opt = to_unique_ptr(dynamic_cast( + this->m_model->create_optimizer())); + w->set_name(this->get_name() + "_bias"); w->set_initializer(std::move(init)); w->set_optimizer(std::move(opt)); - this->m_weights[1] = w.get(); + this->set_data_type_weights(1, w.get()); this->m_model->add_weights(std::move(w)); } - if (this->m_weights[2] == nullptr) { - auto w = make_unique(get_comm()); - auto init = make_unique(DataType(0)); - w->set_name(get_name() + "_running_mean"); + if (!this->has_data_type_weights(2)) { 
+ auto w = make_unique(this->get_comm()); + auto init = make_unique>(TensorDataType(0)); + w->set_name(this->get_name() + "_running_mean"); w->set_initializer(std::move(init)); - this->m_weights[2] = w.get(); + this->set_data_type_weights(2, w.get()); this->m_model->add_weights(std::move(w)); } - if (this->m_weights[3] == nullptr) { - auto w = make_unique(get_comm()); - auto init = make_unique(DataType(1)); - w->set_name(get_name() + "_running_variance"); + if (!this->has_data_type_weights(3)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(TensorDataType(1)); + w->set_name(this->get_name() + "_running_variance"); w->set_initializer(std::move(init)); - this->m_weights[3] = w.get(); + this->set_data_type_weights(3, w.get()); this->m_model->add_weights(std::move(w)); } // Setup weights - auto dist = get_prev_activations().DistData(); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; dist.rowDist = El::STAR; - for (auto* w : this->m_weights) { + for (auto* w : this->get_data_type_weights()) { w->set_dims(num_channels); w->set_matrix_distribution(dist); } @@ -309,18 +326,18 @@ class batch_normalization_layer : public regularizer_layer { El::ALL, El::IR(1, 2)); // Initialize freeze state - for (auto&& w : this->m_weights) { - if (m_frozen) { + for (auto&& w : this->get_data_type_weights()) { + if (this->m_frozen) { w->freeze(); } else { w->unfreeze(); } } - for (auto&& w : this->m_weights) { - if (w->is_frozen() != m_frozen) { + for (auto&& w : this->get_data_type_weights()) { + if (w->is_frozen() != this->m_frozen) { std::stringstream err; - err << (m_frozen ? "" : "un") << "frozen " - << "layer \"" << get_name() << "\" has " + err << (this->m_frozen ? "" : "un") << "frozen " + << "layer \"" << this->get_name() << "\" has " << (w->is_frozen() ? "" : "un") << "frozen " << "weights \"" << w->get_name() << "\""; LBANN_ERROR(err.str()); @@ -336,10 +353,10 @@ class batch_normalization_layer : public regularizer_layer { #ifndef LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE extern template class batch_normalization_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class batch_normalization_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_BATCH_NORMALIZATION_LAYER_INSTANTIATE diff --git a/include/lbann/layers/regularizers/dropout.hpp b/include/lbann/layers/regularizers/dropout.hpp index 95f85f54498..dd60a02c5e0 100644 --- a/include/lbann/layers/regularizers/dropout.hpp +++ b/include/lbann/layers/regularizers/dropout.hpp @@ -45,13 +45,22 @@ namespace lbann { * prevent neural networks from overfitting." The Journal of Machine * Learning Research 15, no. 1 (2014): 1929-1958. */ -template -class dropout : public regularizer_layer { +template +class dropout : public regularizer_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: /** Keep units with probabiliy keep_prob. 
*/ dropout(lbann_comm *comm, EvalType keep_prob = EvalType(0.5)) - : regularizer_layer(comm), + : regularizer_layer(comm), m_keep_prob(keep_prob) #ifdef LBANN_HAS_CUDNN , m_dropout_cudnn_desc(nullptr), @@ -68,7 +77,7 @@ class dropout : public regularizer_layer { } dropout(const dropout& other) - : regularizer_layer(other), + : regularizer_layer(other), m_keep_prob(other.m_keep_prob), m_mask(other.m_mask ? other.m_mask->Copy() : nullptr) #ifdef LBANN_HAS_CUDNN @@ -87,7 +96,7 @@ class dropout : public regularizer_layer { } dropout& operator=(const dropout& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_keep_prob = other.m_keep_prob; m_mask = other.m_mask ? std::unique_ptr(other.m_mask->Copy()) : nullptr; #ifdef LBANN_HAS_CUDNN @@ -119,7 +128,7 @@ class dropout : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("Keep probability", m_keep_prob); return desc; } @@ -135,17 +144,17 @@ class dropout : public regularizer_layer { protected: void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + regularizer_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - regularizer_layer::setup_matrices(grid); - m_mask = std::unique_ptr(get_activations().Copy()); + regularizer_layer::setup_matrices(grid); + m_mask = std::unique_ptr(this->get_activations().Copy()); } void setup_gpu() override { - regularizer_layer::setup_gpu(); + regularizer_layer::setup_gpu(); #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else @@ -157,7 +166,7 @@ class dropout : public regularizer_layer { } void fp_compute () override { - if (using_gpus()) { + if (this->using_gpus()) { fp_compute_gpu(); } else { fp_compute_cpu(); @@ -165,7 +174,7 @@ class dropout : public regularizer_layer { } void bp_compute () override { - if (using_gpus()) { + if (this->using_gpus()) { bp_compute_gpu(); } else { bp_compute_cpu(); @@ -177,8 +186,8 @@ class dropout : public regularizer_layer { void fp_compute_cpu() { // Matrices - const auto& input = get_prev_activations(); - auto& output = get_activations(); + const auto& input = this->get_prev_activations(); + auto& output = this->get_activations(); // Do nothing if dropout is disabled const auto& mode = this->m_model->get_execution_context().get_execution_mode(); @@ -188,20 +197,20 @@ class dropout : public regularizer_layer { } // Construct mask matrix - const DataType scale = 1 / m_keep_prob; + const TensorDataType scale = 1 / m_keep_prob; const auto& height = input.Height(); const auto& width = input.Width(); m_mask->Resize(height, width); #ifdef LBANN_DETERMINISTIC - bernoulli_fill_procdet(*m_mask, height, width, DataType(m_keep_prob)); + bernoulli_fill_procdet(*m_mask, height, width, TensorDataType(m_keep_prob)); El::Scale(scale, *m_mask); #else El::EntrywiseMap(*m_mask, - (std::function) - ([this,scale](const DataType& z)->DataType { + (std::function) + ([this,scale](const TensorDataType& z)->DataType { auto& gen = get_fast_generator(); std::bernoulli_distribution dist(m_keep_prob); - return dist(gen) ? scale : DataType(0); + return dist(gen) ? scale : TensorDataType(0); })); #endif // LBANN_DETERMINISTIC @@ -212,8 +221,8 @@ class dropout : public regularizer_layer { /** Adjust gradients for dropout in backprop. 
*/ void bp_compute_cpu() { - const auto& gradient_wrt_output = get_prev_error_signals(); - auto& gradient_wrt_input = get_error_signals(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(); const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode != execution_mode::training || m_keep_prob < EvalType(0)) { El::Copy(gradient_wrt_output, gradient_wrt_input); @@ -228,9 +237,9 @@ class dropout : public regularizer_layer { #else // Matrices - const auto& input = get_prev_activations(); + const auto& input = this->get_prev_activations(); const auto& local_input = input.LockedMatrix(); - auto& output = get_activations(); + auto& output = this->get_activations(); auto& local_output = output.Matrix(); // Do nothing if dropout is disabled or there is no local data @@ -246,7 +255,7 @@ class dropout : public regularizer_layer { auto&& output_desc = m_tensors_cudnn_desc.get_activations(); size_t size; CHECK_CUDNN(cudnnDropoutGetReserveSpaceSize(input_desc, &size)); - m_reserve_space.Resize((size + sizeof(DataType) - 1) / sizeof(DataType), 1); + m_reserve_space.Resize((size + sizeof(TensorDataType) - 1) / sizeof(TensorDataType), 1); // Apply dropout on the GPU CHECK_CUDNN(cudnnDropoutForward(cudnn::get_handle(), @@ -256,7 +265,7 @@ class dropout : public regularizer_layer { output_desc, local_output.Buffer(), m_reserve_space.Buffer(), - m_reserve_space.Height() * sizeof(DataType))); + m_reserve_space.Height() * sizeof(TensorDataType))); #endif // LBANN_HAS_CUDNN } @@ -267,9 +276,9 @@ class dropout : public regularizer_layer { #else // Matrices - const auto& gradient_wrt_output = get_prev_error_signals(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); const auto& local_gradient_wrt_output = gradient_wrt_output.LockedMatrix(); - auto& gradient_wrt_input = get_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(); auto& local_gradient_wrt_input = gradient_wrt_input.Matrix(); // Copy error signal if dropout is disabled @@ -286,7 +295,7 @@ class dropout : public regularizer_layer { m_tensors_cudnn_desc.get_error_signals(), local_gradient_wrt_input.Buffer(), m_reserve_space.Buffer(), - m_reserve_space.Height() * sizeof(DataType))); + m_reserve_space.Height() * sizeof(TensorDataType))); } } #endif // LBANN_HAS_CUDNN @@ -306,7 +315,7 @@ class dropout : public regularizer_layer { // Setup RNG state size_t size; CHECK_CUDNN(cudnnDropoutGetStatesSize(cudnn::get_handle(), &size)); - m_states.Resize((size + sizeof(DataType) - 1) / sizeof(DataType), 1); + m_states.Resize((size + sizeof(TensorDataType) - 1) / sizeof(TensorDataType), 1); // Setup dropout descriptor CHECK_CUDNN(cudnnCreateDropoutDescriptor(&m_dropout_cudnn_desc)); @@ -314,7 +323,7 @@ class dropout : public regularizer_layer { cudnn::get_handle(), float(1 - m_keep_prob), m_states.Buffer(), - m_states.Height() * sizeof(DataType), + m_states.Height() * sizeof(TensorDataType), get_generator()())); } @@ -323,27 +332,27 @@ class dropout : public regularizer_layer { /** Probability of keeping each unit. */ EvalType m_keep_prob; /** Current dropout mask (a scaled Bernoulli random matrix). */ - std::unique_ptr m_mask; + std::unique_ptr m_mask; #ifdef LBANN_HAS_CUDNN /** Dropout cuDNN descriptor. */ cudnnDropoutDescriptor_t m_dropout_cudnn_desc; /** Tensor cuDNN descriptors. 
*/ - cudnn::entrywise_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::entrywise_layer_tensor_manager m_tensors_cudnn_desc; /** RNG state for cuDNN dropout. */ - GPUMat m_states; + El::Matrix m_states; /** Work space for cuDNN dropout. */ - GPUMat m_reserve_space; + El::Matrix m_reserve_space; #endif // LBANN_HAS_CUDNN }; #ifndef LBANN_DROPOUT_LAYER_INSTANTIATE -extern template class dropout; -extern template class dropout; +extern template class dropout; +extern template class dropout; #ifdef LBANN_HAS_GPU -extern template class dropout; -extern template class dropout; +extern template class dropout; +extern template class dropout; #endif // LBANN_HAS_GPU #endif // LBANN_DROPOUT_LAYER_INSTANTIATE diff --git a/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp index 7bf925519c4..9996bbcc1e3 100644 --- a/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp +++ b/include/lbann/layers/regularizers/entrywise_batch_normalization.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED #define LBANN_LAYERS_REGULARIZERS_ENTRYWISE_BATCH_NORMALIZATION_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include "lbann/models/model.hpp" #include "lbann/utils/memory.hpp" @@ -45,17 +45,29 @@ namespace lbann { * Shift." In International Conference on Machine Learning, * pp. 448-456. 2015. */ -template -class entrywise_batch_normalization_layer : public Layer { +template +class entrywise_batch_normalization_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} + public: entrywise_batch_normalization_layer(lbann_comm* comm, - DataType decay=0.9, - DataType epsilon=1e-5) - : Layer(comm), m_decay(decay), m_epsilon(epsilon) {} + TensorDataType decay=0.9, + TensorDataType epsilon=1e-5) + : data_type_layer(comm), m_decay(decay), m_epsilon(epsilon) {} entrywise_batch_normalization_layer(const entrywise_batch_normalization_layer& other) - : Layer(other), + : data_type_layer(other), m_decay(other.m_decay), m_epsilon(other.m_epsilon), m_batch_statistics(other.m_batch_statistics ? @@ -66,7 +78,7 @@ class entrywise_batch_normalization_layer : public Layer { nullptr) {} entrywise_batch_normalization_layer& operator=(const entrywise_batch_normalization_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_decay = other.m_decay; m_epsilon = other.m_epsilon; m_batch_statistics.reset(other.m_batch_statistics ? 
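
For context on the `m_decay` member and the `_running_mean` / `_running_variance` weights wired up in the surrounding hunks: batch-normalization layers conventionally maintain those weights as an exponential moving average of the per-mini-batch statistics. The update itself lives in `fp_compute()`, which these headers do not show, so the following is only a sketch of the conventional rule under that assumption, not LBANN's actual kernel:

```cpp
#include <cstddef>
#include <vector>

// Hypothetical helper, not an LBANN function: applies
//   running <- decay * running + (1 - decay) * batch
// entrywise to the running mean and variance estimates.
void update_running_stats(std::vector<float>& running_mean,
                          std::vector<float>& running_var,
                          const std::vector<float>& batch_mean,
                          const std::vector<float>& batch_var,
                          float decay) {
  for (std::size_t i = 0; i < running_mean.size(); ++i) {
    running_mean[i] = decay * running_mean[i] + (1 - decay) * batch_mean[i];
    running_var[i]  = decay * running_var[i]  + (1 - decay) * batch_var[i];
  }
}
```

With `decay = 0.9` (the constructor default above), the running estimates forget roughly 10% of their history per mini-batch.
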
@@ -84,7 +96,7 @@ class entrywise_batch_normalization_layer : public Layer { El::Device get_device_allocation() const override { return Device; } description get_description() const override { - auto desc = Layer::get_description(); + auto desc = data_type_layer::get_description(); desc.add("Decay", m_decay); desc.add("Epsilon", m_epsilon); return desc; @@ -93,51 +105,51 @@ class entrywise_batch_normalization_layer : public Layer { protected: void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); dist.rowDist = El::STAR; - m_batch_statistics.reset(AbsDistMat::Instantiate(dist)); - m_batch_statistics_gradient.reset(AbsDistMat::Instantiate(dist)); + m_batch_statistics.reset(AbsDistMatrixType::Instantiate(dist)); + m_batch_statistics_gradient.reset(AbsDistMatrixType::Instantiate(dist)); } void setup_data() override { - Layer::setup_data(); + data_type_layer::setup_data(); // Initialize output dimensions - set_output_dims(get_input_dims()); - const auto output_dims = get_output_dims(); - const auto output_size = get_output_size(); + this->set_output_dims(this->get_input_dims()); + const auto output_dims = this->get_output_dims(); + const auto output_size = this->get_output_size(); // Initialize default weights if none are provided - if (this->m_weights.size() > 2) { + if (this->num_weights() > 2) { std::stringstream err; - err << "attempted to setup layer \"" << m_name << "\" " + err << "attempted to setup layer \"" << this->get_name() << "\" " << "with an invalid number of weights " - << "(found " << this->m_weights.size() << ", expected 2)"; + << "(found " << this->num_weights() << ", expected 2)"; LBANN_ERROR(err.str()); } - this->m_weights.resize(2, nullptr); - if (this->m_weights[0] == nullptr) { - auto w = make_unique(get_comm()); - auto init = make_unique(DataType{0}); - w->set_name(get_name() + "_running_mean"); + this->set_num_data_type_weights(2); + if (!this->has_data_type_weights(0)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(TensorDataType{0}); + w->set_name(this->get_name() + "_running_mean"); w->set_initializer(std::move(init)); - this->m_weights[0] = w.get(); + this->set_data_type_weights(0, w.get()); this->m_model->add_weights(std::move(w)); } - if (this->m_weights[1] == nullptr) { - auto w = make_unique(get_comm()); - auto init = make_unique(DataType{1}); - w->set_name(get_name() + "_running_variance"); + if (!this->has_data_type_weights(1)) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(TensorDataType{1}); + w->set_name(this->get_name() + "_running_variance"); w->set_initializer(std::move(init)); - this->m_weights[1] = w.get(); + this->set_data_type_weights(1, w.get()); this->m_model->add_weights(std::move(w)); } // Setup weights - auto dist = get_prev_activations().DistData(); + auto dist = this->get_prev_activations().DistData(); dist.rowDist = El::STAR; - for (auto* w : this->m_weights) { + for (auto* w : this->get_data_type_weights()) { w->set_dims(output_dims); w->set_matrix_distribution(dist); } @@ -151,9 +163,9 @@ class entrywise_batch_normalization_layer : public Layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - Layer::fp_setup_outputs(mini_batch_size); - const auto& input = get_prev_activations(); - const auto input_size = get_input_size(); + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& input = 
this->get_prev_activations(); + const auto input_size = this->get_input_size(); // Make sure batch statistics tensor is aligned with input tensor m_batch_statistics->Empty(false); @@ -166,8 +178,8 @@ class entrywise_batch_normalization_layer : public Layer { /// @todo Realign tensors if misaligned bool aligned = true; try { - const auto& running_mean = m_weights[0]->get_values(); - const auto& running_var = m_weights[1]->get_values(); + const auto& running_mean = get_data_type_weights(0).get_values(); + const auto& running_var = get_data_type_weights(1).get_values(); aligned = (input.ColAlign() == running_mean.ColAlign() && input.RowAlign() == running_mean.RowAlign() && input.ColAlign() == running_var.ColAlign() @@ -190,10 +202,10 @@ class entrywise_batch_normalization_layer : public Layer { } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - Layer::bp_setup_gradient_wrt_inputs(mini_batch_size); + data_type_layer::bp_setup_gradient_wrt_inputs(mini_batch_size); m_batch_statistics_gradient->Empty(false); - m_batch_statistics_gradient->AlignWith(get_prev_activations()); - m_batch_statistics_gradient->Resize(get_input_size(), 2); + m_batch_statistics_gradient->AlignWith(this->get_prev_activations()); + m_batch_statistics_gradient->Resize(this->get_input_size(), 2); } void fp_compute() override; @@ -202,33 +214,33 @@ class entrywise_batch_normalization_layer : public Layer { private: /** Decay rate for the running statistics. */ - DataType m_decay; + TensorDataType m_decay; /** Small number to avoid division by zero. */ - DataType m_epsilon; + TensorDataType m_epsilon; /** @brief Current mini-batch statistics. * * These are fused for performance when doing non-local batchnorm. */ - std::unique_ptr m_batch_statistics; + std::unique_ptr m_batch_statistics; /** @brief Gradients w.r.t. current mini-batch statistics. * * These are fused for performance when doing non-local batchnorm. */ - std::unique_ptr m_batch_statistics_gradient; + std::unique_ptr m_batch_statistics_gradient; }; #ifndef LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE extern template class entrywise_batch_normalization_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class entrywise_batch_normalization_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class entrywise_batch_normalization_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class entrywise_batch_normalization_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_ENTRYWISE_BATCH_NORMALIZATION_LAYER_INSTANTIATE diff --git a/include/lbann/layers/regularizers/layer_norm.hpp b/include/lbann/layers/regularizers/layer_norm.hpp index b2c85b873be..af8f7630e52 100644 --- a/include/lbann/layers/regularizers/layer_norm.hpp +++ b/include/lbann/layers/regularizers/layer_norm.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED #define LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" #include @@ -46,15 +46,15 @@ namespace lbann { * reproduce that functionality. 
* */ -template -class layer_norm_layer : public Layer { +template +class layer_norm_layer : public data_type_layer { public: /** * @param comm LBANN communicator * @param epsilon Small number to avoid division by zero */ - layer_norm_layer(lbann_comm* comm, DataType epsilon=1e-5); + layer_norm_layer(lbann_comm* comm, TensorDataType epsilon=1e-5); layer_norm_layer(const layer_norm_layer& other); layer_norm_layer& operator=(const layer_norm_layer& other); @@ -77,19 +77,21 @@ class layer_norm_layer : public Layer { private: + using AbsDistMatType = El::AbstractDistMatrix; + /** Small number to avoid division by zero. */ - DataType m_epsilon; + TensorDataType m_epsilon; /** @brief Per-sample statistics. * * The means and variances are fused for performance. */ - std::unique_ptr m_statistics; + std::unique_ptr m_statistics; /** @brief Gradients w.r.t. per-sample statistics. * * The means and variances are fused for performance. */ - std::unique_ptr m_statistics_gradient; + std::unique_ptr m_statistics_gradient; }; @@ -97,17 +99,17 @@ class layer_norm_layer : public Layer { // Implementation // ========================================================= -template -layer_norm_layer::layer_norm_layer( +template +layer_norm_layer::layer_norm_layer( lbann_comm* comm, - DataType epsilon) - : Layer(comm), m_epsilon(epsilon) + TensorDataType epsilon) + : data_type_layer(comm), m_epsilon(epsilon) {} -template -layer_norm_layer::layer_norm_layer( - const layer_norm_layer& other) - : Layer(other), +template +layer_norm_layer::layer_norm_layer( + const layer_norm_layer& other) + : data_type_layer(other), m_epsilon(other.m_epsilon), m_statistics(other.m_statistics ? other.m_statistics->Copy() @@ -117,10 +119,10 @@ layer_norm_layer::layer_norm_layer( : nullptr) {} -template -layer_norm_layer& layer_norm_layer::operator=( - const layer_norm_layer& other) { - Layer::operator=(other); +template +layer_norm_layer& layer_norm_layer::operator=( + const layer_norm_layer& other) { + data_type_layer::operator=(other); m_epsilon = other.m_epsilon; m_statistics.reset(other.m_statistics ? 
other.m_statistics->Copy() @@ -131,61 +133,61 @@ layer_norm_layer& layer_norm_layer::operator=( return *this; } -template -layer_norm_layer* layer_norm_layer::copy() const { +template +layer_norm_layer* layer_norm_layer::copy() const { return new layer_norm_layer(*this); } -template -std::string layer_norm_layer::get_type() const { +template +std::string layer_norm_layer::get_type() const { return "layer norm"; } -template -data_layout layer_norm_layer::get_data_layout() const { +template +data_layout layer_norm_layer::get_data_layout() const { return Layout; } -template -El::Device layer_norm_layer::get_device_allocation() const { +template +El::Device layer_norm_layer::get_device_allocation() const { return Device; } -template -description layer_norm_layer::get_description() const { - auto desc = Layer::get_description(); +template +description layer_norm_layer::get_description() const { + auto desc = data_type_layer::get_description(); desc.add("Epsilon", m_epsilon); return desc; } -template -void layer_norm_layer::setup_dims() { - Layer::setup_dims(); +template +void layer_norm_layer::setup_dims() { + data_type_layer::setup_dims(); this->set_output_dims(this->get_input_dims()); } -template -void layer_norm_layer::setup_matrices(const El::Grid& grid) { - Layer::setup_matrices(grid); - auto dist = get_prev_activations().DistData(); +template +void layer_norm_layer::setup_matrices(const El::Grid& grid) { + data_type_layer::setup_matrices(grid); + auto dist = this->get_prev_activations().DistData(); dist.colDist = El::STAR; m_statistics.reset(AbsDistMat::Instantiate(dist)); m_statistics_gradient.reset(AbsDistMat::Instantiate(dist)); } -template -void layer_norm_layer::fp_setup_outputs(El::Int mini_batch_size) { - Layer::fp_setup_outputs(mini_batch_size); - const auto& input = get_prev_activations(); +template +void layer_norm_layer::fp_setup_outputs(El::Int mini_batch_size) { + data_type_layer::fp_setup_outputs(mini_batch_size); + const auto& input = this->get_prev_activations(); m_statistics->Empty(false); m_statistics->AlignWith(input); m_statistics->Resize(2, input.Width()); } -template -void layer_norm_layer::bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { - Layer::bp_setup_gradient_wrt_inputs(mini_batch_size); - const auto& input = get_prev_activations(); +template +void layer_norm_layer::bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) { + data_type_layer::bp_setup_gradient_wrt_inputs(mini_batch_size); + const auto& input = this->get_prev_activations(); m_statistics_gradient->Empty(false); m_statistics_gradient->AlignWith(input); m_statistics_gradient->Resize(2, input.Width()); @@ -197,14 +199,14 @@ void layer_norm_layer::bp_setup_gradient_wrt_inputs(El::Int mini_ #ifndef LBANN_LAYER_NORM_LAYER_INSTANTIATE extern template class layer_norm_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class layer_norm_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class layer_norm_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class layer_norm_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_LAYER_NORM_LAYER_INSTANTIATE diff --git a/include/lbann/layers/regularizers/local_response_normalization.hpp 
b/include/lbann/layers/regularizers/local_response_normalization.hpp index 7866ef96282..01e06108330 100644 --- a/include/lbann/layers/regularizers/local_response_normalization.hpp +++ b/include/lbann/layers/regularizers/local_response_normalization.hpp @@ -43,19 +43,20 @@ namespace lbann { * Advances in Neural Information Processing Systems, * pp. 1097-1105. 2012. */ -template -class local_response_normalization_layer : public regularizer_layer { +class local_response_normalization_layer : public regularizer_layer { static_assert(T_layout == data_layout::DATA_PARALLEL, "local_response_normalization only supports DATA_PARALLEL"); public: local_response_normalization_layer(lbann_comm *comm, int window_width, - DataType alpha, - DataType beta, - DataType k) - : regularizer_layer(comm), + TensorDataType alpha, + TensorDataType beta, + TensorDataType k) + : regularizer_layer(comm), m_window_width(window_width), m_alpha(alpha), m_beta(beta), m_k(k) #ifdef LBANN_HAS_CUDNN , m_lrn_cudnn_desc(nullptr), @@ -64,7 +65,7 @@ class local_response_normalization_layer : public regularizer_layer { { } local_response_normalization_layer(const local_response_normalization_layer& other) - : regularizer_layer(other), + : regularizer_layer(other), m_window_width(other.m_window_width), m_alpha(other.m_alpha), m_beta(other.m_beta), @@ -87,7 +88,7 @@ class local_response_normalization_layer : public regularizer_layer { } local_response_normalization_layer& operator=(const local_response_normalization_layer& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_window_width = other.m_window_width; m_alpha = other.m_alpha; m_beta = other.m_beta; @@ -129,7 +130,7 @@ class local_response_normalization_layer : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = regularizer_layer::get_description(); + auto desc = regularizer_layer::get_description(); desc.add("alpha", m_alpha); desc.add("beta", m_beta); desc.add("k", m_k); @@ -139,13 +140,13 @@ class local_response_normalization_layer : public regularizer_layer { protected: void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + regularizer_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } /// Initialize GPU objects void setup_gpu() override { - regularizer_layer::setup_gpu(); + regularizer_layer::setup_gpu(); #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else @@ -179,17 +180,17 @@ class local_response_normalization_layer : public regularizer_layer { /** Normalization window width. */ int m_window_width; /** LRN alpha scaling parameter. */ - DataType m_alpha; + TensorDataType m_alpha; /** LRN beta power parameter. */ - DataType m_beta; + TensorDataType m_beta; /** LRN k parameter. */ - DataType m_k; + TensorDataType m_k; #ifdef LBANN_HAS_CUDNN /** LRN cuDNN descriptor. */ cudnnLRNDescriptor_t m_lrn_cudnn_desc; /** Tensor cuDNN descriptors. 
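Both CPU compute paths further down special-case the default beta = 0.75, replacing a pow() call with two square roots of the reciprocal scale. The identity being exploited is s^(-3/4) = sqrt(t * sqrt(t)) with t = 1/s. A standalone numeric check (the sample values are made up; only the identity matters):

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    int main() {
      const double k = 2.0, alpha = 1e-4, beta = 0.75;
      const double x = 1.5, sum_sq = 40.0;        // made-up input and window sum
      const double s = k + alpha * sum_sq;        // k + alpha * sum(x^2)
      const double general = x * std::pow(s, -beta);
      const double t = 1.0 / s;
      const double fast = x * std::sqrt(t * std::sqrt(t));
      std::printf("general=%.12f fast=%.12f\n", general, fast);
      assert(std::fabs(general - fast) < 1e-12);  // identical up to round-off
      return 0;
    }

Two sqrt calls are considerably cheaper than a general pow, which is why the code bothers to branch on the default parameter.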
*/ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN /// GPU implementation of forward propagation @@ -197,11 +198,11 @@ class local_response_normalization_layer : public regularizer_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); if (local_input.Height() > 0 && local_input.Width() > 0) { - const DataType zero = DataType(0); - const DataType one = DataType(1); + const TensorDataType zero = TensorDataType(0); + const TensorDataType one = TensorDataType(1); CHECK_CUDNN(cudnnLRNCrossChannelForward(cudnn::get_handle(), m_lrn_cudnn_desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, @@ -220,13 +221,13 @@ class local_response_normalization_layer : public regularizer_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_input = this->get_local_prev_activations(); + const auto& local_output = this->get_local_activations(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); if (local_input.Height() > 0 && local_input.Width() > 0) { - const DataType zero = DataType(0); - const DataType one = DataType(1); + const TensorDataType zero = TensorDataType(0); + const TensorDataType one = TensorDataType(1); CHECK_CUDNN(cudnnLRNCrossChannelBackward(cudnn::get_handle(), m_lrn_cudnn_desc, CUDNN_LRN_CROSS_CHANNEL_DIM1, @@ -248,20 +249,20 @@ class local_response_normalization_layer : public regularizer_layer { void fp_compute_cpu() { // Local matrices - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); // Matrix parameters const int local_width = local_input.Width(); - const DataType* input_buffer = local_input.LockedBuffer(); + const TensorDataType* input_buffer = local_input.LockedBuffer(); const int input_ldim = local_input.LDim(); - DataType* output_buffer = local_output.Buffer(); + TensorDataType* output_buffer = local_output.Buffer(); const int output_ldim = local_output.LDim(); // Get LRN parameters - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_channel = get_output_size() / num_channels; + const int num_per_channel = this->get_output_size() / num_channels; // Check if LRN is using default beta parameter const bool default_beta = (std::fabs((m_beta - 0.75) / 0.75) @@ -283,7 +284,7 @@ class local_response_normalization_layer : public regularizer_layer { block_start += max_block_size) { const int block_size = std::min(max_block_size, num_per_channel - block_start); - DataType workspace[max_block_size]; + TensorDataType workspace[max_block_size]; // Iterate through channels for (int channel = 0; channel < num_channels; ++channel) { @@ -291,11 +292,11 @@ class local_response_normalization_layer : public 
regularizer_layer { const int window_end = std::min(channel + m_window_width / 2, num_channels - 1); // Compute sum of squares in workspace - std::fill(workspace, workspace + block_size, DataType(0)); + std::fill(workspace, workspace + block_size, TensorDataType(0)); for (int window_pos = window_start; window_pos <= window_end; ++window_pos) { for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + window_pos * num_per_channel; - const DataType input_entry = input_buffer[index + sample * input_ldim]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; workspace[block_pos] += input_entry * input_entry; } } @@ -308,9 +309,9 @@ class local_response_normalization_layer : public regularizer_layer { // Compute output for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + channel * num_per_channel; - const DataType scale_factor = workspace[block_pos]; - const DataType input_entry = input_buffer[index + sample * input_ldim]; - DataType& output_entry = output_buffer[index + sample * output_ldim]; + const TensorDataType scale_factor = workspace[block_pos]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; + TensorDataType& output_entry = output_buffer[index + sample * output_ldim]; if (default_beta) { // Special case when beta = 0.75 output_entry = (input_entry * std::sqrt(scale_factor * std::sqrt(scale_factor))); @@ -331,26 +332,26 @@ class local_response_normalization_layer : public regularizer_layer { void bp_compute_cpu() { // Get local matrices - const auto& local_input = get_local_prev_activations(); - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_input = this->get_local_prev_activations(); + const auto& local_output = this->get_local_activations(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); // Get matrix buffers const int local_width = local_input.Width(); - const DataType* input_buffer = local_input.LockedBuffer(); + const TensorDataType* input_buffer = local_input.LockedBuffer(); const int input_ldim = local_input.LDim(); - const DataType* output_buffer = local_output.LockedBuffer(); + const TensorDataType* output_buffer = local_output.LockedBuffer(); const int output_ldim = local_output.LDim(); - const DataType* gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(); + const TensorDataType* gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(); const int gradient_wrt_output_ldim = local_gradient_wrt_output.LDim(); - DataType* gradient_wrt_input_buffer = local_gradient_wrt_input.Buffer(); + TensorDataType* gradient_wrt_input_buffer = local_gradient_wrt_input.Buffer(); const int gradient_wrt_input_ldim = local_gradient_wrt_input.LDim(); // Get LRN parameters - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_channel = get_output_size() / num_channels; + const int num_per_channel = this->get_output_size() / num_channels; // Check if LRN is using default beta parameter const bool default_beta = (std::fabs((m_beta - 0.75) / 0.75) @@ -376,7 +377,7 @@ class local_response_normalization_layer : public regularizer_layer { block_start += 
max_block_size) { const int block_size = std::min(max_block_size, num_per_channel - block_start); - DataType workspace[max_block_size]; + TensorDataType workspace[max_block_size]; // Iterate through channels for (int channel = 0; channel < num_channels; ++channel) { @@ -384,11 +385,11 @@ class local_response_normalization_layer : public regularizer_layer { const int window_end = std::min(channel + m_window_width / 2, num_channels - 1); // Compute sum of squares in workspace - std::fill(workspace, workspace + block_size, DataType(0)); + std::fill(workspace, workspace + block_size, TensorDataType(0)); for (int window_pos = window_start; window_pos <= window_end; ++window_pos) { for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + window_pos * num_per_channel; - const DataType input_entry = input_buffer[index + sample * input_ldim]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; workspace[block_pos] += input_entry * input_entry; } } @@ -401,10 +402,10 @@ class local_response_normalization_layer : public regularizer_layer { // Compute error signal contribution for current entry for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + channel * num_per_channel; - const DataType scale_factor = workspace[block_pos]; - const DataType gradient_wrt_output_entry + const TensorDataType scale_factor = workspace[block_pos]; + const TensorDataType gradient_wrt_output_entry = gradient_wrt_output_buffer[index + sample * gradient_wrt_output_ldim]; - DataType& gradient_wrt_input_entry + TensorDataType& gradient_wrt_input_entry = gradient_wrt_input_buffer[index + sample * gradient_wrt_input_ldim]; if (default_beta) { // Special case when beta = 0.75 gradient_wrt_input_entry @@ -419,8 +420,8 @@ class local_response_normalization_layer : public regularizer_layer { // Compute y * dy / (k + alpha * sum(x^2) ) in workspace for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + channel * num_per_channel; - const DataType output_entry = output_buffer[index + sample * output_ldim]; - const DataType gradient_wrt_output_entry + const TensorDataType output_entry = output_buffer[index + sample * output_ldim]; + const TensorDataType gradient_wrt_output_entry = gradient_wrt_output_buffer[index + sample * gradient_wrt_output_ldim]; workspace[block_pos] = (-2 * m_alpha * m_beta * workspace[block_pos] * output_entry * gradient_wrt_output_entry); @@ -430,7 +431,7 @@ class local_response_normalization_layer : public regularizer_layer { for (int window_pos = window_start; window_pos <= window_end; ++window_pos) { for (int block_pos = 0; block_pos < block_size; ++block_pos) { const int index = block_start + block_pos + window_pos * num_per_channel; - const DataType input_entry = input_buffer[index + sample * input_ldim]; + const TensorDataType input_entry = input_buffer[index + sample * input_ldim]; gradient_wrt_input_buffer[index + sample * gradient_wrt_input_ldim] += workspace[block_pos] * input_entry; } @@ -447,10 +448,10 @@ class local_response_normalization_layer : public regularizer_layer { #ifndef LBANN_LOCAL_RESPONSE_NORMALIZATION_LAYER_INSTANTIATE extern template class local_response_normalization_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class local_response_normalization_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + 
  DataType, data_layout::DATA_PARALLEL, El::Device::GPU>;
#endif // LBANN_HAS_GPU
#endif // LBANN_LOCAL_RESPONSE_NORMALIZATION_LAYER_INSTANTIATE
diff --git a/include/lbann/layers/regularizers/regularizer.hpp b/include/lbann/layers/regularizers/regularizer.hpp
index c01b892c820..51966d28258 100644
--- a/include/lbann/layers/regularizers/regularizer.hpp
+++ b/include/lbann/layers/regularizers/regularizer.hpp
@@ -26,16 +26,17 @@
 #ifndef LBANN_LAYER_REGULARIZER_HPP_INCLUDED
 #define LBANN_LAYER_REGULARIZER_HPP_INCLUDED

-#include "lbann/layers/layer.hpp"
+#include "lbann/layers/data_type_layer.hpp"

 namespace lbann {

 /** @todo Remove. Layers should inherit directly from the base layer
  *  class.
  */
-class regularizer_layer : public Layer {
+template <typename TensorDataType>
+class regularizer_layer : public data_type_layer<TensorDataType> {
 public:
-  regularizer_layer(lbann_comm *comm) : Layer(comm) {}
+  regularizer_layer(lbann_comm *comm) : data_type_layer<TensorDataType>(comm) {}
 };

 } // namespace lbann
diff --git a/include/lbann/layers/regularizers/selu_dropout.hpp b/include/lbann/layers/regularizers/selu_dropout.hpp
index 428493f732e..c9488c27b76 100644
--- a/include/lbann/layers/regularizers/selu_dropout.hpp
+++ b/include/lbann/layers/regularizers/selu_dropout.hpp
@@ -40,15 +40,24 @@ namespace lbann {
  *  Hochreiter. "Self-normalizing neural networks." In Advances in
  *  Neural Information Processing Systems, pp. 971-980. 2017.
  */
-template <data_layout T_layout, El::Device Dev>
-class selu_dropout : public regularizer_layer {
+template <typename TensorDataType, data_layout T_layout, El::Device Dev>
+class selu_dropout : public regularizer_layer<TensorDataType> {
+public:
+  /** @name Public Types */
+  ///@{
+
+  /** @brief The tensor type expected in this object. */
+  using AbsDistMatrixType = El::AbstractDistMatrix<TensorDataType>;
+
+  ///@}
+
 public:
   /** Keep units with probabiliy keep_prob. */
   selu_dropout(lbann_comm *comm,
-               float keep_prob=0.95f,
-               DataType alpha = DataType(1.6732632423543772848170429916717),
-               DataType scale = DataType(1.0507009873554804934193349852946)) :
-    regularizer_layer(comm),
+               TensorDataType keep_prob = TensorDataType(0.95f),
+               TensorDataType alpha = TensorDataType(1.6732632423543772848170429916717),
+               TensorDataType scale = TensorDataType(1.0507009873554804934193349852946)) :
+    regularizer_layer<TensorDataType>(comm),
     m_keep_prob(keep_prob),
     m_mask(nullptr) {
 #ifdef LBANN_DETERMINISTIC
@@ -57,13 +66,13 @@ class selu_dropout : public regularizer_layer {
     // Compute alpha' and the affine transform.
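The constructor body that follows implements the keep-probability correction from the SELU paper cited above. As a standalone arithmetic check of those three assignments (same formulas as the diff, evaluated at the default keep_prob = 0.95):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double alpha = 1.6732632423543772848170429916717;
      const double scale = 1.0507009873554804934193349852946;
      const double q = 0.95;                      // keep probability
      const double alpha_prime = -scale * alpha;  // SELU's negative saturation value
      double a = q + alpha_prime * alpha_prime * q * (1.0 - q);
      a = 1.0 / std::sqrt(a);
      const double b = -a * alpha_prime * (1.0 - q);
      std::printf("alpha' = %.6f  a = %.6f  b = %.6f\n", alpha_prime, a, b);
    }

For the defaults this gives alpha' of about -1.7581, a of about 0.9549, and b of about 0.0839, the affine correction that keeps the post-dropout activations at zero mean and unit variance.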
m_alpha_prime = -scale*alpha; m_a = keep_prob + - m_alpha_prime*m_alpha_prime*keep_prob*(DataType(1) - keep_prob); - m_a = DataType(1) / std::sqrt(m_a); - m_b = -m_a * m_alpha_prime*(DataType(1) - keep_prob); + m_alpha_prime*m_alpha_prime*keep_prob*(TensorDataType(1) - keep_prob); + m_a = TensorDataType(1) / std::sqrt(m_a); + m_b = -m_a * m_alpha_prime*(TensorDataType(1) - keep_prob); } selu_dropout(const selu_dropout& other) : - regularizer_layer(other), + regularizer_layer(other), m_alpha_prime(other.m_alpha_prime), m_a(other.m_a), m_b(other.m_b), @@ -73,7 +82,7 @@ class selu_dropout : public regularizer_layer { } selu_dropout& operator=(const selu_dropout& other) { - regularizer_layer::operator=(other); + regularizer_layer::operator=(other); m_alpha_prime = other.m_alpha_prime; m_a = other.m_a; m_b = other.m_b; @@ -97,14 +106,14 @@ class selu_dropout : public regularizer_layer { El::Device get_device_allocation() const override { return Dev; } void setup_dims() override { - regularizer_layer::setup_dims(); - set_output_dims(get_input_dims()); + regularizer_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - regularizer_layer::setup_matrices(grid); + regularizer_layer::setup_matrices(grid); if (m_mask != nullptr) { delete m_mask; } - m_mask = get_activations().Copy(); + m_mask = this->get_activations().Copy(); } protected: @@ -113,17 +122,17 @@ class selu_dropout : public regularizer_layer { if (this->m_model->get_execution_context().get_execution_mode() != execution_mode::training || m_keep_prob < 0.0f) { // Do nothing if dropout is disabled - El::Copy(get_prev_activations(), get_activations()); + El::Copy(this->get_prev_activations(), this->get_activations()); } else { - const auto *input_acts = &get_prev_activations(); + const auto *input_acts = &this->get_prev_activations(); const El::Int height = input_acts->Height(); const El::Int width = input_acts->Width(); const El::Int local_height = input_acts->LocalHeight(); const El::Int local_width = input_acts->LocalWidth(); const auto& local_input_acts = input_acts->LockedMatrix(); - Mat& local_output_acts = get_local_activations(); + Mat& local_output_acts = this->get_local_activations(); Mat& local_mask = m_mask->Matrix(); // Construct and apply mask and the affine transform. @@ -144,11 +153,11 @@ class selu_dropout : public regularizer_layer { void bp_compute() override { if (this->m_model->get_execution_context().get_execution_mode() != execution_mode::training || m_keep_prob < 0.0f) { - El::Copy(get_prev_error_signals(), get_error_signals()); + El::Copy(this->get_prev_error_signals(), this->get_error_signals()); } else { - const auto& local_prev_error_signal = get_local_prev_error_signals(); - Mat& local_error_signal = get_local_error_signals(); + const auto& local_prev_error_signal = this->get_local_prev_error_signals(); + Mat& local_error_signal = this->get_local_error_signals(); Mat& local_mask = m_mask->Matrix(); const El::Int local_height = local_prev_error_signal.Height(); const El::Int local_width = local_prev_error_signal.Width(); @@ -165,27 +174,27 @@ class selu_dropout : public regularizer_layer { private: /** Alpha prime, the low-variance saturation point. */ - DataType m_alpha_prime; + TensorDataType m_alpha_prime; /** Affine scaling parameter to keep mean/variance at desired value. */ - DataType m_a; + TensorDataType m_a; /** Affine additive parameter to keep mean/variance at desired value. 
*/ - DataType m_b; + TensorDataType m_b; /** Probability of keeping each unit. */ - float m_keep_prob; + TensorDataType m_keep_prob; /** Current dropout mask (a scaled Bernoulli random matrix). */ - AbsDistMat *m_mask; + AbsDistMatrixType *m_mask; }; #ifndef LBANN_SELU_DROPOUT_LAYER_INSTANTIATE extern template class selu_dropout< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class selu_dropout< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class selu_dropout< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class selu_dropout< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_SELU_DROPOUT_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/bernoulli.hpp b/include/lbann/layers/transform/bernoulli.hpp index 9fda020695e..7a3370a8ed8 100644 --- a/include/lbann/layers/transform/bernoulli.hpp +++ b/include/lbann/layers/transform/bernoulli.hpp @@ -37,19 +37,20 @@ namespace lbann { * * During validation and testing, outputs are all zero. */ -template -class bernoulli_layer : public transform_layer { +class bernoulli_layer : public transform_layer { private: /** Probability of outputting 1. */ - DataType m_prob; + TensorDataType m_prob; public: bernoulli_layer(lbann_comm *comm, std::vector dims, - DataType prob = DataType(0.5)) - : transform_layer(comm), m_prob(prob) { - set_output_dims(dims); + TensorDataType prob = TensorDataType(0.5)) + : transform_layer(comm), m_prob(prob) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } bernoulli_layer* copy() const override { return new bernoulli_layer(*this); } @@ -58,7 +59,7 @@ class bernoulli_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Probability", m_prob); return desc; } @@ -66,7 +67,7 @@ class bernoulli_layer : public transform_layer { protected: void fp_compute() override { - auto& output = get_activations(); + auto& output = this->get_activations(); if (this->m_model->get_execution_context().get_execution_mode() == execution_mode::training) { bernoulli_fill(output, output.Height(), output.Width(), m_prob); } else { @@ -78,14 +79,14 @@ class bernoulli_layer : public transform_layer { #ifndef LBANN_BERNOULLI_LAYER_INSTANTIATE extern template class bernoulli_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class bernoulli_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class bernoulli_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class bernoulli_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_BERNOULLI_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/categorical_random.hpp b/include/lbann/layers/transform/categorical_random.hpp index 44e303530dc..49b1f04b9d4 100644 --- 
a/include/lbann/layers/transform/categorical_random.hpp +++ b/include/lbann/layers/transform/categorical_random.hpp @@ -41,9 +41,10 @@ namespace lbann { * * @todo Remove. */ -template -class categorical_random_layer : public transform_layer { +class categorical_random_layer : public transform_layer { static_assert(Dev == El::Device::CPU, "categorical random layer currently only supports CPU"); static_assert(T_layout == data_layout::DATA_PARALLEL, @@ -51,7 +52,7 @@ class categorical_random_layer : public transform_layer { "supports DATA_PARALLEL"); public: categorical_random_layer(lbann_comm *comm) - : transform_layer(comm) { + : transform_layer(comm) { } categorical_random_layer* copy() const override { return new categorical_random_layer(*this); } std::string get_type() const override { return "categorical random"; } @@ -63,9 +64,9 @@ class categorical_random_layer : public transform_layer { void fp_compute() override { // Input and output matrices - const auto& input = get_prev_activations(); + const auto& input = this->get_prev_activations(); const auto& local_input = input.LockedMatrix(); - auto& local_output = get_local_activations(); + auto& local_output = this->get_local_activations(); const auto& width = input.Width(); const auto& local_height = local_input.Height(); const auto& local_width = local_input.Width(); @@ -75,7 +76,7 @@ class categorical_random_layer : public transform_layer { El::Zero(local_output); StarVCMat rand_mat(input.Grid(), input.Root()); if (mode == execution_mode::training) { - uniform_fill(rand_mat, 1, width, DataType(0.5), DataType(0.5)); + uniform_fill(rand_mat, 1, width, TensorDataType(0.5), TensorDataType(0.5)); } // Process each mini-batch sample @@ -87,7 +88,7 @@ class categorical_random_layer : public transform_layer { if (mode == execution_mode::training) { // Choose first output with CDF above random number in (0,1) const auto& rand = rand_mat.GetLocal(0, col); - DataType cdf = DataType(0); + TensorDataType cdf = TensorDataType(0); for (El::Int row = 0; row < local_height; ++row) { cdf += local_input(row, col); if (rand < cdf) { @@ -103,7 +104,7 @@ class categorical_random_layer : public transform_layer { } // Output a one-hot vector - local_output(index, col) = DataType(1); + local_output(index, col) = TensorDataType(1); } @@ -113,7 +114,7 @@ class categorical_random_layer : public transform_layer { #ifndef LBANN_CATEGORICAL_RANDOM_LAYER_INSTANTIATE extern template class categorical_random_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #endif // LBANN_CATEGORICAL_RANDOM_LAYER_INSTANTIATE } // namespace lbann diff --git a/include/lbann/layers/transform/concatenation.hpp b/include/lbann/layers/transform/concatenation.hpp index 218877f9d52..e64fc18a855 100644 --- a/include/lbann/layers/transform/concatenation.hpp +++ b/include/lbann/layers/transform/concatenation.hpp @@ -34,18 +34,28 @@ namespace lbann { /** @brief Concatenate tensors along specified dimension. */ -template -class concatenation_layer : public transform_layer { +class concatenation_layer : public transform_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. 
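This "Public Types" block, repeated across several of the headers in this patch, is the heart of the refactor: each layer now names its matrix type per instantiation instead of leaning on one project-wide DataType typedef. A compressed sketch of the shape (std::vector standing in for El::AbstractDistMatrix; not LBANN code):

    #include <memory>
    #include <vector>

    template <typename TensorDataType>
    class layer_sketch {
    public:
      // was: a single global  using AbsDistMat = El::AbstractDistMatrix<DataType>;
      using AbsDistMatrixType = std::vector<TensorDataType>;

    private:
      std::unique_ptr<AbsDistMatrixType> m_input_v;   // typed views, as in the diff
      std::unique_ptr<AbsDistMatrixType> m_output_v;
    };

    int main() {
      layer_sketch<float>  fp32_layer;   // one template now serves every precision
      layer_sketch<double> fp64_layer;
      (void)fp32_layer; (void)fp64_layer;
    }

The payoff is that mixed-precision models become a matter of instantiating the same layer template at different element types.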
*/ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: concatenation_layer(lbann_comm *comm, El::Int concat_dim) - : transform_layer(comm), m_concat_dim(concat_dim) { + : transform_layer(comm), m_concat_dim(concat_dim) { this->m_expected_num_parent_layers = -1; // No limit on parents } concatenation_layer(const concatenation_layer& other) - : transform_layer(other), + : transform_layer(other), m_concat_dim(other.m_concat_dim), m_concat_points(other.m_concat_points) { m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); @@ -53,7 +63,7 @@ class concatenation_layer : public transform_layer { } concatenation_layer& operator=(const concatenation_layer& other) { - transform_layer::operator=(other); + transform_layer::operator=(other); m_concat_dim = other.m_concat_dim; m_concat_points = other.m_concat_points; m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); @@ -67,7 +77,7 @@ class concatenation_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Concatenation dimension", m_concat_dim); return desc; } @@ -75,31 +85,31 @@ class concatenation_layer : public transform_layer { protected: void setup_pointers() override { - transform_layer::setup_pointers(); - if (get_num_parents() < 1) { + transform_layer::setup_pointers(); + if (this->get_num_parents() < 1) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has no parents"; LBANN_ERROR(err.str()); } } void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& input = get_prev_activations(); + transform_layer::setup_matrices(grid); + const auto& input = this->get_prev_activations(); m_input_v.reset(input.Construct(input.Grid(), input.Root())); m_output_v.reset(input.Construct(input.Grid(), input.Root())); } void setup_dims() override { - transform_layer::setup_dims(); + transform_layer::setup_dims(); // Get concatenation points for first parent layer - auto output_dims = get_input_dims(0); + auto output_dims = this->get_input_dims(0); if (m_concat_dim < 0 || m_concat_dim >= (El::Int) output_dims.size()) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has " << output_dims.size() << " dimensions, " << "but attempted to concatenate along " << "dimension " << m_concat_dim; @@ -110,8 +120,8 @@ class concatenation_layer : public transform_layer { m_concat_points.push_back(output_dims[m_concat_dim]); // Get concatenation points for remaining parent layers - for (int i = 1; i < get_num_parents(); ++i) { - const auto& input_dims = get_input_dims(i); + for (int i = 1; i < this->get_num_parents(); ++i) { + const auto& input_dims = this->get_input_dims(i); if (input_dims.size() != output_dims.size() || !std::equal(input_dims.begin(), input_dims.begin() + m_concat_dim, @@ -120,7 +130,7 @@ class concatenation_layer : public transform_layer { input_dims.end(), output_dims.begin() + m_concat_dim + 1)) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects input tensors with dimensions "; for (size_t j = 0; j < output_dims.size(); ++j) { err << (j > 0 ? 
" x " : ""); @@ -131,7 +141,7 @@ class concatenation_layer : public transform_layer { } } err << ", but parent layer " - << "\"" << m_parent_layers[i]->get_name() << "\" " + << "\"" << this->get_parent_layers()[i]->get_name() << "\" " << "outputs with dimensions "; for (size_t j = 0; j < input_dims.size(); ++j) { err << (j > 0 ? " x " : "") << input_dims[j]; @@ -143,22 +153,22 @@ class concatenation_layer : public transform_layer { } // Update output dimensions - set_output_dims(output_dims); + this->set_output_dims(output_dims); } void fp_setup_outputs(El::Int mini_batch_size) override { - const auto& num_inputs = get_num_parents(); - const auto& output_dims = get_output_dims(); + const auto& num_inputs = this->get_num_parents(); + const auto& output_dims = this->get_output_dims(); // Initialize output tensor - auto& output = get_activations(); + auto& output = this->get_activations(); output.Empty(false); if (num_inputs > 1) { - output.AlignWith(get_prev_activations()); - output.Resize(get_output_size(), mini_batch_size); + output.AlignWith(this->get_prev_activations()); + output.Resize(this->get_output_size(), mini_batch_size); } else { - El::LockedView(output, get_prev_activations()); + El::LockedView(output, this->get_prev_activations()); return; } @@ -179,8 +189,8 @@ class concatenation_layer : public transform_layer { // Populate slices of output tensor with input tensors for (int i = 0; i < num_inputs; ++i) { - const auto& input_dims = get_input_dims(i); - auto& input = get_prev_activations(i); + const auto& input_dims = this->get_input_dims(i); + auto& input = this->get_prev_activations(i); // Divide input tensor into unit slices const auto& input_num_unit_slices = input_dims[m_concat_dim]; @@ -208,8 +218,8 @@ class concatenation_layer : public transform_layer { } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - const auto& num_inputs = get_num_parents(); - const auto& output_dims = get_output_dims(); + const auto& num_inputs = this->get_num_parents(); + const auto& output_dims = this->get_output_dims(); // Divide output tensor into unit slices along concat dimension // Note: Each unit slice is divided into contiguous "unit blocks" @@ -227,11 +237,11 @@ class concatenation_layer : public transform_layer { * unit_block_size); // Populate gradient w.r.t. input tensors - const auto& gradient_wrt_output = get_prev_error_signals(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); for (int i = 0; i < num_inputs; ++i) { - const auto& input_dims = get_input_dims(i); - const auto& input_size = get_input_size(i); - auto& gradient_wrt_input = get_error_signals(i); + const auto& input_dims = this->get_input_dims(i); + const auto& input_size = this->get_input_size(i); + auto& gradient_wrt_input = this->get_error_signals(i); // Divide input tensor into unit slices const auto& input_num_unit_slices = input_dims[m_concat_dim]; @@ -280,22 +290,22 @@ class concatenation_layer : public transform_layer { std::vector m_concat_points; /** View into input tensor. */ - std::unique_ptr m_input_v; + std::unique_ptr m_input_v; /** View into output tensor. 
*/ - std::unique_ptr m_output_v; + std::unique_ptr m_output_v; }; #ifndef LBANN_CONCATENATION_LAYER_INSTANTIATE extern template class concatenation_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class concatenation_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class concatenation_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class concatenation_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_CONCATENATION_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/constant.hpp b/include/lbann/layers/transform/constant.hpp index c6e378851cd..eb05038e8cd 100644 --- a/include/lbann/layers/transform/constant.hpp +++ b/include/lbann/layers/transform/constant.hpp @@ -32,16 +32,17 @@ namespace lbann { /** @brief Constant output. */ -template -class constant_layer : public transform_layer { +class constant_layer : public transform_layer { public: constant_layer(lbann_comm *comm, - DataType value, + TensorDataType value, std::vector dims) - : transform_layer(comm), m_value(value) { - set_output_dims(dims); + : transform_layer(comm), m_value(value) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } @@ -51,7 +52,7 @@ class constant_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Value", m_value); return desc; } @@ -60,29 +61,29 @@ class constant_layer : public transform_layer { void fp_compute() override { if (m_value == EvalType(0)) { - El::Zero(get_activations()); + El::Zero(this->get_activations()); } else { - El::Fill(get_activations(), m_value); + El::Fill(this->get_activations(), m_value); } } private: /** Constant value. */ - DataType m_value; + TensorDataType m_value; }; #ifndef LBANN_CONSTANT_LAYER_INSTANTIATE extern template class constant_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class constant_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class constant_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class constant_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_CONSTANT_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/crop.hpp b/include/lbann/layers/transform/crop.hpp index 30fbc0e810e..b7eb86e698d 100644 --- a/include/lbann/layers/transform/crop.hpp +++ b/include/lbann/layers/transform/crop.hpp @@ -40,22 +40,32 @@ namespace lbann { * to the red-top-left corner and (1,1,1) to the blue-bottom-right * corner. The crop size is determined at setup. 
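To make the (0,0,0)-to-(1,1,1) position convention above concrete, here is a hypothetical offset computation for a single dimension. The rounding rule is illustrative only; the layer's actual arithmetic lives in the fp_compute paths below:

    #include <cstdio>

    int main() {
      const int input_dim = 10, output_dim = 4;
      const int max_offset = input_dim - output_dim;   // valid offsets: [0, 6]
      for (double pos : {0.0, 0.5, 1.0}) {
        // pos = 0 hugs the top/left edge, pos = 1 the bottom/right edge
        const int offset = static_cast<int>(pos * max_offset + 0.5);
        std::printf("pos %.1f -> crop rows [%d, %d)\n",
                    pos, offset, offset + output_dim);
      }
    }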
*/ -template -class crop_layer : public transform_layer { +class crop_layer : public transform_layer { static_assert(T_layout == data_layout::DATA_PARALLEL, "crop layer only supports DATA_PARALLEL"); +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: crop_layer(lbann_comm *comm, std::vector dims) - : transform_layer(comm) { - set_output_dims(dims); + : transform_layer(comm) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 2; } crop_layer(const crop_layer& other) - : transform_layer(other), + : transform_layer(other), m_input_v(other.m_input_v ? other.m_input_v->Copy() : nullptr), m_output_v(other.m_output_v ? @@ -63,7 +73,7 @@ class crop_layer : public transform_layer { m_crop_pos_v(other.m_crop_pos_v ? other.m_crop_pos_v->Copy() : nullptr){} crop_layer& operator=(const crop_layer& other) { - transform_layer::operator=(other); + transform_layer::operator=(other); m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); m_output_v.reset(other.m_output_v ? @@ -79,14 +89,14 @@ class crop_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& input = get_prev_activations(); + transform_layer::setup_matrices(grid); + const auto& input = this->get_prev_activations(); const auto& dist = input.DistData(); m_input_v.reset(input.Construct(input.Grid(), input.Root())); m_output_v.reset(input.Construct(input.Grid(), input.Root())); /// @todo Setup the input tensor with this data distribution - m_crop_pos_v.reset(AbsDistMat::Instantiate(*dist.grid, + m_crop_pos_v.reset(AbsDistMatrixType::Instantiate(*dist.grid, dist.root, El::STAR, dist.rowDist, @@ -97,29 +107,29 @@ class crop_layer : public transform_layer { } void setup_dims() override { - transform_layer::setup_dims(); + transform_layer::setup_dims(); std::stringstream err; // Make sure input tensors have valid dimensions - const auto& input_dims = get_input_dims(0); - const auto& loc_dims = get_input_dims(1); - const auto& output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(0); + const auto& loc_dims = this->get_input_dims(1); + const auto& output_dims = this->get_output_dims(); if (input_dims.size() != output_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects a crop input tensor with " << output_dims.size() << " dimensions, " << "but parent layer " - << "\"" << m_parent_layers[0]->get_name() << "\" " + << "\"" << this->get_parent_layers()[0]->get_name() << "\" " << "outputs a tensor with " << input_dims.size() << " dimensions"; LBANN_ERROR(err.str()); } if (loc_dims.size() != 1 || loc_dims[0] != (int) input_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects a 1D crop position tensor with " << output_dims.size() << " entries, " << "but parent layer " - << "\"" << m_parent_layers[1]->get_name() << "\" " + << "\"" << this->get_parent_layers()[1]->get_name() << "\" " << "outputs a tensor with dimensions "; for (size_t i = 0; i < loc_dims.size(); ++i) { err << (i > 0 ? 
" x " : "") << loc_dims[i]; @@ -132,14 +142,14 @@ class crop_layer : public transform_layer { protected: void fp_compute() override { - switch (get_input_dims().size()) { + switch (this->get_input_dims().size()) { case 3: fp_compute_3d(); break; default: fp_compute_nd(); } } void bp_compute() override { - switch (get_input_dims().size()) { + switch (this->get_input_dims().size()) { case 3: bp_compute_3d(); break; default: bp_compute_nd(); } @@ -147,22 +157,22 @@ class crop_layer : public transform_layer { private: /** View into input tensor. */ - std::unique_ptr m_input_v; + std::unique_ptr m_input_v; /** View into output tensor. */ - std::unique_ptr m_output_v; + std::unique_ptr m_output_v; /** View into crop positions. */ - std::unique_ptr m_crop_pos_v; + std::unique_ptr m_crop_pos_v; /** Forward prop implementation for n-dimensional tensors. */ void fp_compute_nd() { // Input and output tensors - const auto& input = get_prev_activations(0); - auto& output = get_activations(); + const auto& input = this->get_prev_activations(0); + auto& output = this->get_activations(); // Tensor dimensions - const auto& input_dims = get_input_dims(0); - const auto& output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(0); + const auto& output_dims = this->get_output_dims(); const El::Int num_dims = output_dims.size(); const auto& local_width = input.LocalWidth(); const auto& region_size = output_dims.back(); @@ -170,7 +180,7 @@ class crop_layer : public transform_layer { // Get crop position m_crop_pos_v->Empty(false); m_crop_pos_v->AlignWith(input); - const auto& input1 = get_prev_activations(1); + const auto& input1 = this->get_prev_activations(1); if (m_crop_pos_v->DistData() == input1.DistData()) { El::LockedView(*m_crop_pos_v, input1); } else { @@ -187,7 +197,7 @@ class crop_layer : public transform_layer { std::vector crop_offsets; for (El::Int d = 0; d < num_dims; ++d) { const auto& pos = local_crop_pos(d, local_col); - if (pos < DataType(0) || pos > DataType(1)) { + if (pos < TensorDataType(0) || pos > TensorDataType(1)) { std::stringstream err; err << "crop position not in range [0,1] (pos=("; for (El::Int i = 0; i < local_crop_pos.Height(); ++i) { @@ -242,17 +252,17 @@ class crop_layer : public transform_layer { void bp_compute_nd() { // Clear error signals - El::Zero(get_error_signals(0)); - El::Zero(get_error_signals(1)); + El::Zero(this->get_error_signals(0)); + El::Zero(this->get_error_signals(1)); // Input and gradient tensors - const auto& gradient_wrt_output = get_prev_error_signals(); - auto& gradient_wrt_input = get_error_signals(0); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(0); const auto& local_crop_pos = m_crop_pos_v->LockedMatrix(); // Tensor dimensions - const auto& input_dims = get_input_dims(0); - const auto& output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(0); + const auto& output_dims = this->get_output_dims(); const El::Int num_dims = output_dims.size(); const auto& local_width = gradient_wrt_input.LocalWidth(); const auto& region_size = output_dims.back(); @@ -266,7 +276,7 @@ class crop_layer : public transform_layer { std::vector crop_offsets; for (El::Int d = 0; d < num_dims; ++d) { const auto& pos = local_crop_pos(d, local_col); - if (pos < DataType(0) || pos > DataType(1)) { + if (pos < TensorDataType(0) || pos > TensorDataType(1)) { std::stringstream err; err << "crop position not in range [0,1] (pos=("; for (El::Int i = 0; i < 
local_crop_pos.Height(); ++i) { @@ -329,9 +339,9 @@ class crop_layer : public transform_layer { }; #ifndef LBANN_CROP_LAYER_INSTANTIATE -extern template class crop_layer; +extern template class crop_layer; #ifdef LBANN_HAS_GPU -extern template class crop_layer; +extern template class crop_layer; #endif // LBANN_HAS_GPU #endif // LBANN_CROP_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/discrete_random.hpp b/include/lbann/layers/transform/discrete_random.hpp index 853c5880822..5e62dc6c6f2 100644 --- a/include/lbann/layers/transform/discrete_random.hpp +++ b/include/lbann/layers/transform/discrete_random.hpp @@ -40,9 +40,10 @@ namespace lbann { * * @todo Remove. */ -template -class discrete_random_layer : public transform_layer { +class discrete_random_layer : public transform_layer { static_assert(Dev == El::Device::CPU, "discrete random layer currently only supports CPU"); static_assert(T_layout == data_layout::DATA_PARALLEL, @@ -56,9 +57,9 @@ class discrete_random_layer : public transform_layer { discrete_random_layer(lbann_comm *comm, std::vector values, std::vector dims) - : transform_layer(comm), + : transform_layer(comm), m_values(values) { - set_output_dims(dims); + this->set_output_dims(dims); } discrete_random_layer* copy() const override { return new discrete_random_layer(*this); } std::string get_type() const override { return "discrete random"; } @@ -68,8 +69,8 @@ class discrete_random_layer : public transform_layer { protected: void setup_dims() override { - transform_layer::setup_dims(); - if (get_input_size() != (int) m_values.size()) { + transform_layer::setup_dims(); + if (this->get_input_size() != (int) m_values.size()) { LBANN_ERROR("input tensor dimensions don't match number of " "values in discrete distribution"); } @@ -78,9 +79,9 @@ class discrete_random_layer : public transform_layer { void fp_compute() override { // Input and output matrices - const auto& input = get_prev_activations(); + const auto& input = this->get_prev_activations(); const auto& local_input = input.LockedMatrix(); - auto& output = get_activations(); + auto& output = this->get_activations(); auto& local_output = output.Matrix(); const int num_values = m_values.size(); const auto& num_outputs = local_output.Height(); @@ -90,7 +91,7 @@ class discrete_random_layer : public transform_layer { // Initialize random numbers const auto& mode = this->m_model->get_execution_context().get_execution_mode(); if (mode == execution_mode::training) { - uniform_fill(output, 1, width, DataType(0.5), DataType(0.5)); + uniform_fill(output, 1, width, TensorDataType(0.5), TensorDataType(0.5)); } // Process each mini-batch sample @@ -123,7 +124,7 @@ class discrete_random_layer : public transform_layer { #ifndef LBANN_DISCRETE_RANDOM_LAYER_INSTANTIATE extern template class discrete_random_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #endif // LBANN_DISCRETE_RANDOM_LAYER_INSTANTIATE } // namespace lbann diff --git a/include/lbann/layers/transform/dummy.hpp b/include/lbann/layers/transform/dummy.hpp index 053a4385b02..e0125318436 100644 --- a/include/lbann/layers/transform/dummy.hpp +++ b/include/lbann/layers/transform/dummy.hpp @@ -36,11 +36,12 @@ namespace lbann { * Does no computation and is primarily intended as a placeholder for * unused layer outputs. 
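Stepping back to the two random layers above: both draw one uniform sample per mini-batch column (the uniform_fill centered at 0.5 with radius 0.5) and then invert the cumulative distribution to pick an output. The pattern in isolation, with an assumed toy distribution (not LBANN code):

    #include <cstdio>
    #include <random>
    #include <vector>

    int main() {
      const std::vector<double> probs  = {0.2, 0.5, 0.3};  // assumed distribution
      const std::vector<double> values = {-1.0, 0.0, 1.0};
      std::mt19937 gen(1234);
      std::uniform_real_distribution<double> unif(0.0, 1.0);
      for (int sample = 0; sample < 4; ++sample) {
        const double u = unif(gen);
        std::size_t index = probs.size() - 1;  // fallback guards round-off
        double cdf = 0.0;
        for (std::size_t i = 0; i < probs.size(); ++i) {
          cdf += probs[i];
          if (u < cdf) { index = i; break; }   // first value whose CDF exceeds u
        }
        std::printf("u = %.3f -> value %g\n", u, values[index]);
      }
    }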
 */
-template <data_layout T_layout, El::Device Dev>
-class dummy_layer : public transform_layer {
+template <typename TensorDataType,
+          data_layout T_layout, El::Device Dev>
+class dummy_layer : public transform_layer<TensorDataType> {
 public:
-  dummy_layer(lbann_comm *comm) : transform_layer(comm) {
+  dummy_layer(lbann_comm *comm) : transform_layer<TensorDataType>(comm) {
     this->m_expected_num_child_layers = 0;
   }
   dummy_layer* copy() const override { return new dummy_layer(*this); }
@@ -53,14 +54,14 @@ class dummy_layer : public transform_layer {

 #ifndef LBANN_DUMMY_LAYER_INSTANTIATE
 extern template class dummy_layer<
-  data_layout::DATA_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::CPU>;
 extern template class dummy_layer<
-  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>;
 #ifdef LBANN_HAS_GPU
 extern template class dummy_layer<
-  data_layout::DATA_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::GPU>;
 extern template class dummy_layer<
-  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>;
 #endif // LBANN_HAS_GPU
 #endif // LBANN_DUMMY_LAYER_INSTANTIATE
diff --git a/include/lbann/layers/transform/evaluation.hpp b/include/lbann/layers/transform/evaluation.hpp
index 0e237f7911f..c8143d64fc2 100644
--- a/include/lbann/layers/transform/evaluation.hpp
+++ b/include/lbann/layers/transform/evaluation.hpp
@@ -32,7 +32,11 @@ namespace lbann {

 /** @brief Interface with objective function and metrics. */
-class abstract_evaluation_layer : public transform_layer {
+template <typename TensorDataType>
+class abstract_evaluation_layer : public transform_layer<TensorDataType> {
+public:
+  using CPUMatType = El::Matrix<TensorDataType, El::Device::CPU>;
+
 public:
   /** Get scaling factor. */
@@ -63,7 +67,7 @@ class abstract_evaluation_layer : public transform_layer {
   /** Evaluated value.
    *  The value may be stored in pinned memory.
    */
-  CPUMat m_value;
+  CPUMatType m_value;
   /** Non-blocking allreduce request. */
   Al::request m_allreduce_req;
 #ifdef LBANN_HAS_GPU
@@ -77,11 +81,12 @@
  * Computes the average value across a mini-batch. If the input
  * tensor has multiple neurons, their values are added together.
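In matrix terms, with one column per mini-batch sample as elsewhere in LBANN, that doc comment amounts to: sum the entries within each column, then average the sums across columns. A toy check of the arithmetic:

    #include <cstdio>
    #include <vector>

    int main() {
      const int height = 3, width = 2;             // 3 neurons, 2 samples
      const std::vector<double> acts = {1, 2, 3,   // sample 0 -> sum 6
                                        4, 5, 6};  // sample 1 -> sum 15
      double total = 0.0;
      for (int col = 0; col < width; ++col)
        for (int row = 0; row < height; ++row)
          total += acts[col * height + row];       // column-major storage
      std::printf("evaluated value = %g\n", total / width);  // (6 + 15) / 2 = 10.5
    }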
*/ -template -class evaluation_layer : public abstract_evaluation_layer { +class evaluation_layer : public abstract_evaluation_layer { public: - evaluation_layer(lbann_comm *comm) : abstract_evaluation_layer(comm) {} + evaluation_layer(lbann_comm *comm) : abstract_evaluation_layer(comm) {} evaluation_layer* copy() const override { return new evaluation_layer(*this); } std::string get_type() const override { return "evaluation"; } data_layout get_data_layout() const override { return T_layout; } @@ -89,15 +94,17 @@ class evaluation_layer : public abstract_evaluation_layer { }; #ifndef LBANN_EVALUATION_LAYER_INSTANTIATE +extern template class abstract_evaluation_layer; + extern template class evaluation_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class evaluation_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class evaluation_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class evaluation_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_EVALUATION_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/gaussian.hpp b/include/lbann/layers/transform/gaussian.hpp index 806af947448..b99f40502a7 100644 --- a/include/lbann/layers/transform/gaussian.hpp +++ b/include/lbann/layers/transform/gaussian.hpp @@ -38,22 +38,23 @@ namespace lbann { * During validation and testing, outputs are all equal to the * distribution mean. */ -template -class gaussian_layer : public transform_layer { +class gaussian_layer : public transform_layer { private: /** Gaussian distribution mean. */ - DataType m_mean; + TensorDataType m_mean; /** Gaussian distribution standard deviation. 
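The LBANN_..._INSTANTIATE guards above follow the standard extern-template discipline: headers declare the common instantiations, exactly one translation unit defines them, and every other includer skips re-instantiating the class, which keeps compile times and object sizes down. Both halves shown in one file for brevity (a sketch, not the LBANN build layout):

    #include <iostream>

    template <typename T>
    struct scaler {
      T twice(T x) const { return x + x; }
    };

    // Header half: promise that an explicit instantiation exists elsewhere.
    extern template struct scaler<float>;

    // Source half: the one explicit instantiation definition.
    template struct scaler<float>;

    int main() { std::cout << scaler<float>{}.twice(21.0f) << '\n'; }  // prints 42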
*/ - DataType m_stdev; + TensorDataType m_stdev; public: gaussian_layer(lbann_comm *comm, const std::vector& dims, - DataType mean = DataType(0), - DataType stdev = DataType(1)) - : transform_layer(comm), m_mean(mean), m_stdev(stdev) { - set_output_dims(dims); + TensorDataType mean = TensorDataType(0), + TensorDataType stdev = TensorDataType(1)) + : transform_layer(comm), m_mean(mean), m_stdev(stdev) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } gaussian_layer* copy() const override { return new gaussian_layer(*this); } @@ -62,7 +63,7 @@ class gaussian_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Mean", m_mean); desc.add("Standard deviation", m_stdev); return desc; @@ -71,7 +72,7 @@ class gaussian_layer : public transform_layer { protected: void fp_compute() override { - auto& output = get_activations(); + auto& output = this->get_activations(); if (this->m_model->get_execution_context().get_execution_mode() == execution_mode::training) { gaussian_fill(output, output.Height(), output.Width(), m_mean, m_stdev); } else { @@ -83,14 +84,14 @@ class gaussian_layer : public transform_layer { #ifndef LBANN_GAUSSIAN_LAYER_INSTANTIATE extern template class gaussian_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class gaussian_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class gaussian_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class gaussian_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_GAUSSIAN_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/hadamard.hpp b/include/lbann/layers/transform/hadamard.hpp index a5db4686bc1..84f0b6b3f7f 100644 --- a/include/lbann/layers/transform/hadamard.hpp +++ b/include/lbann/layers/transform/hadamard.hpp @@ -34,12 +34,13 @@ namespace lbann { /** @brief Entry-wise tensor product. 
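For the product y = x1 .* x2 .* ... .* xn that this layer computes, the backward pass implemented below multiplies the incoming gradient by every input except the one being differentiated: dL/dx_i = dL/dy .* prod over j != i of x_j. A two-element check (toy values):

    #include <cstdio>
    #include <vector>

    int main() {
      const std::vector<double> x1 = {1, 2}, x2 = {3, 4}, x3 = {5, 6};
      const std::vector<double> dy = {0.5, -1.0};    // incoming gradient dL/dy
      for (std::size_t k = 0; k < x2.size(); ++k) {
        const double y   = x1[k] * x2[k] * x3[k];    // forward: 15, 48
        const double dx2 = dy[k] * x1[k] * x3[k];    // dL/dx2 = dy .* x1 .* x3
        std::printf("y[%zu] = %g  dL/dx2[%zu] = %g\n", k, y, k, dx2);
      }
    }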
*/ -template -class hadamard_layer : public transform_layer { +class hadamard_layer : public transform_layer { public: - hadamard_layer(lbann_comm *comm) : transform_layer(comm) { + hadamard_layer(lbann_comm *comm) : transform_layer(comm) { this->m_expected_num_parent_layers = -1; // No limit on parents } @@ -51,29 +52,29 @@ class hadamard_layer : public transform_layer { protected: void setup_pointers() override { - transform_layer::setup_pointers(); - if (get_num_parents() < 1) { + transform_layer::setup_pointers(); + if (this->get_num_parents() < 1) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has no parent layers"; LBANN_ERROR(err.str()); } } void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + transform_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); // Check that input dimensions match - const auto& output_dims = get_output_dims(); - for (int i = 0; i < get_num_parents(); ++i) { - if (get_input_dims(i) != output_dims) { - const auto& parents = get_parent_layers(); + const auto& output_dims = this->get_output_dims(); + for (int i = 0; i < this->get_num_parents(); ++i) { + if (this->get_input_dims(i) != output_dims) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with incompatible dimensions ("; - for (int j = 0; j < get_num_parents(); ++j) { - const auto& dims = get_input_dims(j); + for (int j = 0; j < this->get_num_parents(); ++j) { + const auto& dims = this->get_input_dims(j); err << (j > 0 ? ", " : "") << "layer \"" << parents[j]->get_name() << "\" outputs "; for (size_t k = 0; k < dims.size(); ++k) { @@ -88,35 +89,35 @@ class hadamard_layer : public transform_layer { } void fp_compute() override { - auto& output = get_activations(); - switch (get_num_parents()) { - case 0: El::Fill(output, DataType(1)); break; - case 1: El::LockedView(output, get_prev_activations()); break; + auto& output = this->get_activations(); + switch (this->get_num_parents()) { + case 0: El::Fill(output, TensorDataType(1)); break; + case 1: El::LockedView(output, this->get_prev_activations()); break; default: - El::Hadamard(get_prev_activations(0), - get_prev_activations(1), + El::Hadamard(this->get_prev_activations(0), + this->get_prev_activations(1), output); - for (int i = 2; i < get_num_parents(); ++i) { - El::Hadamard(get_prev_activations(i), output, output); + for (int i = 2; i < this->get_num_parents(); ++i) { + El::Hadamard(this->get_prev_activations(i), output, output); } } } void bp_compute() override { - const int num_parents = get_num_parents(); - const auto& gradient_wrt_output = get_prev_error_signals(); + const int num_parents = this->get_num_parents(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); switch (num_parents) { case 0: break; case 1: - El::LockedView(get_error_signals(), gradient_wrt_output); + El::LockedView(this->get_error_signals(), gradient_wrt_output); break; default: for (int i = 0; i < num_parents; ++i) { - auto& gradient_wrt_input = get_error_signals(i); + auto& gradient_wrt_input = this->get_error_signals(i); El::Copy(gradient_wrt_output, gradient_wrt_input); for (int j = 0; j < num_parents; ++j) { if (i != j) { - El::Hadamard(get_prev_activations(j), + El::Hadamard(this->get_prev_activations(j), 
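// The accumulation completed here is the product rule for an
// entry-wise product: if y = x1 * x2 * ... * xn entry-wise, then
// dL/dxi = dL/dy * prod_{j != i} xj. A minimal scalar sketch of the
// same loop, assuming plain arrays rather than the Hydrogen API
// (names are illustrative):
//
//   // grad_in[i] starts as a copy of grad_out, then multiplies in
//   // every input except the i-th, one entry at a time
//   for (int j = 0; j < num_inputs; ++j) {
//     if (j == i) continue;
//     for (int k = 0; k < size; ++k) grad_in[i][k] *= x[j][k];
//   }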
gradient_wrt_input, gradient_wrt_input); } @@ -129,14 +130,14 @@ class hadamard_layer : public transform_layer { #ifndef LBANN_HADAMARD_LAYER_INSTANTIATE extern template class hadamard_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class hadamard_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class hadamard_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class hadamard_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_HADAMARD_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/in_top_k.hpp b/include/lbann/layers/transform/in_top_k.hpp index 137a1153b7f..2dd0de6126c 100644 --- a/include/lbann/layers/transform/in_top_k.hpp +++ b/include/lbann/layers/transform/in_top_k.hpp @@ -38,13 +38,14 @@ namespace lbann { * one and the rest to zero. Ties are broken in favor of entries with * smaller indices. */ -template -class in_top_k_layer : public transform_layer { +class in_top_k_layer : public transform_layer { public: in_top_k_layer(lbann_comm *comm, El::Int k) - : transform_layer(comm), m_k(k) { + : transform_layer(comm), m_k(k) { if (m_k < 0) { std::stringstream err; err << "invalid parameter for top-k search (k=" << m_k << ")"; @@ -58,7 +59,7 @@ class in_top_k_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("k", m_k); return desc; } @@ -66,8 +67,8 @@ class in_top_k_layer : public transform_layer { protected: void setup_dims() override { - Layer::setup_dims(); - set_output_dims(get_input_dims()); + data_type_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void fp_compute() override; @@ -81,14 +82,14 @@ class in_top_k_layer : public transform_layer { #ifndef LBANN_IN_TOP_K_LAYER_INSTANTIATE extern template class in_top_k_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class in_top_k_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class in_top_k_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class in_top_k_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_IN_TOP_K_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/pooling.hpp b/include/lbann/layers/transform/pooling.hpp index 5dd2fb98e81..9a84d48c029 100644 --- a/include/lbann/layers/transform/pooling.hpp +++ b/include/lbann/layers/transform/pooling.hpp @@ -37,12 +37,13 @@ namespace lbann { // Forward declaration -template +template class unpooling_layer; -template -class pooling_layer : public transform_layer { +class pooling_layer : public transform_layer { static_assert(T_layout == data_layout::DATA_PARALLEL, "pooling only supports DATA_PARALLEL"); private: @@ -70,10 +71,10 @@ class pooling_layer : public transform_layer { /** Pooling descriptor. 
*/ cudnnPoolingDescriptor_t m_pooling_cudnn_desc; /** Tensor cuDNN descriptors. */ - cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; + cudnn::data_parallel_layer_tensor_manager m_tensors_cudnn_desc; #endif // LBANN_HAS_CUDNN - friend class unpooling_layer; + friend class unpooling_layer; public: @@ -96,7 +97,7 @@ class pooling_layer : public transform_layer { std::vector pads, std::vector strides, pool_mode mode) - : transform_layer(comm), + : transform_layer(comm), m_pool_mode(mode), m_pool_dims(pool_dims), m_pads(pads), @@ -115,7 +116,7 @@ class pooling_layer : public transform_layer { } pooling_layer(const pooling_layer& other) - : transform_layer(other), + : transform_layer(other), m_pool_mode(other.m_pool_mode), m_pool_dims(other.m_pool_dims), m_pool_size(other.m_pool_size), @@ -134,7 +135,7 @@ class pooling_layer : public transform_layer { } pooling_layer& operator=(const pooling_layer& other){ - transform_layer::operator=(other); + transform_layer::operator=(other); m_pool_mode = other.m_pool_mode; m_pool_dims = other.m_pool_dims; m_pool_size = other.m_pool_size; @@ -163,7 +164,7 @@ class pooling_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::stringstream ss; // Pool mode @@ -211,20 +212,20 @@ class pooling_layer : public transform_layer { protected: void setup_dims() override { - transform_layer::setup_dims(); - const auto& input_dims = get_input_dims(); + transform_layer::setup_dims(); + const auto& input_dims = this->get_input_dims(); auto output_dims = input_dims; for(size_t i = 0; i < output_dims.size() - 1; ++i) { const int effective_dim = (input_dims[i+1] + 2 * m_pads[i] - m_pool_dims[i] + 1); output_dims[i+1] = (effective_dim + m_strides[i] - 1) / m_strides[i]; } - set_output_dims(output_dims); + this->set_output_dims(output_dims); } /// Initialize GPU objects void setup_gpu() override { - transform_layer::setup_gpu(); + transform_layer::setup_gpu(); #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else @@ -283,11 +284,11 @@ class pooling_layer : public transform_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); if (local_input.Height() > 0 && local_input.Width() > 0) { - const DataType zero = DataType(0); - const DataType one = DataType(1); + const TensorDataType zero = TensorDataType(0); + const TensorDataType one = TensorDataType(1); CHECK_CUDNN(cudnnPoolingForward(cudnn::get_handle(), m_pooling_cudnn_desc, &one, @@ -305,15 +306,15 @@ class pooling_layer : public transform_layer { #ifndef LBANN_HAS_CUDNN LBANN_ERROR("cuDNN not detected"); #else - const auto& local_input = get_local_prev_activations(); - const auto& local_output = get_local_activations(); - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_input = this->get_local_prev_activations(); + const auto& local_output = this->get_local_activations(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); if (local_input.Height() > 0 && 
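// setup_dims() above sizes each pooled dimension with integer ceiling
// division: out = ceil((in + 2*pad - pool + 1) / stride). A standalone
// restatement with illustrative numbers (not taken from the patch):
//
//   constexpr int pooled_size(int in, int pad, int pool, int stride) {
//     return (in + 2 * pad - pool + 1 + stride - 1) / stride;
//   }
//   static_assert(pooled_size(32, 0, 2, 2) == 16, "2x2 pool, stride 2");
//   static_assert(pooled_size(28, 1, 3, 1) == 28, "3x3 pool, stride 1, pad 1");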
local_input.Width() > 0) {
 
       // Useful constants
-      const DataType one = DataType(1);
-      const DataType zero = DataType(0);
+      const TensorDataType one = TensorDataType(1);
+      const TensorDataType zero = TensorDataType(0);
 
       // Perform backprop on GPU
       CHECK_CUDNN(cudnnPoolingBackward(cudnn::get_handle(),
@@ -340,18 +341,18 @@
     }
 
     // Local matrices
-    const auto& local_input = get_local_prev_activations();
-    auto& local_output = get_local_activations();
+    const auto& local_input = this->get_local_prev_activations();
+    auto& local_output = this->get_local_activations();
 
     // Pool parameters
     const int local_width = local_input.Width();
-    const auto& input_dims = get_input_dims();
+    const auto& input_dims = this->get_input_dims();
     const int num_channels = input_dims[0];
-    const int num_per_output_channel = get_output_size() / num_channels;
+    const int num_per_output_channel = this->get_output_size() / num_channels;
 
     // Initialize max pool indices if needed
     if(m_pool_mode == pool_mode::max) {
-      m_max_pool_indices.assign(get_output_size() * local_width, 0);
+      m_max_pool_indices.assign(this->get_output_size() * local_width, 0);
     }
 
     // Initialize matrices
@@ -375,16 +376,16 @@
 
       if(m_pool_mode == pool_mode::max) {
         // Apply max pooling
-        DataType *output_buffer = local_output.Buffer(0, sample);
-        int *indices_buffer = &m_max_pool_indices[sample * get_output_size()];
+        TensorDataType *output_buffer = local_output.Buffer(0, sample);
+        int *indices_buffer = &m_max_pool_indices[sample * this->get_output_size()];
         LBANN_OMP_PARALLEL_FOR
         for(int channel = 0; channel < num_channels; ++channel) {
           for(int j = 0; j < num_per_output_channel; ++j) {
-            DataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j);
-            DataType max_entry = im2col_buffer[0];
+            TensorDataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j);
+            TensorDataType max_entry = im2col_buffer[0];
             int max_index = 0;
             for(int i = 1; i < m_pool_size; ++i) {
-              const DataType current_entry = im2col_buffer[i];
+              const TensorDataType current_entry = im2col_buffer[i];
               if(current_entry > max_entry) {
                 max_entry = current_entry;
                 max_index = i;
@@ -399,13 +400,13 @@
 
       if(m_pool_mode == pool_mode::average) {
         // Apply average pooling
-        DataType *output_buffer = local_output.Buffer(0, sample);
+        TensorDataType *output_buffer = local_output.Buffer(0, sample);
         LBANN_OMP_PARALLEL_FOR
         for(int channel = 0; channel < num_channels; ++channel) {
           for(int j = 0; j < num_per_output_channel; ++j) {
-            const DataType *im2col_buffer
+            const TensorDataType *im2col_buffer
               = im2col_mat.LockedBuffer(channel*m_pool_size, j);
-            DataType output_entry = 0;
+            TensorDataType output_entry = 0;
             for(int i = 0; i < m_pool_size; ++i) {
               output_entry += im2col_buffer[i];
             }
@@ -422,23 +423,24 @@
 
   /// Pooling backward propagation with im2col
   void bp_compute_im2col() {
+    using CPUMatType = El::Matrix<TensorDataType, El::Device::CPU>;
     if(m_pool_mode != pool_mode::max && m_pool_mode != pool_mode::average) {
       LBANN_ERROR("CPU pooling layer only supports max and average pooling");
     }
 
     // Local matrices
-    const auto& local_gradient_wrt_output = get_local_prev_error_signals();
-    auto& local_gradient_wrt_input = get_local_error_signals();
+    const auto& local_gradient_wrt_output = this->get_local_prev_error_signals();
+    auto& local_gradient_wrt_input = this->get_local_error_signals();
 
     // Pool parameters
     const int local_width =
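// The m_max_pool_indices buffer used in this function records, for
// every output entry, which position inside the pooling window won the
// forward max, so backprop can route the gradient to exactly that
// entry. The forward bookkeeping in miniature (illustrative names):
//
//   float max_entry = window[0];
//   int max_index = 0;
//   for (int i = 1; i < pool_size; ++i) {
//     if (window[i] > max_entry) {  // strict '>' keeps the smaller
//       max_entry = window[i];      // index on ties, matching
//       max_index = i;              // fp_compute_im2col above
//     }
//   }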
local_gradient_wrt_output.Width(); - const auto& input_dims = get_input_dims(); + const auto& input_dims = this->get_input_dims(); const int num_channels = input_dims[0]; - const int num_per_input_channel = get_output_size() / num_channels; + const int num_per_input_channel = this->get_output_size() / num_channels; // Initialize matrices - CPUMat im2col_mat(m_pool_size * num_channels, num_per_input_channel); - CPUMat gradient_wrt_input_col; + CPUMatType im2col_mat(m_pool_size * num_channels, num_per_input_channel); + CPUMatType gradient_wrt_input_col; // Iterate through data samples for(int sample = 0; sample < local_width; ++sample) { @@ -451,16 +453,16 @@ class pooling_layer : public transform_layer { // Copy previous error signal to im2col matrix entries // corresponding to max - const DataType *gradient_wrt_output_buffer + const TensorDataType *gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(0, sample); const int *indices_buffer - = &m_max_pool_indices[sample * get_output_size()]; + = &m_max_pool_indices[sample * this->get_output_size()]; LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_input_channel; ++j) { const int input_index = j + channel * num_per_input_channel; const int max_index = indices_buffer[input_index]; - DataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); im2col_buffer[max_index] = gradient_wrt_output_buffer[input_index]; } @@ -470,14 +472,14 @@ class pooling_layer : public transform_layer { // Compute gradient w.r.t. im2col matrix for average pooling if(m_pool_mode == pool_mode::average) { - const DataType *gradient_wrt_output_buffer + const TensorDataType *gradient_wrt_output_buffer = local_gradient_wrt_output.LockedBuffer(0, sample); LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_input_channel; ++j) { - DataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel*m_pool_size, j); const int input_index = j + channel * num_per_input_channel; - const DataType output_entry + const TensorDataType output_entry = gradient_wrt_output_buffer[input_index] / m_pool_size; for(int i = 0; i < m_pool_size; ++i) { im2col_buffer[i] = output_entry; @@ -555,10 +557,10 @@ class pooling_layer : public transform_layer { #ifndef LBANN_POOLING_LAYER_INSTANTIATE extern template class pooling_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class pooling_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_POOLING_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/reduction.hpp b/include/lbann/layers/transform/reduction.hpp index dce99308c0f..abc174c34b2 100644 --- a/include/lbann/layers/transform/reduction.hpp +++ b/include/lbann/layers/transform/reduction.hpp @@ -38,9 +38,10 @@ enum class reduction_mode {INVALID, SUM, AVERAGE}; * * @todo Reduction over specified dimensions. 
*/ -template -class reduction_layer : public transform_layer { +class reduction_layer : public transform_layer { static_assert(T_layout == data_layout::DATA_PARALLEL, "reduction currently only supports DATA_PARALLEL"); private: @@ -55,7 +56,7 @@ class reduction_layer : public transform_layer { reduction_layer(lbann_comm *comm, reduction_mode mode) - : transform_layer(comm), + : transform_layer(comm), m_mode(mode) { if (mode == reduction_mode::INVALID) { LBANN_ERROR("invalid reduction mode"); @@ -68,7 +69,7 @@ class reduction_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::string mode_str; switch (m_mode) { case reduction_mode::SUM: mode_str = "sum"; break; @@ -84,15 +85,15 @@ class reduction_layer : public transform_layer { protected: void setup_dims() override { - Layer::setup_dims(); - set_output_dims({1}); + data_type_layer::setup_dims(); + this->set_output_dims({1}); } void fp_compute() override { // Local matrices - const auto& local_input = get_local_prev_activations(); - auto& local_output = get_local_activations(); + const auto& local_input = this->get_local_prev_activations(); + auto& local_output = this->get_local_activations(); const El::Int input_size = local_input.Height(); // Apply reduction @@ -100,14 +101,14 @@ class reduction_layer : public transform_layer { case reduction_mode::SUM: El::Ones(m_ones, input_size, 1); El::Gemv(El::TRANSPOSE, - DataType(1), local_input, m_ones, - DataType(0), local_output); + TensorDataType(1), local_input, m_ones, + TensorDataType(0), local_output); break; case reduction_mode::AVERAGE: El::Ones(m_ones, input_size, 1); El::Gemv(El::TRANSPOSE, - DataType(1) / input_size, local_input, m_ones, - DataType(0), local_output); + TensorDataType(1) / input_size, local_input, m_ones, + TensorDataType(0), local_output); break; default: LBANN_ERROR("invalid reduction mode"); @@ -118,8 +119,8 @@ class reduction_layer : public transform_layer { void bp_compute() override { // Local matrices - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); - auto& local_gradient_wrt_input = get_local_error_signals(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); + auto& local_gradient_wrt_input = this->get_local_error_signals(); const El::Int input_size = local_gradient_wrt_input.Height(); // Compute gradients w.r.t. 
inputs @@ -127,14 +128,14 @@ class reduction_layer : public transform_layer { case reduction_mode::SUM: El::Ones(m_ones, input_size, 1); El::Gemm(El::NORMAL, El::NORMAL, - DataType(1), m_ones, local_gradient_wrt_output, - DataType(0), local_gradient_wrt_input); + TensorDataType(1), m_ones, local_gradient_wrt_output, + TensorDataType(0), local_gradient_wrt_input); break; case reduction_mode::AVERAGE: El::Ones(m_ones, input_size, 1); El::Gemm(El::NORMAL, El::NORMAL, - DataType(1) / input_size, m_ones, local_gradient_wrt_output, - DataType(0), local_gradient_wrt_input); + TensorDataType(1) / input_size, m_ones, local_gradient_wrt_output, + TensorDataType(0), local_gradient_wrt_input); break; default: LBANN_ERROR("invalid reduction mode"); @@ -146,10 +147,10 @@ class reduction_layer : public transform_layer { #ifndef LBANN_REDUCTION_LAYER_INSTANTIATE extern template class reduction_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class reduction_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_REDUCTION_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/reshape.hpp b/include/lbann/layers/transform/reshape.hpp index 982c8eab3bd..37c10bccf5f 100644 --- a/include/lbann/layers/transform/reshape.hpp +++ b/include/lbann/layers/transform/reshape.hpp @@ -36,13 +36,13 @@ namespace lbann { * Forward and backward prop simply involve setting up tensor views, * and hence are very cheap. */ -template -class reshape_layer : public transform_layer { +template +class reshape_layer : public transform_layer { public: reshape_layer(lbann_comm *comm, std::vector dims) - : transform_layer(comm) { - set_output_dims(dims); + : transform_layer(comm) { + this->set_output_dims(dims); } reshape_layer* copy() const override { return new reshape_layer(*this); } std::string get_type() const override { return "reshape"; } @@ -52,10 +52,10 @@ class reshape_layer : public transform_layer { protected: void setup_dims() override { - transform_layer::setup_dims(); + transform_layer::setup_dims(); - const auto& input_dims = get_input_dims(); - auto output_dims = get_output_dims(); + const auto& input_dims = this->get_input_dims(); + auto output_dims = this->get_output_dims(); // Determine any unspecified dimensions int unspecified_dim = -1; @@ -70,12 +70,12 @@ class reshape_layer : public transform_layer { output_dims.end(), 1, std::multiplies()); - output_dims[unspecified_dim] = get_input_size() / specified_size; - set_output_dims(output_dims); + output_dims[unspecified_dim] = this->get_input_size() / specified_size; + this->set_output_dims(output_dims); } // Check that reshape is valid - if (get_input_size() != get_output_size()) { + if (this->get_input_size() != this->get_output_size()) { std::stringstream err; err << "input tensor dimensions ("; for (size_t i = 0; i < input_dims.size(); ++i) { @@ -92,10 +92,10 @@ class reshape_layer : public transform_layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - El::LockedView(get_activations(), get_prev_activations()); + El::LockedView(this->get_activations(), this->get_prev_activations()); } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - El::LockedView(get_error_signals(), get_prev_error_signals()); + El::LockedView(this->get_error_signals(), this->get_prev_error_signals()); } void fp_compute() override {} void bp_compute() 
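// fp_compute()/bp_compute() are deliberately empty: each mini-batch
// sample is already stored as one flattened column, so reshape only
// swaps dimension metadata and forwards the buffer as a view. The
// invariant in miniature, with a hypothetical product() helper
// (illustrative, not the Elemental view machinery):
//
//   assert(product(new_dims) == product(old_dims));  // same check as setup_dims()
//   return TensorView{input.buffer, new_dims};       // shared buffer, new dims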
override {} @@ -104,14 +104,14 @@ class reshape_layer : public transform_layer { #ifndef LBANN_RESHAPE_LAYER_INSTANTIATE extern template class reshape_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class reshape_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class reshape_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class reshape_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_RESHAPE_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/slice.hpp b/include/lbann/layers/transform/slice.hpp index 091ecb1fca6..ba835d4f704 100644 --- a/include/lbann/layers/transform/slice.hpp +++ b/include/lbann/layers/transform/slice.hpp @@ -44,22 +44,32 @@ namespace lbann { * \cdots\times D_n @f$ * tensor. */ -template -class slice_layer : public transform_layer { +class slice_layer : public transform_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + ///@} + public: slice_layer(lbann_comm *comm, El::Int slice_dim, std::vector slice_points) - : transform_layer(comm), + : transform_layer(comm), m_slice_dim(slice_dim), m_slice_points(slice_points) { this->m_expected_num_child_layers = -1; // No limit on children } slice_layer(const slice_layer& other) - : transform_layer(other), + : transform_layer(other), m_slice_dim(other.m_slice_dim), m_slice_points(other.m_slice_points) { m_input_v.reset(other.m_input_v ? other.m_input_v->Copy() : nullptr); @@ -67,7 +77,7 @@ class slice_layer : public transform_layer { } slice_layer& operator=(const slice_layer& other) { - transform_layer::operator=(other); + transform_layer::operator=(other); m_slice_dim = other.m_slice_dim; m_slice_points = other.m_slice_points; m_input_v.reset(other.m_input_v ? 
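// The reset(... ? ...->Copy() : nullptr) idiom being completed here
// deep copies the view through a virtual Copy(), because the concrete
// distributed-matrix type is only known at run time. The same pattern
// in miniature, with a hypothetical clonable base class:
//
//   struct Holder {
//     std::unique_ptr<Base> view;
//     Holder(const Holder& other)
//       : view(other.view ? other.view->Copy() : nullptr) {}
//   };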
other.m_input_v->Copy() : nullptr); @@ -86,7 +96,7 @@ class slice_layer : public transform_layer { std::vector get_slice_points() const { return m_slice_points; } description get_description() const override { - auto desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Slice dimension", m_slice_dim); std::stringstream ss; for (size_t i = 0; i < m_slice_points.size(); ++i) { @@ -99,40 +109,40 @@ class slice_layer : public transform_layer { protected: void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& input = get_prev_activations(); + transform_layer::setup_matrices(grid); + const auto& input = this->get_prev_activations(); m_input_v.reset(input.Construct(input.Grid(), input.Root())); m_output_v.reset(input.Construct(input.Grid(), input.Root())); } void setup_dims() override { - transform_layer::setup_dims(); - const auto& input_dims = get_input_dims(); - const auto& num_outputs = get_num_children(); + transform_layer::setup_dims(); + const auto& input_dims = this->get_input_dims(); + const auto& num_outputs = this->get_num_children(); // Check that slice parameters are valid std::stringstream err; if (m_slice_dim < 0 || m_slice_dim >= (El::Int) input_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has " << input_dims.size() << " dimensions, " << "but attempted to slice along dimension " << m_slice_dim; LBANN_ERROR(err.str()); } if ((int) m_slice_points.size() <= num_outputs) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "requires more slice points than output tensors " << "(found " << m_slice_points.size() << " slice points " - << "and " << m_child_layers.size() << " output tensors)"; + << "and " << this->m_child_layers.size() << " output tensors)"; LBANN_ERROR(err.str()); } if (!std::is_sorted(m_slice_points.begin(), m_slice_points.end())) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has unsorted slice points"; LBANN_ERROR(err.str()); } if (m_slice_points.front() < 0 || m_slice_points.back() > input_dims[m_slice_dim]) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects slice points in the range " << "[0, " << input_dims[m_slice_dim] << "], " << "but found an invalid slice point "; @@ -148,14 +158,14 @@ class slice_layer : public transform_layer { auto output_dims = input_dims; for (int i = 0; i < num_outputs; ++i) { output_dims[m_slice_dim] = m_slice_points[i+1] - m_slice_points[i]; - set_output_dims(output_dims, i); + this->set_output_dims(output_dims, i); } } void fp_setup_outputs(El::Int mini_batch_size) override { - const auto& num_outputs = get_num_children(); - const auto& input_dims = get_input_dims(); + const auto& num_outputs = this->get_num_children(); + const auto& input_dims = this->get_input_dims(); // Divide input tensor into unit slices along slice dimension // Note: Each unit slice is divided into contiguous "unit blocks" @@ -171,11 +181,11 @@ class slice_layer : public transform_layer { * unit_block_size); // Populate output tensors with slices of input tensor - const auto& input = get_prev_activations(); + const auto& input = this->get_prev_activations(); for (int i = 0; i < num_outputs; ++i) { - const auto& output_dims = 
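// The slice bookkeeping validated above: num_outputs children require
// num_outputs + 1 sorted slice points, and child i receives
// slice_points[i+1] - slice_points[i] entries along m_slice_dim. In
// miniature (illustrative numbers):
//
//   std::vector<int> widths;
//   for (size_t i = 0; i + 1 < points.size(); ++i)
//     widths.push_back(points[i + 1] - points[i]);
//   // e.g. points = {0, 3, 7, 10} gives widths = {3, 4, 3}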
get_output_dims(i); - const auto& output_size = get_output_size(i); - auto& output = get_activations(i); + const auto& output_dims = this->get_output_dims(i); + const auto& output_size = this->get_output_size(i); + auto& output = this->get_activations(i); output.Empty(false); // Divide output tensor into unit slices @@ -215,14 +225,14 @@ class slice_layer : public transform_layer { } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - const auto& num_outputs = get_num_children(); - const auto& input_dims = get_input_dims(); + const auto& num_outputs = this->get_num_children(); + const auto& input_dims = this->get_input_dims(); // Initialize gradient w.r.t. input tensor - auto& gradient_wrt_input = get_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(); gradient_wrt_input.Empty(false); - gradient_wrt_input.AlignWith(get_prev_activations()); - gradient_wrt_input.Resize(get_input_size(), mini_batch_size); + gradient_wrt_input.AlignWith(this->get_prev_activations()); + gradient_wrt_input.Resize(this->get_input_size(), mini_batch_size); if (m_slice_points[0] != 0 || m_slice_points[num_outputs] != input_dims[m_slice_dim]) { El::Zero(gradient_wrt_input); @@ -243,8 +253,8 @@ class slice_layer : public transform_layer { // Populate slices of gradient w.r.t. input tensor for (int i = 0; i < num_outputs; ++i) { - const auto& output_dims = get_output_dims(i); - const auto& gradient_wrt_output = get_prev_error_signals(i); + const auto& output_dims = this->get_output_dims(i); + const auto& gradient_wrt_output = this->get_prev_error_signals(i); // Divide output tensor into unit slices const auto& output_num_unit_slices = output_dims[m_slice_dim]; @@ -282,18 +292,18 @@ class slice_layer : public transform_layer { std::vector m_slice_points; /** View into input tensor. */ - std::unique_ptr m_input_v; + std::unique_ptr m_input_v; /** View into output tensor. */ - std::unique_ptr m_output_v; + std::unique_ptr m_output_v; }; #ifndef LBANN_SLICE_LAYER_INSTANTIATE -extern template class slice_layer; -extern template class slice_layer; +extern template class slice_layer; +extern template class slice_layer; #ifdef LBANN_HAS_GPU -extern template class slice_layer; -extern template class slice_layer; +extern template class slice_layer; +extern template class slice_layer; #endif // LBANN_HAS_GPU #endif // LBANN_SLICE_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/sort.hpp b/include/lbann/layers/transform/sort.hpp index d9d54821e3f..b4224afeb02 100644 --- a/include/lbann/layers/transform/sort.hpp +++ b/include/lbann/layers/transform/sort.hpp @@ -32,18 +32,19 @@ namespace lbann { /** @brief Sort tensor entries. 
*/ -template -class sort_layer : public transform_layer { +class sort_layer : public transform_layer { static_assert(T_layout == data_layout::DATA_PARALLEL, "sort layer only supports DATA_PARALLEL"); public: sort_layer(lbann_comm *comm, bool descending = false) - : transform_layer(comm), m_descending(descending) { + : transform_layer(comm), m_descending(descending) { } sort_layer(const sort_layer& other) - : transform_layer(other), + : transform_layer(other), m_descending(other.m_descending) { if (other.m_indices) { switch (other.m_indices->GetDevice()) { @@ -61,7 +62,7 @@ class sort_layer : public transform_layer { } } sort_layer& operator=(const sort_layer& other) { - transform_layer::operator=(other); + transform_layer::operator=(other); m_descending = other.m_descending; if (!other.m_indices) { m_indices.reset(nullptr); @@ -88,7 +89,7 @@ class sort_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); desc.add("Descending", m_descending); return desc; } @@ -96,13 +97,13 @@ class sort_layer : public transform_layer { protected: void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + transform_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); - const auto& dist = get_activations().DistData(); + transform_layer::setup_matrices(grid); + const auto& dist = this->get_activations().DistData(); switch (dist.device) { case El::Device::CPU: m_indices.reset(new El::Matrix()); @@ -118,8 +119,8 @@ class sort_layer : public transform_layer { } void fp_setup_outputs(El::Int mini_batch_size) override { - transform_layer::fp_setup_outputs(mini_batch_size); - const auto& output = get_activations(); + transform_layer::fp_setup_outputs(mini_batch_size); + const auto& output = this->get_activations(); m_indices->Resize(output.LocalHeight(), output.LocalWidth()); } @@ -141,10 +142,10 @@ class sort_layer : public transform_layer { #ifndef LBANN_SORT_LAYER_INSTANTIATE extern template class sort_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class sort_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_SORT_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/split.hpp b/include/lbann/layers/transform/split.hpp index 600148a8062..072160eb8d7 100644 --- a/include/lbann/layers/transform/split.hpp +++ b/include/lbann/layers/transform/split.hpp @@ -34,12 +34,13 @@ namespace lbann { /** @brief Present input tensor to multiple outputs. 
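 */

// The fan-out contract implemented below: forward presents one tensor
// to every child as a view (no copies), so backward must sum the
// children's gradients, dL/dx = sum_i dL/dy_i. A standalone sketch of
// that accumulation, assuming plain std::vector tensors (illustrative
// names, not the LBANN API; it mirrors the El::Copy/El::Axpy loop below):

#include <vector>

std::vector<float> split_backward(
    const std::vector<std::vector<float>>& child_grads, std::size_t size) {
  std::vector<float> grad(size, 0.0f);
  for (const auto& g : child_grads) {
    for (std::size_t k = 0; k < size; ++k) {
      grad[k] += g[k];  // accumulate each child's error signal
    }
  }
  return grad;
}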
-template <data_layout T_layout, El::Device Dev>
-class split_layer : public transform_layer {
+template <typename TensorDataType, data_layout T_layout, El::Device Dev>
+class split_layer : public transform_layer<TensorDataType> {
 public:
 
-  split_layer(lbann_comm *comm) : transform_layer(comm) {
+  split_layer(lbann_comm *comm) : transform_layer<TensorDataType>(comm) {
     this->m_expected_num_child_layers = -1; // No limit on children
   }
 
@@ -51,30 +52,30 @@
 protected:
 
   void setup_dims() override {
-    Layer::setup_dims();
-    for (int i = 0; i < get_num_children(); ++i) {
-      set_output_dims(get_input_dims(), i);
+    data_type_layer<TensorDataType>::setup_dims();
+    for (int i = 0; i < this->get_num_children(); ++i) {
+      this->set_output_dims(this->get_input_dims(), i);
     }
   }
 
   void fp_setup_outputs(El::Int mini_batch_size) override {
-    const auto& input = get_prev_activations();
-    for (int i = 0; i < get_num_children(); ++i) {
-      El::LockedView(get_activations(i), input);
+    const auto& input = this->get_prev_activations();
+    for (int i = 0; i < this->get_num_children(); ++i) {
+      El::LockedView(this->get_activations(i), input);
     }
   }
 
   void fp_compute() override {}
 
   void bp_compute() override {
-    auto& gradient_wrt_input = get_error_signals();
-    if (get_num_children() > 0) {
-      El::Copy(get_prev_error_signals(0), gradient_wrt_input);
+    auto& gradient_wrt_input = this->get_error_signals();
+    if (this->get_num_children() > 0) {
+      El::Copy(this->get_prev_error_signals(0), gradient_wrt_input);
     } else {
       El::Zero(gradient_wrt_input);
     }
-    for (int i = 1; i < get_num_children(); ++i) {
-      El::Axpy(DataType(1), get_prev_error_signals(i),
+    for (int i = 1; i < this->get_num_children(); ++i) {
+      El::Axpy(DataType(1), this->get_prev_error_signals(i),
                gradient_wrt_input);
     }
   }
 
 };
 
 #ifndef LBANN_SPLIT_LAYER_INSTANTIATE
-extern template class split_layer<data_layout::DATA_PARALLEL, El::Device::CPU>;
-extern template class split_layer<data_layout::MODEL_PARALLEL, El::Device::CPU>;
+extern template class split_layer<DataType, data_layout::DATA_PARALLEL, El::Device::CPU>;
+extern template class split_layer<DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>;
 #ifdef LBANN_HAS_GPU
-extern template class split_layer<data_layout::DATA_PARALLEL, El::Device::GPU>;
-extern template class split_layer<data_layout::MODEL_PARALLEL, El::Device::GPU>;
+extern template class split_layer<DataType, data_layout::DATA_PARALLEL, El::Device::GPU>;
+extern template class split_layer<DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>;
 #endif // LBANN_HAS_GPU
 #endif // LBANN_SPLIT_LAYER_INSTANTIATE
 
diff --git a/include/lbann/layers/transform/stop_gradient.hpp b/include/lbann/layers/transform/stop_gradient.hpp
index 08666c25c2c..3642bb054f8 100644
--- a/include/lbann/layers/transform/stop_gradient.hpp
+++ b/include/lbann/layers/transform/stop_gradient.hpp
@@ -39,10 +39,10 @@ namespace lbann {
  * means that computed gradients in preceding layers are not exact
  * gradients of the objective function.
*/ -template -class stop_gradient_layer : public transform_layer { +template +class stop_gradient_layer : public transform_layer { public: - stop_gradient_layer(lbann_comm *comm) : transform_layer(comm) {} + stop_gradient_layer(lbann_comm *comm) : transform_layer(comm) {} stop_gradient_layer* copy() const override { return new stop_gradient_layer(*this); } std::string get_type() const override { return "stop_gradient"; } data_layout get_data_layout() const override { return T_layout; } @@ -50,11 +50,11 @@ class stop_gradient_layer : public transform_layer { protected: void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + transform_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); } void fp_setup_outputs(El::Int mini_batch_size) override { - El::LockedView(get_activations(), get_prev_activations()); + El::LockedView(this->get_activations(), this->get_prev_activations()); } void fp_compute() override {} @@ -62,14 +62,14 @@ class stop_gradient_layer : public transform_layer { #ifndef LBANN_STOP_GRADIENT_LAYER_INSTANTIATE extern template class stop_gradient_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class stop_gradient_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class stop_gradient_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class stop_gradient_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_STOP_GRADIENT_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/sum.hpp b/include/lbann/layers/transform/sum.hpp index 3544f90b1a7..e68c06804d2 100644 --- a/include/lbann/layers/transform/sum.hpp +++ b/include/lbann/layers/transform/sum.hpp @@ -32,13 +32,14 @@ namespace lbann { -template -class sum_layer : public transform_layer { +class sum_layer : public transform_layer { public: sum_layer(lbann_comm *comm) - : transform_layer(comm) { + : transform_layer(comm) { this->m_expected_num_parent_layers = -1; // No limit on parents } @@ -50,29 +51,29 @@ class sum_layer : public transform_layer { protected: void setup_pointers() override { - transform_layer::setup_pointers(); - if (get_num_parents() < 1) { + transform_layer::setup_pointers(); + if (this->get_num_parents() < 1) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has no parent layers"; LBANN_ERROR(err.str()); } } void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + transform_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); // Check that input dimensions match - const auto& output_dims = get_output_dims(); - for (int i = 0; i < get_num_parents(); ++i) { - if (get_input_dims(i) != output_dims) { - const auto& parents = get_parent_layers(); + const auto& output_dims = this->get_output_dims(); + for (int i = 0; i < this->get_num_parents(); ++i) { + if (this->get_input_dims(i) != output_dims) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with incompatible 
dimensions ("; - for (int j = 0; j < get_num_parents(); ++j) { - const auto& dims = get_input_dims(j); + for (int j = 0; j < this->get_num_parents(); ++j) { + const auto& dims = this->get_input_dims(j); err << (j > 0 ? ", " : "") << "layer \"" << parents[j]->get_name() << "\" outputs "; for (size_t k = 0; k < dims.size(); ++k) { @@ -87,17 +88,17 @@ class sum_layer : public transform_layer { } void fp_compute() override { - auto& output = get_activations(); - El::Copy(get_prev_activations(0), output); - for (int i = 1; i < get_num_parents(); ++i) { - El::Axpy(DataType(1), get_prev_activations(i), output); + auto& output = this->get_activations(); + El::Copy(this->get_prev_activations(0), output); + for (int i = 1; i < this->get_num_parents(); ++i) { + El::Axpy(DataType(1), this->get_prev_activations(i), output); } } void bp_setup_gradient_wrt_inputs(El::Int mini_batch_size) override { - const auto& gradient_wrt_output = get_prev_error_signals(); - for (int i = 0; i < get_num_parents(); ++i) { - El::LockedView(get_error_signals(i), gradient_wrt_output); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + for (int i = 0; i < this->get_num_parents(); ++i) { + El::LockedView(this->get_error_signals(i), gradient_wrt_output); } } @@ -106,11 +107,11 @@ class sum_layer : public transform_layer { }; #ifndef LBANN_SUM_LAYER_INSTANTIATE -extern template class sum_layer; -extern template class sum_layer; +extern template class sum_layer; +extern template class sum_layer; #ifdef LBANN_HAS_GPU -extern template class sum_layer; -extern template class sum_layer; +extern template class sum_layer; +extern template class sum_layer; #endif // LBANN_HAS_GPU #endif // LBANN_SUM_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/tessellate.hpp b/include/lbann/layers/transform/tessellate.hpp index 07111dc8ab9..4d964a82369 100644 --- a/include/lbann/layers/transform/tessellate.hpp +++ b/include/lbann/layers/transform/tessellate.hpp @@ -27,7 +27,7 @@ #ifndef LBANN_LAYERS_TRANSFORM_TESSELLATE_HPP_INCLUDED #define LBANN_LAYERS_TRANSFORM_TESSELLATE_HPP_INCLUDED -#include "lbann/layers/layer.hpp" +#include "lbann/layers/data_type_layer.hpp" namespace lbann { @@ -57,21 +57,34 @@ namespace lbann { * e_n@f$. Then, denoting the modulo operator with @f$ \% @f$, * @f[ Y_{i_1,\cdots,i_n} = X_{i_1\% d_1,\cdots,i_n\% d_n} @f] */ -template -class tessellate_layer : public Layer { +class tessellate_layer : public data_type_layer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The local tensor type expected in this object. */ + using AbsMatrixType = El::AbstractMatrix; + + ///@} + public: tessellate_layer(lbann_comm *comm, std::vector dims = {}) - : Layer(comm) { - set_output_dims(dims); + : data_type_layer(comm) { + this->set_output_dims(dims); } tessellate_layer(const tessellate_layer& other) - : Layer(other), + : data_type_layer(other), m_input_v(other.m_input_v ? other.m_input_v->Copy() : nullptr) {} tessellate_layer& operator=(const tessellate_layer& other) { - Layer::operator=(other); + data_type_layer::operator=(other); m_input_v.reset(other.m_input_v ? 
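// The tessellation rule quoted in the class comment above,
// Y[i1,...,in] = X[i1 % d1, ..., in % dn], reduced to one dimension
// (illustrative sketch, not the fp_compute_3d kernel):
//
//   for (int i = 0; i < out_size; ++i)
//     y[i] = x[i % in_size];  // outputs wrap around the input extent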
other.m_input_v->Copy() : nullptr); return *this; } @@ -82,14 +95,14 @@ class tessellate_layer : public Layer { El::Device get_device_allocation() const override { return Device; } void setup_dims() override { - Layer::setup_dims(); + data_type_layer::setup_dims(); std::stringstream err; // Check input and output dimensions - const auto input_dims = get_input_dims(); - const auto& output_dims = get_output_dims(); + const auto input_dims = this->get_input_dims(); + const auto& output_dims = this->get_output_dims(); if (input_dims.size() != output_dims.size()) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to tessellate a "; for (size_t i = 0; i < input_dims.size(); ++i) { err << (i > 0 ? "x" : "") << input_dims[i]; @@ -104,7 +117,7 @@ class tessellate_layer : public Layer { /// @todo Support tessellation with >3 dimensions if (input_dims.size() > 3) { - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "attempted to tessellate a "; for (size_t i = 0; i < input_dims.size(); ++i) { err << (i > 0 ? "x" : "") << input_dims[i]; @@ -116,10 +129,10 @@ class tessellate_layer : public Layer { } void setup_matrices(const El::Grid& grid) override { - Layer::setup_matrices(grid); - auto dist_data = get_prev_activations().DistData(); + data_type_layer::setup_matrices(grid); + auto dist_data = this->get_prev_activations().DistData(); dist_data.colDist = El::STAR; - m_input_v.reset(AbsDistMat::Instantiate(dist_data)); + m_input_v.reset(AbsDistMatrixType::Instantiate(dist_data)); } protected: @@ -127,14 +140,14 @@ class tessellate_layer : public Layer { void fp_compute() override { // Get input and output dimensions - auto input_dims = get_input_dims(); - auto output_dims = get_output_dims(); + auto input_dims = this->get_input_dims(); + auto output_dims = this->get_output_dims(); while (input_dims.size() < 3) { input_dims.insert(input_dims.begin(), 1); } while (output_dims.size() < 3) { output_dims.insert(output_dims.begin(), 1); } // Get input and output data - auto& output = get_activations(); - const auto& input = get_prev_activations(); + auto& output = this->get_activations(); + const auto& input = this->get_prev_activations(); m_input_v->Empty(false); m_input_v->AlignWith(output); if (m_input_v->DistData() == input.DistData()) { @@ -156,14 +169,14 @@ class tessellate_layer : public Layer { void bp_compute() override { // Get input and output dimensions - auto input_dims = get_input_dims(); - auto output_dims = get_output_dims(); + auto input_dims = this->get_input_dims(); + auto output_dims = this->get_output_dims(); while (input_dims.size() < 3) { input_dims.insert(input_dims.begin(), 1); } while (output_dims.size() < 3) { output_dims.insert(output_dims.begin(), 1); } // Get input and output data - const auto& gradient_wrt_output = get_prev_error_signals(); - auto& gradient_wrt_input = get_error_signals(); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + auto& gradient_wrt_input = this->get_error_signals(); m_input_v->Empty(false); m_input_v->AlignWith(gradient_wrt_output); if (m_input_v->DistData() == gradient_wrt_input.DistData()) { @@ -181,7 +194,7 @@ class tessellate_layer : public Layer { // Accumulate local error signals, if needed if (m_input_v->DistData() != gradient_wrt_input.DistData()) { - m_comm->allreduce(*m_input_v, m_input_v->RedundantComm()); + this->m_comm->allreduce(*m_input_v, 
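m_input_v->RedundantComm());
       El::Copy(*m_input_v, gradient_wrt_input);
     }

// The allreduce just above is needed because each rank accumulates
// only its local contribution to the tessellation gradient; summing
// over the redundant communicator recovers the full dL/dX. The local
// accumulation in one dimension (illustrative sketch mirroring
// bp_compute_3d):
//
//   std::vector<float> grad_x(in_size, 0.0f);
//   for (int i = 0; i < out_size; ++i)
//     grad_x[i % in_size] += grad_y[i];  // every output copy contributes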
@@ -190,38 +203,38 @@
 private:
 
   /** View into input tensor. */
-  std::unique_ptr<AbsDistMat> m_input_v;
+  std::unique_ptr<AbsDistMatrixType> m_input_v;
 
   /** Apply tessellation.
    * Columns of 'input' should be intact mini-batch samples. If the
    * data layout is not purely data-parallel, this means input data
    * is duplicated over the input matrix's column communicator.
    */
-  static void fp_compute_3d(const std::vector<int>& input_dims,
-                            const std::vector<int>& output_dims,
-                            const AbsMat& input,
-                            AbsDistMat& output);
+  void fp_compute_3d(const std::vector<int>& input_dims,
+                     const std::vector<int>& output_dims,
+                     const AbsMatrixType& input,
+                     AbsDistMatrixType& output);
 
   /** Compute local contribution to tessellation back prop
    * The global gradient w.r.t. input can be obtained by performing
    * an allreduce over the input matrix's column communicator.
    */
-  static void bp_compute_3d(const std::vector<int>& input_dims,
-                            const std::vector<int>& output_dims,
-                            const AbsDistMat& gradient_wrt_output,
-                            AbsMat& gradient_wrt_input);
+  void bp_compute_3d(const std::vector<int>& input_dims,
+                     const std::vector<int>& output_dims,
+                     const AbsDistMatrixType& gradient_wrt_output,
+                     AbsMatrixType& gradient_wrt_input);
 
 };
 
 #ifndef LBANN_TESSELLATE_LAYER_INSTANTIATE
 extern template class tessellate_layer<
-  data_layout::DATA_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::CPU>;
 extern template class tessellate_layer<
-  data_layout::MODEL_PARALLEL, El::Device::CPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>;
 #ifdef LBANN_HAS_GPU
 extern template class tessellate_layer<
-  data_layout::DATA_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::DATA_PARALLEL, El::Device::GPU>;
 extern template class tessellate_layer<
-  data_layout::MODEL_PARALLEL, El::Device::GPU>;
+  DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>;
 #endif // LBANN_HAS_GPU
 #endif // LBANN_TESSELLATE_LAYER_INSTANTIATE
 
diff --git a/include/lbann/layers/transform/transform.hpp b/include/lbann/layers/transform/transform.hpp
index 98b2a169ea3..23b579c4322 100644
--- a/include/lbann/layers/transform/transform.hpp
+++ b/include/lbann/layers/transform/transform.hpp
@@ -27,16 +27,17 @@
 #ifndef LBANN_LAYER_TRANSFORM_HPP_INCLUDED
 #define LBANN_LAYER_TRANSFORM_HPP_INCLUDED
 
-#include "lbann/layers/layer.hpp"
+#include "lbann/layers/data_type_layer.hpp"
 
 namespace lbann {
 
 /** @todo Remove. Layers should inherit directly from the base layer
  * class. */
-class transform_layer : public Layer {
+template <typename TensorDataType>
+class transform_layer : public data_type_layer<TensorDataType> {
 public:
-  transform_layer(lbann_comm *comm) : Layer(comm) {}
+  transform_layer(lbann_comm *comm) : data_type_layer<TensorDataType>(comm) {}
 };
 
 } // namespace lbann
 
diff --git a/include/lbann/layers/transform/uniform.hpp b/include/lbann/layers/transform/uniform.hpp
index 5b394396cb6..1b336e7e217 100644
--- a/include/lbann/layers/transform/uniform.hpp
+++ b/include/lbann/layers/transform/uniform.hpp
@@ -38,23 +38,24 @@ namespace lbann {
  * During validation and testing, outputs are all equal to the
  * distribution mean.
  */
-template <data_layout T_layout, El::Device Dev>
-class uniform_layer : public transform_layer {
+template <typename TensorDataType, data_layout T_layout, El::Device Dev>
+class uniform_layer : public transform_layer<TensorDataType> {
 private:
   /** Uniform distribution minimum. */
-  DataType m_min;
+  TensorDataType m_min;
   /** Uniform distribution maximum.
*/ - DataType m_max; + TensorDataType m_max; public: uniform_layer(lbann_comm *comm, std::vector dims, - DataType min = DataType(0), - DataType max = DataType(1)) - : transform_layer(comm), m_min(min), m_max(max) { - set_output_dims(dims); + TensorDataType min = TensorDataType(0), + TensorDataType max = TensorDataType(1)) + : transform_layer(comm), m_min(min), m_max(max) { + this->set_output_dims(dims); this->m_expected_num_parent_layers = 0; } uniform_layer* copy() const override { return new uniform_layer(*this); } @@ -63,7 +64,7 @@ class uniform_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::stringstream ss; ss << "[" << m_min << "," << m_max << ")"; desc.add("Range", ss.str()); @@ -75,7 +76,7 @@ class uniform_layer : public transform_layer { void fp_compute() override { const auto& mean = (m_max + m_min) / 2; const auto& radius = (m_max - m_min) / 2; - auto& output = get_activations(); + auto& output = this->get_activations(); if (this->m_model->get_execution_context().get_execution_mode() == execution_mode::training) { uniform_fill(output, output.Height(), output.Width(), mean, radius); } else { @@ -87,14 +88,14 @@ class uniform_layer : public transform_layer { #ifndef LBANN_UNIFORM_LAYER_INSTANTIATE extern template class uniform_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class uniform_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class uniform_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class uniform_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_UNIFORM_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/unpooling.hpp b/include/lbann/layers/transform/unpooling.hpp index 59372cf185a..64e0c6a4078 100644 --- a/include/lbann/layers/transform/unpooling.hpp +++ b/include/lbann/layers/transform/unpooling.hpp @@ -37,9 +37,9 @@ namespace lbann { /** @brief Transpose of pooling layer. * @todo GPU support. */ -template -class unpooling_layer : public transform_layer { +class unpooling_layer : public transform_layer { static_assert(T_layout == data_layout::DATA_PARALLEL, "unpooling only supports DATA_PARALLEL"); static_assert(Dev == El::Device::CPU, @@ -47,13 +47,13 @@ class unpooling_layer : public transform_layer { private: /** Corresponding pooling layer. 
*/ - pooling_layer* m_pooling_layer; + pooling_layer* m_pooling_layer; public: unpooling_layer(lbann_comm *comm, - pooling_layer* pool = nullptr) - : transform_layer(comm), + pooling_layer* pool = nullptr) + : transform_layer(comm), m_pooling_layer(pool) { } unpooling_layer* copy() const override { return new unpooling_layer(*this); } @@ -75,20 +75,20 @@ class unpooling_layer : public transform_layer { } void setup_dims() override { - transform_layer::setup_dims(); + transform_layer::setup_dims(); // Check that input tensor is valid - const auto& input_dims = get_input_dims(); + const auto& input_dims = this->get_input_dims(); const auto& pool_output_dims = m_pooling_layer->get_output_dims(); if (input_dims != pool_output_dims) { std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "expects input tensors with dimensions "; for (size_t i = 0; i < pool_output_dims.size(); ++i) { err << (i > 0 ? " x " : "") << pool_output_dims[i]; } err << ", but parent layer " - << "\"" << m_parent_layers[0]->get_name() << "\" " + << "\"" << this->get_parent_layers()[0]->get_name() << "\" " << "outputs with dimensions "; for (size_t i = 0; i < input_dims.size(); ++i) { err << (i > 0 ? " x " : "") << input_dims[i]; @@ -97,22 +97,22 @@ class unpooling_layer : public transform_layer { } // Initialize output tensor based on corresponding pooling layer - set_output_dims(m_pooling_layer->get_input_dims()); + this->set_output_dims(m_pooling_layer->get_input_dims()); } - void set_pooling_layer(pooling_layer* pool) { + void set_pooling_layer(pooling_layer* pool) { m_pooling_layer = pool; } std::vector get_layer_pointers() override { - std::vector layers = transform_layer::get_layer_pointers(); + std::vector layers = transform_layer::get_layer_pointers(); layers.push_back((Layer*) m_pooling_layer); return layers; } void set_layer_pointers(std::vector layers) override { - m_pooling_layer = dynamic_cast*>(layers.back()); + m_pooling_layer = dynamic_cast*>(layers.back()); if (m_pooling_layer == nullptr) { std::stringstream err; err << __FILE__ << " " << __LINE__ @@ -120,7 +120,7 @@ class unpooling_layer : public transform_layer { throw lbann_exception(err.str()); } layers.pop_back(); - transform_layer::set_layer_pointers(layers); + transform_layer::set_layer_pointers(layers); } protected: @@ -147,14 +147,14 @@ class unpooling_layer : public transform_layer { void fp_compute_im2col() { // Get local matrices - const DMat& prev_activations_local = get_local_prev_activations(); - DMat& activations_local = get_local_activations(); + const DMat& prev_activations_local = this->get_local_prev_activations(); + DMat& activations_local = this->get_local_activations(); // Get parameters const int local_width = prev_activations_local.Width(); - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_input_channel = get_input_size() / num_channels; + const int num_per_input_channel = this->get_input_size() / num_channels; const int pool_size = m_pooling_layer->m_pool_size; // Initialize im2col matrix @@ -167,16 +167,16 @@ class unpooling_layer : public transform_layer { El::Zero(im2col_mat); // Populate im2col matrix - const DataType *prev_activations_buffer + const TensorDataType *prev_activations_buffer = prev_activations_local.LockedBuffer(0, sample); const int *indices_buffer - = &m_pooling_layer->m_max_pool_indices[sample * 
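// Unpooling reuses the argmax indices recorded by its paired pooling
// layer: each pooled value is scattered back to the window position
// that won the forward max, and all other positions stay zero. One
// window in miniature (illustrative names):
//
//   std::fill(window.begin(), window.end(), 0.0f);
//   window[max_index] = pooled_value;  // route to the recorded argmax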
get_input_size()]; + = &m_pooling_layer->m_max_pool_indices[sample * this->get_input_size()]; LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_input_channel; ++j) { const int input_index = j + channel * num_per_input_channel; const int max_index = indices_buffer[input_index]; - DataType *im2col_buffer + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel * pool_size, j); im2col_buffer[max_index] = prev_activations_buffer[input_index]; @@ -184,7 +184,8 @@ class unpooling_layer : public transform_layer { } // Convert im2col matrix to output matrix - DMat output_mat = El::View(activations_local, El::ALL, El::IR(sample)); + DMat output_mat = + El::View(activations_local, El::ALL, El::IR(sample)); col2im(im2col_mat, output_mat, num_channels, @@ -193,24 +194,24 @@ class unpooling_layer : public transform_layer { m_pooling_layer->m_pads.data(), m_pooling_layer->m_pool_dims.data(), m_pooling_layer->m_strides.data(), - static_cast(&std::max)); - + [](TensorDataType const& a, TensorDataType const& b) { + return std::max(a, b); + }); } - } /// Unpooling backward propagation with im2col void bp_compute_im2col() { // Get local matrices - const DMat& prev_error_signal_local = get_local_prev_error_signals(); - DMat& error_signal_local = get_local_error_signals(); + const DMat& prev_error_signal_local = this->get_local_prev_error_signals(); + DMat& error_signal_local = this->get_local_error_signals(); // Get parameters const int local_width = prev_error_signal_local.Width(); - const auto& output_dims = get_output_dims(); + const auto& output_dims = this->get_output_dims(); const int num_channels = output_dims[0]; - const int num_per_output_channel = get_input_size() / num_channels; + const int num_per_output_channel = this->get_input_size() / num_channels; const int pool_size = m_pooling_layer->m_pool_size; // Initialize im2col matrix @@ -232,15 +233,15 @@ class unpooling_layer : public transform_layer { m_pooling_layer->m_strides.data()); // Propagate error signal based on pooling layer - DataType *output_buffer = error_signal_local.Buffer(0, sample); + TensorDataType *output_buffer = error_signal_local.Buffer(0, sample); const int *indices_buffer - = &m_pooling_layer->m_max_pool_indices[sample * get_input_size()]; + = &m_pooling_layer->m_max_pool_indices[sample * this->get_input_size()]; LBANN_OMP_PARALLEL_FOR for(int channel = 0; channel < num_channels; ++channel) { for(int j = 0; j < num_per_output_channel; ++j) { const int output_index = j + channel * num_per_output_channel; const int max_index = indices_buffer[output_index]; - DataType *im2col_buffer + TensorDataType *im2col_buffer = im2col_mat.Buffer(channel * pool_size, j); output_buffer[output_index] = im2col_buffer[max_index]; } @@ -254,7 +255,7 @@ class unpooling_layer : public transform_layer { #ifndef LBANN_UNPOOLING_LAYER_INSTANTIATE extern template class unpooling_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; #endif // LBANN_UNPOOLING_LAYER_INSTANTIATE } // namespace lbann diff --git a/include/lbann/layers/transform/weighted_sum.hpp b/include/lbann/layers/transform/weighted_sum.hpp index b96bb40f0e6..2aa4155e15a 100644 --- a/include/lbann/layers/transform/weighted_sum.hpp +++ b/include/lbann/layers/transform/weighted_sum.hpp @@ -34,9 +34,10 @@ namespace lbann { /** @brief Add tensors with specified scaling factors. 
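The fp_compute_im2col loop above is a scatter: each input entry is written to the argmax offset that the paired pooling layer recorded in m_max_pool_indices, and col2im then reduces overlapping windows with a max. (Replacing the cast of &std::max with a lambda also avoids having to name the exact std::max overload for every TensorDataType.) A minimal standalone sketch of the scatter on flat arrays, with hypothetical names:

#include <cstddef>
#include <vector>

// "Unpool" one channel by scattering each pooled value back to the
// argmax position its pooling window recorded; every other output
// entry stays zero. argmax plays the role of m_max_pool_indices.
std::vector<float> unpool_1d(const std::vector<float>& pooled,
                             const std::vector<int>& argmax,
                             int output_size) {
  std::vector<float> output(output_size, 0.0f);
  for (std::size_t i = 0; i < pooled.size(); ++i) {
    output[argmax[i]] = pooled[i];
  }
  return output;
}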
*/ -template -class weighted_sum_layer : public transform_layer { +class weighted_sum_layer : public transform_layer { private: /** Scaling factors for weighted sum. */ @@ -45,7 +46,7 @@ class weighted_sum_layer : public transform_layer { public: weighted_sum_layer(lbann_comm *comm, std::vector scaling_factors) - : transform_layer(comm), + : transform_layer(comm), m_scaling_factors(scaling_factors) { this->m_expected_num_parent_layers = -1; // No limit on parents } @@ -56,7 +57,7 @@ class weighted_sum_layer : public transform_layer { El::Device get_device_allocation() const override { return Dev; } description get_description() const override { - auto desc = transform_layer::get_description(); + auto desc = transform_layer::get_description(); std::stringstream ss; for (size_t i = 0; i < m_scaling_factors.size(); ++i) { ss << (i > 0 ? ", " : "") << m_scaling_factors[i]; @@ -68,36 +69,36 @@ class weighted_sum_layer : public transform_layer { protected: void setup_pointers() override { - transform_layer::setup_pointers(); + transform_layer::setup_pointers(); std::stringstream err; - if (get_num_parents() < 1) { - err << get_type() << " layer \"" << get_name() << "\" " + if (this->get_num_parents() < 1) { + err << get_type() << " layer \"" << this->get_name() << "\" " << "has no parent layers"; LBANN_ERROR(err.str()); } - if ((int) m_scaling_factors.size() != get_num_parents()) { - err << get_type() << " layer \"" << get_name() << "\" " + if ((int) m_scaling_factors.size() != this->get_num_parents()) { + err << get_type() << " layer \"" << this->get_name() << "\" " << "has an invalid number of scaling factors " << "(found " << m_scaling_factors.size() << ", " - << "but there are " << get_num_parents() << " parent layers)"; + << "but there are " << this->get_num_parents() << " parent layers)"; LBANN_ERROR(err.str()); } } void setup_dims() override { - transform_layer::setup_dims(); - set_output_dims(get_input_dims()); + transform_layer::setup_dims(); + this->set_output_dims(this->get_input_dims()); // Check that input dimensions match - const auto& output_dims = get_output_dims(); - for (int i = 0; i < get_num_parents(); ++i) { - if (get_input_dims(i) != output_dims) { - const auto& parents = get_parent_layers(); + const auto& output_dims = this->get_output_dims(); + for (int i = 0; i < this->get_num_parents(); ++i) { + if (this->get_input_dims(i) != output_dims) { + const auto& parents = this->get_parent_layers(); std::stringstream err; - err << get_type() << " layer \"" << get_name() << "\" " + err << get_type() << " layer \"" << this->get_name() << "\" " << "has input tensors with incompatible dimensions ("; - for (int j = 0; j < get_num_parents(); ++j) { - const auto& dims = get_input_dims(j); + for (int j = 0; j < this->get_num_parents(); ++j) { + const auto& dims = this->get_input_dims(j); err << (j > 0 ? 
", " : "") << "layer \"" << parents[j]->get_name() << "\" outputs "; for (size_t k = 0; k < dims.size(); ++k) { @@ -112,17 +113,17 @@ class weighted_sum_layer : public transform_layer { } void fp_compute() override { - auto& output = get_activations(); + auto& output = this->get_activations(); El::Zero(output); - for (int i = 0; i < get_num_parents(); ++i) { - El::Axpy(m_scaling_factors[i], get_prev_activations(i), output); + for (int i = 0; i < this->get_num_parents(); ++i) { + El::Axpy(m_scaling_factors[i], this->get_prev_activations(i), output); } } void bp_compute() override { - const auto& gradient_wrt_output = get_prev_error_signals(); - for (int i = 0; i < get_num_parents(); ++i) { - auto& gradient_wrt_input = get_error_signals(i); + const auto& gradient_wrt_output = this->get_prev_error_signals(); + for (int i = 0; i < this->get_num_parents(); ++i) { + auto& gradient_wrt_input = this->get_error_signals(i); El::Zero(gradient_wrt_input); El::Axpy(m_scaling_factors[i], gradient_wrt_output, gradient_wrt_input); @@ -133,14 +134,14 @@ class weighted_sum_layer : public transform_layer { #ifndef LBANN_WEIGHTED_SUM_LAYER_INSTANTIATE extern template class weighted_sum_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class weighted_sum_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class weighted_sum_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class weighted_sum_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_WEIGHTED_SUM_LAYER_INSTANTIATE diff --git a/include/lbann/layers/transform/weights.hpp b/include/lbann/layers/transform/weights.hpp index 940ccb28c20..c0932b3fa7e 100644 --- a/include/lbann/layers/transform/weights.hpp +++ b/include/lbann/layers/transform/weights.hpp @@ -36,27 +36,54 @@ namespace lbann { * * Interfaces with a @c weights object and outputs its tensor. */ -template -class weights_layer : public transform_layer { +class weights_layer : public transform_layer { + +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The local tensor type expected in this object. */ + using AbsMatrixType = El::AbstractMatrix; + + /** @brief The device-specific local tensor type. */ + using CPUMatType = El::Matrix; + +#ifdef LBANN_HAS_GPU + /** @brief The GPU device-specific local tensor type. */ + using GPUMatType = El::Matrix; +#endif + + /** @brief The concrete optimizer type used by this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} public: weights_layer(lbann_comm *comm, std::vector dims) - : transform_layer(comm) { + : transform_layer(comm) { std::vector dims_; for (const auto& d : dims) { dims_.push_back(d); } - set_output_dims(dims_); + this->set_output_dims(dims_); this->m_expected_num_parent_layers = 0; } weights_layer(const weights_layer& other) - : transform_layer(other), + : transform_layer(other), m_gradient(other.m_gradient ? 
other.m_gradient->Copy() : nullptr) { if (other.m_workspace) { switch (other.m_workspace->GetDevice()) { - case El::Device::CPU: m_workspace.reset(new CPUMat()); break; + case El::Device::CPU: m_workspace.reset(new CPUMatType); break; #ifdef LBANN_HAS_GPU - case El::Device::GPU: m_workspace.reset(new GPUMat()); break; + case El::Device::GPU: m_workspace.reset(new GPUMatType); break; #endif // LBANN_HAS_GPU default: LBANN_ERROR("unknown device type"); } @@ -64,14 +91,14 @@ class weights_layer : public transform_layer { } } weights_layer& operator=(const weights_layer& other){ - transform_layer::operator=(other); + transform_layer::operator=(other); m_gradient.reset(other.m_gradient ? other.m_gradient->Copy() : nullptr); m_workspace.reset(); if (other.m_workspace) { switch (other.m_workspace->GetDevice()) { - case El::Device::CPU: m_workspace.reset(new CPUMat()); break; + case El::Device::CPU: m_workspace.reset(new CPUMatType); break; #ifdef LBANN_HAS_GPU - case El::Device::GPU: m_workspace.reset(new GPUMat()); break; + case El::Device::GPU: m_workspace.reset(new GPUMatType); break; #endif // LBANN_HAS_GPU default: LBANN_ERROR("unknown device type"); } @@ -87,19 +114,19 @@ class weights_layer : public transform_layer { protected: void setup_matrices(const El::Grid& grid) override { - transform_layer::setup_matrices(grid); + transform_layer::setup_matrices(grid); // Initialize weights gradient - auto dist = get_activations().DistData(); + auto dist = this->get_activations().DistData(); dist.rowDist = El::STAR; - m_gradient.reset(AbsDistMat::Instantiate(dist)); + m_gradient.reset(AbsDistMatrixType::Instantiate(dist)); // Initialize workspace switch (Dev) { - case El::Device::CPU: m_workspace.reset(new CPUMat()); break; + case El::Device::CPU: m_workspace.reset(new CPUMatType); break; #ifdef LBANN_HAS_GPU case El::Device::GPU: - m_workspace.reset(new GPUMat()); + m_workspace.reset(new GPUMatType); #ifdef HYDROGEN_HAVE_CUB m_workspace->SetMemoryMode(1); // Use CUB GPU memory pool if possible #endif // HYDROGEN_HAVE_CUB @@ -111,41 +138,42 @@ class weights_layer : public transform_layer { } void setup_data() override { - transform_layer::setup_data(); + transform_layer::setup_data(); // Initialize default weights if none are provided - if (this->m_weights.empty()) { - auto w = make_unique(get_comm()); - auto init = make_unique(DataType(0)); - std::unique_ptr opt(m_model->create_optimizer()); - w->set_name(get_name() + "_weights"); + if (!this->has_weights()) { + auto w = make_unique(this->get_comm()); + auto init = make_unique>(DataType(0)); + auto opt = to_unique_ptr(dynamic_cast( + this->m_model->create_optimizer())); + w->set_name(this->get_name() + "_weights"); w->set_initializer(std::move(init)); w->set_optimizer(std::move(opt)); - this->m_weights.push_back(w.get()); + this->add_weights(w.get()); this->m_model->add_weights(std::move(w)); } - if (this->m_weights.size() != 1) { + if (this->num_weights() != 1) { LBANN_ERROR("attempted to setup ", - get_type()," layer \"",get_name(),"\" ", + this->get_type()," layer \"",this->get_name(),"\" ", "with an invalid number of weights ", "(expected at most 1, ", - "but found ",this->m_weights.size(),")"); + "but found ",this->num_weights(),")"); } // Setup weights and weights gradient - m_gradient->AlignWith(get_activations()); - m_gradient->Resize(get_output_size(), 1); - m_weights[0]->set_dims(get_output_dims()); - m_weights[0]->set_matrix_distribution(m_gradient->DistData()); + m_gradient->AlignWith(this->get_activations()); + 
m_gradient->Resize(this->get_output_size(), 1); + this->get_data_type_weights(0).set_dims(this->get_output_dims()); + this->get_data_type_weights(0).set_matrix_distribution(m_gradient->DistData()); // Initialize freeze state - if (this->m_frozen) { m_weights[0]->freeze(); } - else { m_weights[0]->unfreeze(); } - if (m_weights[0]->is_frozen() != this->m_frozen) { - LBANN_ERROR((m_frozen ? "" : "un"),"frozen ", - "layer \"",get_name(),"\" has ", - (m_weights[0]->is_frozen() ? "" : "un"),"frozen ", - "weights \"",m_weights[0]->get_name(),"\""); + if (this->m_frozen) { this->get_data_type_weights(0).freeze(); } + else { this->get_data_type_weights(0).unfreeze(); } + if (this->get_data_type_weights(0).is_frozen() != this->m_frozen) { + LBANN_ERROR((this->m_frozen ? "" : "un"),"frozen ", + "layer \"",this->get_name(),"\" has ", + (this->get_data_type_weights(0).is_frozen() ? "" : "un"),"frozen ", + "weights \"",this->get_data_type_weights(0).get_name(),"\""); } } @@ -153,15 +181,15 @@ class weights_layer : public transform_layer { void fp_compute() override { // Matrices - const auto& local_weights = m_weights[0]->get_values().LockedMatrix(); - auto& local_output = get_local_activations(); + const auto& local_weights = this->get_data_type_weights(0).get_values().LockedMatrix(); + auto& local_output = this->get_local_activations(); m_workspace->Resize(local_output.Width(), 1); - El::Fill(*m_workspace, DataType(1)); + El::Fill(*m_workspace, TensorDataType(1)); // Duplicate weights across matrix columns El::Gemm(El::NORMAL, El::TRANSPOSE, - DataType(1), local_weights, *m_workspace, - DataType(0), local_output); + TensorDataType(1), local_weights, *m_workspace, + TensorDataType(0), local_output); // Clean up m_workspace->Empty(); @@ -172,11 +200,11 @@ class weights_layer : public transform_layer { // Get optimizer // Note: Nothing needs to be done if there is no optimizer - auto* opt = this->m_weights[0]->get_optimizer(); + auto* opt = this->get_data_type_weights(0).get_optimizer(); if (opt == nullptr) { return; } // Matrices - const auto& local_gradient_wrt_output = get_local_prev_error_signals(); + const auto& local_gradient_wrt_output = this->get_local_prev_error_signals(); m_workspace->Resize(local_gradient_wrt_output.Width(), 1); El::Fill(*m_workspace, DataType{1}); @@ -193,22 +221,22 @@ class weights_layer : public transform_layer { private: /** Weights gradient. */ - std::unique_ptr m_gradient; + std::unique_ptr m_gradient; /** Workspace. 
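The fp_compute just above uses a standard broadcast trick: the workspace is filled with ones, and output = 1 * w * ones^T + 0 * output copies the weights vector into every mini-batch column in a single Gemm. A plain column-major sketch of the same operation:

#include <cstddef>
#include <vector>

// Duplicate a weights column vector w (height entries) across `width`
// columns of a column-major output buffer, as the Gemm with a ones
// vector does in fp_compute.
void broadcast_weights(const std::vector<float>& w,
                       int width,
                       std::vector<float>& output) {
  const int height = static_cast<int>(w.size());
  output.assign(static_cast<std::size_t>(height) * width, 0.0f);
  for (int col = 0; col < width; ++col) {
    for (int row = 0; row < height; ++row) {
      output[static_cast<std::size_t>(col) * height + row] = w[row];
    }
  }
}

The backward pass runs the same ones vector the other way: multiplying the gradient with respect to the output by ones sums each row over the mini-batch, which is exactly the gradient of a broadcast.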
*/ - std::unique_ptr m_workspace; + std::unique_ptr m_workspace; }; #ifndef LBANN_WEIGHTS_LAYER_INSTANTIATE extern template class weights_layer< - data_layout::DATA_PARALLEL, El::Device::CPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::CPU>; extern template class weights_layer< - data_layout::MODEL_PARALLEL, El::Device::CPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::CPU>; #ifdef LBANN_HAS_GPU extern template class weights_layer< - data_layout::DATA_PARALLEL, El::Device::GPU>; + DataType, data_layout::DATA_PARALLEL, El::Device::GPU>; extern template class weights_layer< - data_layout::MODEL_PARALLEL, El::Device::GPU>; + DataType, data_layout::MODEL_PARALLEL, El::Device::GPU>; #endif // LBANN_HAS_GPU #endif // LBANN_WEIGHTS_LAYER_INSTANTIATE diff --git a/include/lbann/metrics/layer_metric.hpp b/include/lbann/metrics/layer_metric.hpp index f0f9c811504..42f789c9152 100644 --- a/include/lbann/metrics/layer_metric.hpp +++ b/include/lbann/metrics/layer_metric.hpp @@ -86,7 +86,7 @@ class layer_metric : public metric { Layer* m_layer; /** Get corresponding evaluation layer. */ - abstract_evaluation_layer& get_evaluation_layer(); + /*abstract_evaluation_*/Layer& get_evaluation_layer(); }; diff --git a/include/lbann/objective_functions/layer_term.hpp b/include/lbann/objective_functions/layer_term.hpp index 7a3622537fe..7d8aa1508a5 100644 --- a/include/lbann/objective_functions/layer_term.hpp +++ b/include/lbann/objective_functions/layer_term.hpp @@ -58,7 +58,7 @@ class layer_term : public objective_function_term { private: /** Get corresponding evaluation layer. */ - abstract_evaluation_layer& get_evaluation_layer(); + /*abstract_evaluation_*/Layer& get_evaluation_layer(); }; diff --git a/include/lbann/objective_functions/weight_regularization/l2.hpp b/include/lbann/objective_functions/weight_regularization/l2.hpp index d8ef6fa47c0..37f6ce513dc 100644 --- a/include/lbann/objective_functions/weight_regularization/l2.hpp +++ b/include/lbann/objective_functions/weight_regularization/l2.hpp @@ -31,6 +31,9 @@ namespace lbann { +template class data_type_optimizer; +template class data_type_weights; + /** @class l2_weight_regularization * @brief Apply L2 regularization to a set of weights. * @@ -40,7 +43,18 @@ namespace lbann { */ class l2_weight_regularization : public objective_function_term { public: + using AccumulateDataType = DataType; + + using OptimizerType = data_type_optimizer; + + using WeightsType = data_type_weights; + template + using DMatType = El::Matrix; + + using CPUMatType = DMatType; + +public: /** @param scale_factor The objective function term is * @f$ \text{scale\_factor} \times \sum L2(w_i) @f$ */ @@ -69,7 +83,7 @@ class l2_weight_regularization : public objective_function_term { private: /** Contributions to evaluated value. */ - std::map m_contributions; + std::map m_contributions; /** For non-blocking allreduces. */ Al::request m_allreduce_req; @@ -85,8 +99,8 @@ class l2_weight_regularization : public objective_function_term { * accumulation variable. 
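The term described above evaluates scale_factor times the sum of the L2 penalties of every weights object, with one local accumulation per weights object and a non-blocking allreduce across ranks. A minimal local sketch, which leaves open whether a factor of 1/2 is folded into the convention:

#include <vector>

// Local part of an L2 weight-regularization term: one accumulation
// variable per weights object, scaled once at the end. The cross-rank
// allreduce of `total` is omitted.
float l2_term(const std::vector<std::vector<float>>& weights_values,
              float scale_factor) {
  float total = 0.0f;
  for (const auto& w : weights_values) {
    float contribution = 0.0f;
    for (float v : w) { contribution += v * v; }
    total += contribution;
  }
  return scale_factor * total;
}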
*/ template - static void accumulate_contribution(const DMat& vals, - DMat& contribution); + static void accumulate_contribution(const DMatType& vals, + DMatType& contribution); }; diff --git a/include/lbann/optimizers/CMakeLists.txt b/include/lbann/optimizers/CMakeLists.txt index 877cc8bb815..3147d074337 100644 --- a/include/lbann/optimizers/CMakeLists.txt +++ b/include/lbann/optimizers/CMakeLists.txt @@ -2,6 +2,7 @@ set_full_path(THIS_DIR_HEADERS adagrad.hpp adam.hpp + data_type_optimizer.hpp hypergradient_adam.hpp optimizer.hpp rmsprop.hpp diff --git a/include/lbann/optimizers/adagrad.hpp b/include/lbann/optimizers/adagrad.hpp index 53bce3ab69d..75b99cbe199 100644 --- a/include/lbann/optimizers/adagrad.hpp +++ b/include/lbann/optimizers/adagrad.hpp @@ -27,7 +27,9 @@ #ifndef LBANN_OPTIMIZERS_ADAGRAD_HPP_INCLUDED #define LBANN_OPTIMIZERS_ADAGRAD_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -39,10 +41,26 @@ namespace lbann { * methods for online learning and stochastic optimization." Journal * of Machine Learning Research 12, no. Jul (2011): 2121-2159. */ -class adagrad : public optimizer { +template +class adagrad : public data_type_optimizer { public: + /** @name Public Types */ + ///@{ - adagrad(DataType learning_rate, DataType eps = 1e-8); + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} + +public: + + adagrad(TensorDataType learning_rate, TensorDataType eps = 1e-8); adagrad(const adagrad& other); adagrad& operator=(const adagrad& other); ~adagrad() override = default; @@ -53,25 +71,25 @@ class adagrad : public optimizer { /** Human-readable description. */ description get_description() const override; - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; protected: /** Computation for an optimization step. */ - void step_compute(AbsDistMat& values, const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, const AbsDistMatrixType& gradient) override; private: /** Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** AdaGrad cache. */ - std::unique_ptr m_cache; + std::unique_ptr m_cache; /** CPU implementation of optimization step. */ - void step_compute_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #ifdef LBANN_HAS_CUDNN /** GPU implementation of optimization step. 
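For reference, the AdaGrad rule from the cited Duchi et al. paper, which the m_cache and m_eps members above exist to support, keeps a running sum of squared gradients per parameter. A sketch of the standard update (the exact form inside step_compute_cpu may differ in details):

#include <cmath>
#include <cstddef>
#include <vector>

// Standard AdaGrad: cache += g^2; w -= lr * g / (sqrt(cache) + eps).
void adagrad_step(std::vector<float>& values,
                  const std::vector<float>& gradient,
                  std::vector<float>& cache,
                  float learning_rate,
                  float eps = 1e-8f) {
  for (std::size_t i = 0; i < values.size(); ++i) {
    cache[i] += gradient[i] * gradient[i];
    values[i] -= learning_rate * gradient[i] / (std::sqrt(cache[i]) + eps);
  }
}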
*/ - void step_compute_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #endif // LBANN_HAS_CUDNN // =========================================== diff --git a/include/lbann/optimizers/adam.hpp b/include/lbann/optimizers/adam.hpp index b2c4884df74..baff4b781f2 100644 --- a/include/lbann/optimizers/adam.hpp +++ b/include/lbann/optimizers/adam.hpp @@ -27,7 +27,9 @@ #ifndef LBANN_OPTIMIZERS_ADAM_HPP_INCLUDED #define LBANN_OPTIMIZERS_ADAM_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include namespace lbann { namespace callback { @@ -41,16 +43,32 @@ class perturb_adam; * Diederik P. Kingma and Jimmy Ba. "Adam: A method for stochastic * optimization." arXiv preprint arXiv:1412.6980 (2014). */ -class adam : public optimizer { +template +class adam : public data_type_optimizer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} + public: /** @name Life cycle functions */ ///@{ - adam(DataType learning_rate, - DataType beta1 = 0.9, - DataType beta2 = 0.99, - DataType eps = 1e-8); + adam(TensorDataType learning_rate, + TensorDataType beta1 = 0.9, + TensorDataType beta2 = 0.99, + TensorDataType eps = 1e-8); adam(const adam& other); adam& operator=(const adam& other); ~adam() = default; @@ -72,84 +90,84 @@ class adam : public optimizer { ///@{ /** Update factor for first moment estimate. */ - DataType get_beta1() const noexcept { return m_beta1; } + TensorDataType get_beta1() const noexcept { return m_beta1; } /** Update factor for first moment estimate. */ - void set_beta1(DataType beta1) { m_beta1 = beta1; } + void set_beta1(TensorDataType beta1) { m_beta1 = beta1; } /** Update factor for second moment estimate. */ - DataType get_beta2() const noexcept { return m_beta2; } + TensorDataType get_beta2() const noexcept { return m_beta2; } /** Update factor for second moment estimate. */ - void set_beta2(DataType beta2) { m_beta2 = beta2; } + void set_beta2(TensorDataType beta2) { m_beta2 = beta2; } /** Small factor to avoid division by zero. */ - DataType get_eps() const noexcept { return m_eps; } + TensorDataType get_eps() const noexcept { return m_eps; } /** Small factor to avoid division by zero. */ - void set_eps(DataType eps) { m_eps = eps; } + void set_eps(TensorDataType eps) { m_eps = eps; } /** First moment estimates. */ - const AbsDistMat& get_moment1() const; + const AbsDistMatrixType& get_moment1() const; /** First moment estimates. */ - AbsDistMat& get_moment1(); + AbsDistMatrixType& get_moment1(); /** Second moment estimates. */ - const AbsDistMat& get_moment2() const; + const AbsDistMatrixType& get_moment2() const; /** Second moment estimates. */ - AbsDistMat& get_moment2(); + AbsDistMatrixType& get_moment2(); /** beta1 ^ iteration. * @todo This probably shouldn't be exposed. */ - DataType get_current_beta1() const noexcept { return m_current_beta1; } + TensorDataType get_current_beta1() const noexcept { return m_current_beta1; } /** beta1 ^ iteration. * @todo This probably shouldn't be exposed. 
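The m_current_beta1 and m_current_beta2 members exposed above cache beta1^t and beta2^t, so the Adam bias corrections never need the raw iteration count. A sketch of the standard update written against that state (the exact arrangement inside step_compute may differ):

#include <cmath>
#include <cstddef>
#include <vector>

// Adam with cached beta^t factors: advance the caches, fold both bias
// corrections into one step size, then apply the moment updates.
void adam_step(std::vector<float>& values,
               const std::vector<float>& gradient,
               std::vector<float>& moment1,
               std::vector<float>& moment2,
               float lr, float beta1, float beta2, float eps,
               float& current_beta1, float& current_beta2) {
  current_beta1 *= beta1;  // beta1 ^ iteration
  current_beta2 *= beta2;  // beta2 ^ iteration
  const float step = lr * std::sqrt(1.0f - current_beta2)
                        / (1.0f - current_beta1);
  for (std::size_t i = 0; i < values.size(); ++i) {
    const float g = gradient[i];
    moment1[i] = beta1 * moment1[i] + (1.0f - beta1) * g;
    moment2[i] = beta2 * moment2[i] + (1.0f - beta2) * g * g;
    values[i] -= step * moment1[i] / (std::sqrt(moment2[i]) + eps);
  }
}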
*/ - void set_current_beta1(DataType current_beta1) { m_current_beta1 = current_beta1; } + void set_current_beta1(TensorDataType current_beta1) { m_current_beta1 = current_beta1; } /** beta2 ^ iteration. * @todo This probably shouldn't be exposed. */ - DataType get_current_beta2() const noexcept { return m_current_beta2; } + TensorDataType get_current_beta2() const noexcept { return m_current_beta2; } /** beta2 ^ iteration. * @todo This probably shouldn't be exposed. */ - void set_current_beta2(DataType current_beta2) { m_current_beta2 = current_beta2; } + void set_current_beta2(TensorDataType current_beta2) { m_current_beta2 = current_beta2; } ///@} /** @name Setup */ ///@{ - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; ///@} protected: /** Computation for an optimization step. */ - void step_compute(AbsDistMat& values, - const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, + const AbsDistMatrixType& gradient) override; private: /** Update factor for first moment estimate. */ - DataType m_beta1; + TensorDataType m_beta1; /** Update factor for second moment estimate. */ - DataType m_beta2; + TensorDataType m_beta2; /** Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** beta1 ^ iteration. */ - DataType m_current_beta1 = 1; + TensorDataType m_current_beta1 = 1; /** beta2 ^ iteration. */ - DataType m_current_beta2 = 1; + TensorDataType m_current_beta2 = 1; /** First moment estimates. */ - std::unique_ptr m_moment1; + std::unique_ptr m_moment1; /** Second moment estimates. */ - std::unique_ptr m_moment2; + std::unique_ptr m_moment2; /** Hyperparameter exploration. */ friend class callback::perturb_adam; /** CPU implementation of optimization step. */ - void step_compute_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #ifdef LBANN_HAS_CUDA /** GPU implementation of optimization step. */ - void step_compute_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #endif // LBANN_HAS_CUDA /** @name Checkpointing */ @@ -157,11 +175,11 @@ class adam : public optimizer { /* struct used to serialize mode fields in file and MPI transfer */ struct packing_header { - DataType beta1; - DataType beta2; - DataType eps; - DataType current_beta1; - DataType current_beta2; + TensorDataType beta1; + TensorDataType beta2; + TensorDataType eps; + TensorDataType current_beta1; + TensorDataType current_beta2; }; bool pack_scalars(persist& p) { diff --git a/include/lbann/optimizers/data_type_optimizer.hpp b/include/lbann/optimizers/data_type_optimizer.hpp new file mode 100644 index 00000000000..48668c401a4 --- /dev/null +++ b/include/lbann/optimizers/data_type_optimizer.hpp @@ -0,0 +1,217 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2019, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. 
+// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_OPTIMIZERS_DATA_TYPE_OPTIMIZER_HPP_INCLUDED +#define LBANN_OPTIMIZERS_DATA_TYPE_OPTIMIZER_HPP_INCLUDED + +#include "lbann/optimizers/optimizer.hpp" + +namespace lbann { + +// Forward declarations +template +class data_type_weights; + +template +class data_type_optimizer : public optimizer { + friend class data_type_weights; + +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} + +public: + data_type_optimizer(TensorDataType learning_rate = 0); + data_type_optimizer(const data_type_optimizer& other); + data_type_optimizer& operator=(const data_type_optimizer& other); + virtual ~data_type_optimizer() = default; + + /** @brief Create a copy of the class instance. + * + * The caller is responsible for deallocating the returned object. + */ + virtual data_type_optimizer* copy() const override = 0; + + /** @brief Human-readable description. */ + virtual description get_description() const override; + + /** @brief Weights being optimized. */ + data_type_weights& get_weights(); + /** @brief Weights being optimized. */ + const data_type_weights& get_weights() const; + /** @brief Weights being optimized. */ + void set_weights(data_type_weights* w) { m_weights = w; } + + /** @brief Objective function gradient w.r.t. the weights. + * + * An allreduce may be launched and/or synchronized if needed. + */ + AbsDistMatrixType& get_gradient(); + + /** @brief Add to the objective function gradient w.r.t. the weights. + * @param gradient Contribution to gradient. + * @param scale Scaling factor for gradient + * contribution. + * @param allreduce_needed Whether the gradient contribution + * requires an allreduce over its redundant + * communicator. If false, duplicated data + * (over the redundant communicator) is + * assumed to be identical. If true, an + * allreduce is performed lazily when the + * gradient is accessed. + */ + void add_to_gradient(const AbsDistMatrixType& gradient, + TensorDataType scale = TensorDataType(1), + bool allreduce_needed = false); + /** @brief Zero out the objective function gradient w.r.t. the weights. */ + void clear_gradient() override; + /** @brief Get the gradient buffer. + * + * This provides access to the underlying gradient buffer, which may be + * directly summed into. This buffer should be considered ephemeral and not + * stored. The caller must also ensure the buffer has an appropriate + * distribution. buf_scale provides the caller with a scale factor that must + * be applied to the gradient buffer before writing to it, and in_scale + * provides a scaling factor that must be applied to the user's data. 
+ * Essentially, this enables computations of the form + * gradient = buf_scale*gradient + in_scale*new_gradient + * This is an expert-mode function and is intended to help eliminate copies + * and facilitate kernel fusion. + * + * @param buf_scale A scale factor provided to the caller to scale the + * returned buffer by. + * @param in_scale A scale factor provided to the caller to scale their + * gradient contributions by. + * @param allreduce_needed Whether this gradient contribution will need to + * be allreduced. + */ + AbsDistMatrixType& get_gradient_buffer(TensorDataType& buf_scale, + TensorDataType& in_scale, + bool allreduce_needed = false); + + /** @brief Must be called before training. + * + * @param w Weights being optimized. If null, no change is made to + * the weights. + */ + virtual void setup(data_type_weights* w = nullptr); + + /** @brief Unregister a gradient source. + * + * When an object adds its contribution to the objective function + * gradient during back prop, it should unregister itself. If there + * are no more gradient sources remaining, a non-blocking allreduce + * will be launched on the gradient, if needed. + */ + void remove_gradient_source(const void* source) override; + + /** @brief Optimization step. */ + void step() override; + + /** @brief Scaling factor for optimization step sizes. */ + TensorDataType get_learning_rate() const; + /** @brief Scaling factor for optimization step sizes. */ + void set_learning_rate(TensorDataType learning_rate); + +protected: + + /** @brief Computation for an optimization step. + * + * @c values and @c gradient can be assumed to have the same + * distribution. + */ + virtual void step_compute(AbsDistMatrixType& values, + const AbsDistMatrixType& gradient) = 0; + +private: + + /** @brief Weights being optimized. */ + data_type_weights* m_weights = nullptr; + + /** @brief Objective function gradient w.r.t. weights. */ + std::unique_ptr m_gradient; + + /** @brief Workspace matrix. + * + * Helps ensure gradient contributions are in the right + * distribution. Most of the time, this should just be a matrix + * view. + */ + std::unique_ptr m_gradient_v; + + /** @brief Communication request object for gradient allreduce. + * + * Used to synchronize non-blocking allreduce. + */ + Al::request m_gradient_allreduce_req; + + /** @brief Scaling factor for optimization step sizes. + * + * This is not used by the base optimizer class, but is currently + * used by all derived optimizer classes. There are several cases + * where it is convenient to expose this in the base class, + * e.g. for variable learning rate schedules. + * @todo Consider moving this to the derived classes. + */ + TensorDataType m_learning_rate; + + /** @brief Launch non-blocking allreduce on the gradient, if needed. + * + * Does nothing if an allreduce is not needed or has already been + * started. + */ + void start_gradient_allreduce(); + + /** @brief Synchronize non-blocking allreduce on the gradient, if needed. + * + * Does nothing if an allreduce isn't needed. Throws an exception + * if an allreduce is needed but hasn't been started. 
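The get_gradient_buffer contract documented above puts both scale factors on the caller: the existing buffer must be scaled by buf_scale before the caller sums in its own contribution scaled by in_scale, yielding gradient = buf_scale*gradient + in_scale*new_gradient. A caller-side sketch of that arithmetic on plain buffers (the real method hands back distributed matrices):

#include <cstddef>
#include <vector>

// Fused form of the documented contract
//   gradient = buf_scale * gradient + in_scale * new_gradient
// applied to a raw buffer obtained from the optimizer.
void add_contribution(std::vector<float>& buffer,
                      const std::vector<float>& my_grad,
                      float buf_scale, float in_scale) {
  for (std::size_t i = 0; i < buffer.size(); ++i) {
    buffer[i] = buf_scale * buffer[i] + in_scale * my_grad[i];
  }
}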
+ */ + void finish_gradient_allreduce(); + +public: + + // =========================================== + // Checkpointing + // =========================================== + bool save_to_checkpoint_shared(persist& p, std::string m_name) override; + bool load_from_checkpoint_shared(persist& p, std::string m_name) override; + bool save_to_checkpoint_distributed(persist& p, std::string m_name) override; + bool load_from_checkpoint_distributed(persist& p, std::string m_name) override; + +}; + +} // namespace lbann + +#endif // LBANN_OPTIMIZERS_DATA_TYPE_OPTIMIZER_HPP_INCLUDED diff --git a/include/lbann/optimizers/hypergradient_adam.hpp b/include/lbann/optimizers/hypergradient_adam.hpp index 0936548cdf8..c63ed9404aa 100644 --- a/include/lbann/optimizers/hypergradient_adam.hpp +++ b/include/lbann/optimizers/hypergradient_adam.hpp @@ -27,7 +27,9 @@ #ifndef LBANN_OPTIMIZERS_HYPERGRADIENT_ADAM_HPP_INCLUDED #define LBANN_OPTIMIZERS_HYPERGRADIENT_ADAM_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -39,7 +41,23 @@ namespace lbann { * Baydin et al. "Online Learning Rate Adaptation with Hypergradient * Descent", 2017. */ -class hypergradient_adam : public optimizer { +template +class hypergradient_adam : public data_type_optimizer { +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + /** @brief The base optimizer type for this class. */ + using OptimizerType = data_type_optimizer; + + ///@} + public: /** @brief Construct a Hypergradient Adam optimizer object @@ -55,11 +73,11 @@ class hypergradient_adam : public optimizer { * @param eps Small factor to avoid division by * zero. */ - hypergradient_adam(DataType init_learning_rate = 1e-3, - DataType hyper_learning_rate = 1e-7, - DataType beta1 = 0.9, - DataType beta2 = 0.99, - DataType eps = 1e-8); + hypergradient_adam(TensorDataType init_learning_rate = 1e-3, + TensorDataType hyper_learning_rate = 1e-7, + TensorDataType beta1 = 0.9, + TensorDataType beta2 = 0.99, + TensorDataType eps = 1e-8); hypergradient_adam(const hypergradient_adam& other); hypergradient_adam& operator=(const hypergradient_adam& other); ~hypergradient_adam() override = default; @@ -70,33 +88,33 @@ class hypergradient_adam : public optimizer { /** @brief Human-readable description. */ description get_description() const override; - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; protected: /** @brief Computation for an optimization step. */ - void step_compute(AbsDistMat& values, const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, const AbsDistMatrixType& gradient) override; private: /** @brief Hypergradient learning rate. */ - DataType m_hyper_learning_rate; + TensorDataType m_hyper_learning_rate; /** @brief Update factor for first moment estimate. */ - DataType m_beta1; + TensorDataType m_beta1; /** @brief Update factor for second moment estimate. */ - DataType m_beta2; + TensorDataType m_beta2; /** @brief Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** @brief beta1 ^ iteration. */ - DataType m_current_beta1; + TensorDataType m_current_beta1; /** @brief beta2 ^ iteration. 
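The m_old_gradient member below stores the previous step's gradient because hypergradient descent (Baydin et al., cited above) adapts the learning rate by how strongly successive gradients agree. A sketch of the rate update in its plain-SGD flavor; the class applies the idea to Adam, where the dot product is taken against the previous update direction:

#include <cstddef>
#include <vector>

// Hypergradient learning-rate update: grow the rate when consecutive
// gradients point the same way, shrink it when they oppose.
float hypergradient_rate(float learning_rate,
                         float hyper_learning_rate,
                         const std::vector<float>& gradient,
                         const std::vector<float>& old_gradient) {
  float dot = 0.0f;
  for (std::size_t i = 0; i < gradient.size(); ++i) {
    dot += gradient[i] * old_gradient[i];
  }
  return learning_rate + hyper_learning_rate * dot;
}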
*/ - DataType m_current_beta2; + TensorDataType m_current_beta2; /** @brief First moment estimates. */ - std::unique_ptr m_moment1; + std::unique_ptr m_moment1; /** @brief Second moment estimates. */ - std::unique_ptr m_moment2; + std::unique_ptr m_moment2; /** @brief Gradient estimate from the prior step (for hypergradient). */ - std::unique_ptr m_old_gradient; + std::unique_ptr m_old_gradient; // =========================================== // Checkpointing @@ -106,12 +124,12 @@ class hypergradient_adam : public optimizer { * @brief Used to serialize mode fields in file and MPI transfer */ struct packing_header { - DataType hyper_learning_rate; - DataType beta1; - DataType beta2; - DataType eps; - DataType current_beta1; - DataType current_beta2; + TensorDataType hyper_learning_rate; + TensorDataType beta1; + TensorDataType beta2; + TensorDataType eps; + TensorDataType current_beta1; + TensorDataType current_beta2; }; bool pack_scalars(persist& p) { diff --git a/include/lbann/optimizers/optimizer.hpp b/include/lbann/optimizers/optimizer.hpp index 46be0fe0e85..37c318f6f30 100644 --- a/include/lbann/optimizers/optimizer.hpp +++ b/include/lbann/optimizers/optimizer.hpp @@ -35,7 +35,6 @@ #include "lbann/comm.hpp" #include "lbann/utils/exception.hpp" #include "lbann/utils/description.hpp" -#include "lbann/weights/weights.hpp" #ifdef LBANN_HAS_GPU #include "lbann/utils/cuda.hpp" #endif // LBANN_HAS_GPU @@ -63,7 +62,6 @@ enum class optimizer_gradient_status { std::string to_string(optimizer_gradient_status status); // Forward declarations -class weights; class persist; /** @brief Abstract base class for gradient-based optimization algorithms. @@ -77,7 +75,7 @@ class persist; class optimizer { public: - optimizer(DataType learning_rate = 0); + optimizer(); optimizer(const optimizer& other); optimizer& operator=(const optimizer& other); virtual ~optimizer() = default; @@ -93,59 +91,8 @@ class optimizer { /** @brief Human-readable description. */ virtual description get_description() const; - /** @brief Weights being optimized. */ - weights& get_weights(); - /** @brief Weights being optimized. */ - const weights& get_weights() const; - /** @brief Weights being optimized. */ - void set_weights(weights* w) { m_weights = w; } - - /** @brief Objective function gradient w.r.t. the weights. - * - * An allreduce may be launched and/or synchronized if needed. - */ - AbsDistMat& get_gradient(); - - /** @brief Add to the objective function gradient w.r.t. the weights. - * @param gradient Contribution to gradient. - * @param scale Scaling factor for gradient - * contribution. - * @param allreduce_needed Whether the gradient contribution - * requires an allreduce over its redundant - * communicator. If false, duplicated data - * (over the redundant communicator) is - * assumed to be identical. If true, an - * allreduce is performed lazily when the - * gradient is accessed. - */ - void add_to_gradient(const AbsDistMat& gradient, - DataType scale = DataType(1), - bool allreduce_needed = false); /** @brief Zero out the objective function gradient w.r.t. the weights. */ - void clear_gradient(); - /** @brief Get the gradient buffer. - * - * This provides access to the underlying gradient buffer, which may be - * directly summed into. This buffer should be considered ephemeral and not - * stored. The caller must also ensure the buffer has an appropriate - * distribution. 
buf_scale provides the caller with a scale factor that must - * be applied to the gradient buffer before writing to it, and in_scale - * provides a scaling factor that must be applied to the user's data. - * Essentially, this enables computations of the form - * gradient = buf_scale*gradient + in_scale*new_gradient - * This is an expert-mode function and is intended to help eliminate copies - * and facilitate kernel fusion. - * - * @param buf_scale A scale factor provided to the caller to scale the - * returned buffer by. - * @param in_scale A scale factor provided to the caller to scale their - * gradient contributions by. - * @param allreduce_needed Whether this gradient contribution will need to - * be allreduced. - */ - AbsDistMat& get_gradient_buffer(DataType& buf_scale, - DataType& in_scale, - bool allreduce_needed = false); + virtual void clear_gradient() = 0; /** @brief Objects that are expected to contribute to the gradient. */ El::Int get_num_gradient_sources() const; @@ -164,28 +111,16 @@ class optimizer { * are no more gradient sources remaining, a non-blocking allreduce * will be launched on the gradient, if needed. */ - void remove_gradient_source(const void* source); - - /** @brief Must be called before training. - * - * @param w Weights being optimized. If null, no change is made to - * the weights. - */ - virtual void setup(weights* w = nullptr); + virtual void remove_gradient_source(const void* source); /** @brief Optimization step. */ - void step(); + virtual void step() = 0; /** @brief LBANN communicator. */ lbann_comm& get_comm() { return *m_comm; } /** @brief LBANN communicator. */ const lbann_comm& get_comm() const { return *m_comm; } - /** @brief Scaling factor for optimization step sizes. */ - DataType get_learning_rate() const; - /** @brief Scaling factor for optimization step sizes. */ - void set_learning_rate(DataType learning_rate); - /** @brief Time spent in optimization step. */ EvalType get_step_time() const { return m_step_time; } /** @brief Reset stats counters. */ @@ -193,32 +128,23 @@ class optimizer { protected: - /** @brief Computation for an optimization step. - * - * @c values and @c gradient can be assumed to have the same - * distribution. - */ - virtual void step_compute(AbsDistMat& values, - const AbsDistMat& gradient) = 0; + /** @brief Return the current gradient status */ + optimizer_gradient_status get_gradient_status() const { return m_gradient_status; } -private: + void set_gradient_status(const optimizer_gradient_status status) { m_gradient_status = status; } - /** @brief LBANN communicator. */ - lbann_comm* m_comm; + std::unordered_set& get_gradient_sources() { return m_gradient_sources; } - /** @brief Weights being optimized. */ - weights* m_weights = nullptr; + void set_comm(lbann_comm& comm) { m_comm = &comm; } - /** @brief Objective function gradient w.r.t. weights. */ - std::unique_ptr m_gradient; + void set_step_time(EvalType time) { m_step_time = time; } - /** @brief Workspace matrix. - * - * Helps ensure gradient contributions are in the right - * distribution. Most of the time, this should just be a matrix - * view. - */ - std::unique_ptr m_gradient_v; + void inc_step_time(EvalType time) { m_step_time += time; } + +private: + + /** @brief LBANN communicator. */ + lbann_comm* m_comm; /** @brief Sources of gradient contributions. * @@ -235,48 +161,18 @@ class optimizer { /** @brief Status of values in objective function gradient. 
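Taken together, these optimizer.hpp hunks split the old concrete class in two: a type-erased optimizer that keeps only the element-type-independent protocol (gradient sources, gradient status, timing, pure-virtual step and checkpointing) and the new data_type_optimizer that owns the gradient matrices and learning rate. The shape of the pattern, reduced to a sketch with hypothetical names:

#include <memory>

// Type-erased interface: what a model can hold without knowing the
// tensor element type.
struct optimizer_iface {
  virtual ~optimizer_iface() = default;
  virtual void step() = 0;
  virtual void clear_gradient() = 0;
};

// Typed implementation: owns all TensorDataType-dependent state, as
// data_type_optimizer does in the patch.
template <typename TensorDataType>
struct typed_optimizer : optimizer_iface {
  void step() override { /* consume m_gradient */ }
  void clear_gradient() override { m_gradient = TensorDataType(0); }
  TensorDataType m_gradient{};  // stand-in for the gradient matrix
};

int main() {
  std::unique_ptr<optimizer_iface> opt =
      std::make_unique<typed_optimizer<float>>();
  opt->clear_gradient();
  opt->step();
}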
*/ optimizer_gradient_status m_gradient_status = optimizer_gradient_status::cleared; - /** @brief Communication request object for gradient allreduce. - * - * Used to synchronize non-blocking allreduce. - */ - Al::request m_gradient_allreduce_req; - - /** @brief Scaling factor for optimization step sizes. - * - * This is not used by the base optimizer class, but is currently - * used by all derived optimizer classes. There are several cases - * where it is convenient to expose this in the base class, - * e.g. for variable learning rate schedules. - * @todo Consider moving this to the derived classes. - */ - DataType m_learning_rate; - /** @brief Time spent in optimization step. */ EvalType m_step_time = 0; - /** @brief Launch non-blocking allreduce on the gradient, if needed. - * - * Does nothing if an allreduce is not needed or has already been - * started. - */ - void start_gradient_allreduce(); - - /** @brief Synchronize non-blocking allreduce on the gradient, if needed. - * - * Does nothing if an allreduce isn't needed. Throws an exception - * if an allreduce is needed but hasn't been started. - */ - void finish_gradient_allreduce(); - public: // =========================================== // Checkpointing // =========================================== - virtual bool save_to_checkpoint_shared(persist& p, std::string m_name); - virtual bool load_from_checkpoint_shared(persist& p, std::string m_name); - virtual bool save_to_checkpoint_distributed(persist& p, std::string m_name); - virtual bool load_from_checkpoint_distributed(persist& p, std::string m_name); + virtual bool save_to_checkpoint_shared(persist& p, std::string m_name) = 0; + virtual bool load_from_checkpoint_shared(persist& p, std::string m_name) = 0; + virtual bool save_to_checkpoint_distributed(persist& p, std::string m_name) = 0; + virtual bool load_from_checkpoint_distributed(persist& p, std::string m_name) = 0; }; diff --git a/include/lbann/optimizers/rmsprop.hpp b/include/lbann/optimizers/rmsprop.hpp index 2737c33c9d6..6b544eba0fc 100644 --- a/include/lbann/optimizers/rmsprop.hpp +++ b/include/lbann/optimizers/rmsprop.hpp @@ -27,8 +27,10 @@ #ifndef LBANN_OPTIMIZERS_RMSPROP_HPP_INCLUDED #define LBANN_OPTIMIZERS_RMSPROP_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" #include +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -37,12 +39,28 @@ namespace lbann { * See * https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf. */ -class rmsprop : public optimizer { +template +class rmsprop : public data_type_optimizer { public: + /** @name Public Types */ + ///@{ - rmsprop(DataType learning_rate, - DataType decay_rate, - DataType eps = 1e-8); + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} + +public: + + rmsprop(TensorDataType learning_rate, + TensorDataType decay_rate, + TensorDataType eps = 1e-8); rmsprop(const rmsprop& other); rmsprop& operator=(const rmsprop& other); ~rmsprop() override = default; @@ -53,28 +71,28 @@ class rmsprop : public optimizer { /** Human-readable description. 
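For reference, the rmsprop class templated here differs from AdaGrad only in its cache: an exponential moving average of squared gradients replaces the monotone running sum, so the effective step size can recover. A sketch of the usual rule (details inside step_compute_cpu may differ):

#include <cmath>
#include <cstddef>
#include <vector>

// RMSprop: cache = decay*cache + (1-decay)*g^2;
//          w -= lr * g / (sqrt(cache) + eps).
void rmsprop_step(std::vector<float>& values,
                  const std::vector<float>& gradient,
                  std::vector<float>& cache,
                  float lr, float decay_rate, float eps = 1e-8f) {
  for (std::size_t i = 0; i < values.size(); ++i) {
    cache[i] = decay_rate * cache[i]
             + (1.0f - decay_rate) * gradient[i] * gradient[i];
    values[i] -= lr * gradient[i] / (std::sqrt(cache[i]) + eps);
  }
}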
*/ description get_description() const override; - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; protected: /** Computation for an optimization step. */ - void step_compute(AbsDistMat& values, - const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, + const AbsDistMatrixType& gradient) override; private: /** Decay rate. */ - DataType m_decay_rate; + TensorDataType m_decay_rate; /** Small factor to avoid division by zero. */ - DataType m_eps; + TensorDataType m_eps; /** RMSprop cache. */ - std::unique_ptr m_cache; + std::unique_ptr m_cache; /** CPU implementation of optimization step. */ - void step_compute_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #ifdef LBANN_HAS_CUDA /** GPU implementation of optimization step. */ - void step_compute_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void step_compute_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #endif // LBANN_HAS_CUDA // =========================================== @@ -82,7 +100,7 @@ class rmsprop : public optimizer { // =========================================== struct packing_header { - DataType decay_rate; + TensorDataType decay_rate; }; bool pack_scalars(persist& p) { diff --git a/include/lbann/optimizers/sgd.hpp b/include/lbann/optimizers/sgd.hpp index 8d6960f4e34..761b15298a4 100644 --- a/include/lbann/optimizers/sgd.hpp +++ b/include/lbann/optimizers/sgd.hpp @@ -27,7 +27,9 @@ #ifndef LBANN_OPTIMIZERS_SGD_HPP_INCLUDED #define LBANN_OPTIMIZERS_SGD_HPP_INCLUDED -#include "lbann/optimizers/optimizer.hpp" +#include "lbann/optimizers/data_type_optimizer.hpp" +#include "lbann/io/persist.hpp" +#include namespace lbann { @@ -35,15 +37,31 @@ namespace lbann { * @details Supports momentum and Nesterov acceleration. * @todo Dedicated optimizers for momentum or Nesterov SGD. */ -class sgd : public optimizer { +template +class sgd : public data_type_optimizer { + +public: + /** @name Public Types */ + ///@{ + + /** @brief The tensor type expected in this object. */ + using AbsDistMatrixType = El::AbstractDistMatrix; + + /** @brief The optimizer base type of this object. */ + using OptimizerType = data_type_optimizer; + + /** @brief The concrete weights type used by this object. */ + using WeightsType = data_type_weights; + + ///@} public: /** @name Life cycle functions */ ///@{ - sgd(DataType learning_rate, - DataType momentum = 0, + sgd(TensorDataType learning_rate, + TensorDataType momentum = 0, bool nesterov = false); sgd(const sgd& other); sgd& operator=(const sgd& other); @@ -68,11 +86,11 @@ class sgd : public optimizer { /** @brief Decay rate for gradient accumulation. * @details A momentum of zero corresponds to vanilla SGD. */ - DataType get_momentum() const noexcept { return m_momentum; } + TensorDataType get_momentum() const noexcept { return m_momentum; } /** @brief Decay rate for gradient accumulation. * @details A momentum of zero corresponds to vanilla SGD. */ - void set_momentum(DataType momentum) { m_momentum = momentum; } + void set_momentum(TensorDataType momentum) { m_momentum = momentum; } /** Whether Nesterov acceleration is applied. */ bool using_nesterov() const noexcept { return m_nesterov; } @@ -80,49 +98,49 @@ class sgd : public optimizer { void set_nesterov(bool nesterov) { m_nesterov = nesterov; } /** Accumulated gradients for momentum optimizer. 
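As the comments above note, zero momentum reduces this class to vanilla SGD and m_velocity goes unused. One common formulation of the momentum and Nesterov variants, sketched on plain buffers (the exact form in the momentum_step implementations may differ):

#include <cstddef>
#include <vector>

// Momentum SGD with optional Nesterov acceleration:
//   v = mu*v + g;  w -= lr * (nesterov ? mu*v + g : v).
void sgd_step(std::vector<float>& values,
              const std::vector<float>& gradient,
              std::vector<float>& velocity,
              float lr, float momentum, bool nesterov) {
  for (std::size_t i = 0; i < values.size(); ++i) {
    velocity[i] = momentum * velocity[i] + gradient[i];
    const float update = nesterov ? momentum * velocity[i] + gradient[i]
                                  : velocity[i];
    values[i] -= lr * update;
  }
}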
*/ - const AbsDistMat& get_velocity() const; + const AbsDistMatrixType& get_velocity() const; /** Accumulated gradients for momentum optimizer. */ - AbsDistMat& get_velocity(); + AbsDistMatrixType& get_velocity(); ///@} /** @name Setup */ ///@{ - void setup(weights* w = nullptr) override; + void setup(WeightsType* w = nullptr) override; ///@} protected: /** Computation for an optimization step. */ - void step_compute(AbsDistMat& values, const AbsDistMat& gradient) override; + void step_compute(AbsDistMatrixType& values, const AbsDistMatrixType& gradient) override; private: /** @brief Decay rate for gradient accumulation. * @details A momentum of zero corresponds to vanilla SGD. */ - DataType m_momentum; + TensorDataType m_momentum; /** Whether Nesterov acceleration is used. */ bool m_nesterov; /** @brief Accumulated gradients. * @details Not used for vanilla SGD. */ - std::unique_ptr m_velocity; + std::unique_ptr m_velocity; /** CPU implementation of momentum or Nesterov step. */ - void momentum_step_cpu(AbsDistMat& values, const AbsDistMat& gradient); + void momentum_step_cpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #ifdef LBANN_HAS_CUDA /** GPU implementation of momentum or Nesterov step. */ - void momentum_step_gpu(AbsDistMat& values, const AbsDistMat& gradient); + void momentum_step_gpu(AbsDistMatrixType& values, const AbsDistMatrixType& gradient); #endif // LBANN_HAS_CUDA /** @name Checkpointing */ ///@{ struct packing_header { - DataType momentum; + TensorDataType momentum; }; bool pack_scalars(persist& p) { diff --git a/include/lbann/proto/factories.hpp b/include/lbann/proto/factories.hpp index 41836a51de2..c7569ace608 100644 --- a/include/lbann/proto/factories.hpp +++ b/include/lbann/proto/factories.hpp @@ -70,7 +70,7 @@ std::vector> construct_layer_graph( const lbann_data::Model& proto_model); /** Construct a layer specified with prototext. */ -template +template std::unique_ptr construct_layer( lbann_comm* comm, const std::map& data_readers, diff --git a/include/lbann/utils/cuda.hpp b/include/lbann/utils/cuda.hpp index 6fba40a03ca..fa123b17e97 100644 --- a/include/lbann/utils/cuda.hpp +++ b/include/lbann/utils/cuda.hpp @@ -221,36 +221,40 @@ class event_wrapper { * The input and output data must be on GPU and must have the same * dimensions. */ -template -void apply_entrywise_unary_operator(const AbsMat& input, - AbsMat& output); +template
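A closing note on the guard that ends nearly every header in this series: the LBANN_*_INSTANTIATE macros pair an extern template declaration in the header with a single explicit instantiation in one source file, so the widened template argument lists added by this patch are compiled once rather than in every includer. The idiom, with a hypothetical my_layer standing in for the real classes:

// --- my_layer.hpp: ordinary includers see only the declaration and
// skip implicit instantiation of the common case.
#ifndef LBANN_MY_LAYER_INSTANTIATE
extern template class my_layer<DataType, data_layout::DATA_PARALLEL,
                               El::Device::CPU>;
#endif // LBANN_MY_LAYER_INSTANTIATE

// --- my_layer.cpp: exactly one translation unit defines the macro
// before the include and emits the definition.
#define LBANN_MY_LAYER_INSTANTIATE
#include "lbann/layers/my_layer.hpp"
template class my_layer<DataType, data_layout::DATA_PARALLEL,
                        El::Device::CPU>;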